# 第8章: 機械学習

### 70. データの入手・整形

In [1]:
import random

# Map each polarity label to the raw review file that holds its sentences.
pol_dirs = {
    "+1": "Datasets/rt-polaritydata/rt-polaritydata/rt-polarity.pos",
    "-1": "Datasets/rt-polaritydata/rt-polaritydata/rt-polarity.neg",
}

# Fixed seed so the shuffled order (and every result derived from it) is the
# same on each Restart & Run All.
SHUFFLE_SEED = 0

# Prefix every review sentence with its polarity label: "+1 ..." / "-1 ...".
text = []
for polarity, path in pol_dirs.items():
    # NOTE(review): errors="ignore" silently drops bytes that cannot be decoded
    # with the platform default encoding -- confirm the dataset's real encoding.
    with open(path, errors="ignore") as fr:
        text.extend(polarity + " " + line.rstrip() for line in fr)

# A dedicated Random instance gives a deterministic shuffle without touching
# the state of the module-level random generator.
random.Random(SHUFFLE_SEED).shuffle(text)

# Write one labelled sentence per line.
with open("Outputs/sentiment.txt", mode="w") as fw:
    for sentence in text:
        print(sentence, file=fw)

### 71. ストップワード

In [2]:
#scikit-learnのストップワードを利用
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Kept as a list so any other code that reads `stopWord` still sees the same type.
stopWord = list(ENGLISH_STOP_WORDS)

# Punctuation tokens that are filtered out together with the stop words.
_PUNCTUATION_TOKENS = frozenset([",", ".", ":", ";", "[", "]", "'", '"'])

# Single hash-based lookup table; avoids a linear scan of the stop-word list
# for every token of every sentence.
_STOP_TOKENS = frozenset(stopWord) | _PUNCTUATION_TOKENS


def isStopWord(word):
    """Tell whether a token should be discarded during feature extraction.

    Args:
        word: token text (a string).

    Returns:
        True if ``word`` is one of scikit-learn's English stop words or one of
        the punctuation marks , . : ; [ ] ' " -- otherwise False.
    """
    return word in _STOP_TOKENS

### 72. 素性抽出

In [4]:
import spacy
en_nlp = spacy.load("en")

# Parse each "<label> <sentence>" line into a dict: the first two characters
# are the polarity label, everything from the fourth character on is the text.
with open("Outputs/sentiment.txt") as sentiText:
    Datas = [
        {"sentiment": record[:2], "sentence": record[3:]}
        for record in (raw.rstrip() for raw in sentiText)
    ]

# Replace every raw sentence by the list of its spaCy token strings,
# leaving out the tokens that isStopWord rejects.
for pairs in Datas:
    token_strings = (str(token) for token in en_nlp(pairs["sentence"]))
    pairs["sentence"] = [tok for tok in token_strings if not isStopWord(tok)]

In [5]:
#全単語情報のリストアップ
import numpy as np
# Vocabulary: every distinct token that appears in any document.
# set.update avoids rebuilding an ever-growing list for each document, and
# sorting fixes the word -> column order so feature indices are reproducible
# (plain set iteration order for strings can differ between Python processes).
vocabulary = set()
for pairs in Datas:
    vocabulary.update(pairs["sentence"])
AllWords = sorted(vocabulary)

In [6]:
#各文書に対する単語出現数値を疎行列化
from scipy.sparse import lil_matrix
# Document-term count matrix: one row per document, one column per entry of AllWords.
X = lil_matrix((len(Datas), len(AllWords)))

# Word -> column index lookup, so each document only visits its own tokens
# instead of scanning the whole vocabulary.
word_to_column = {word: j for j, word in enumerate(AllWords)}

for i, pairs in enumerate(Datas):
    for word in pairs["sentence"]:
        j = word_to_column.get(word)
        if j is not None:  # tokens outside the vocabulary are skipped
            # Each occurrence adds one, so the cell ends up holding the
            # number of times the word occurs in the document.
            X[i, j] += 1

In [7]:
# Build the class-label vector y: 1 for documents labelled "+1", 0 for all others.
y = np.array([1 if pairs["sentiment"] == "+1" else 0 for pairs in Datas])

In [8]:
%store X
%store y

Stored 'X' (lil_matrix)
Stored 'y' (ndarray)


In [None]:
%store -r X
%store -r y

### 73. 学習

In [9]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier, constructed with its default
# arguments, on the count matrix X and the label vector y.
logreg = LogisticRegression()
logreg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 74. 予測

In [10]:
# Predict labels for the same matrix X the model was fitted on, so y_pred
# describes training-set behaviour rather than held-out performance.
y_pred = logreg.predict(X)

### 75. 素性の重み

In [11]:
# Order the feature indices by the absolute value of their coefficient,
# largest magnitude first.
coef_magnitude = np.abs(logreg.coef_)
AllCoef = np.argsort(coef_magnitude)[:, ::-1]
BestCoef, WorstCoef = AllCoef[:, :10], AllCoef[:, -10:]

# Print the ten words with the largest |weight|, then the ten with the smallest.
for heading, indices in (("素性の重い単語", BestCoef[0]), ("\n素性の軽い単語", WorstCoef[0])):
    print(heading)
    for idx in indices:
        print(AllWords[idx])

素性の重い単語
boring
fails
dull
wonderful
powerful
engrossing
unexpected
bore
badly
enjoyable

素性の軽い単語
senior
jir
silberstein
hubac
starving
placid
destinees
communicate
lipstick
components


### 76. ラベル付け

In [12]:
# Second column of predict_proba for every row of X.
probability = logreg.predict_proba(X)[:, 1]

# One tab-separated row per sample: true label, predicted label, probability.
row_format = "{}\t{}\t{}"
for row in zip(y, y_pred, probability):
    print(row_format.format(*row))

0	0	0.3646405946767133
1	1	0.8110227047133876
1	1	0.6381999367057356
1	1	0.8561303918420213
1	1	0.9617822715075339
1	1	0.8274647772005942
1	1	0.9049021148956689
0	0	0.028360601596218865
1	1	0.8163144452059062
0	0	0.11328745198636864
1	1	0.7990033182832055
1	1	0.664905081593474
1	1	0.7488639745012319
1	1	0.829177271679781
1	1	0.7600026825761486
1	1	0.8588461101809903
1	1	0.9600527649181643
0	0	0.2033882023799316
0	0	0.45992132615704234
1	1	0.8202817536425386
1	1	0.7484615610856986
1	1	0.7520297815389113
0	0	0.12120521174164721
0	0	0.21949374237977815
0	0	0.2469053860218125
1	1	0.9086072675720142
0	0	0.12257902502739261
0	0	0.03523657258600131
1	1	0.9450294391973133
1	1	0.6366833322167749
1	0	0.3238331479003401
0	0	0.21993276750368698
0	0	0.31053654607589826
0	0	0.20300744339660284
0	0	0.10122195335083194
1	0	0.4125645440982545
1	1	0.7190651177993955
1	1	0.8691931346375699
1	1	0.7457871896866863
1	1	0.9743003127873929
1	1	0.7919465157852505
0	0	0.07733644765451977
0	0	0.34375176180243366

1	1	0.795911963524284
1	0	0.4332255224428834
1	1	0.7225294951166832
1	0	0.39260050904929183
0	0	0.4489553742486167
0	0	0.06338951822908738
1	1	0.8766778446692645
1	1	0.9596440384701685
1	1	0.7004590579879038
0	0	0.07598429696084047
0	0	0.050650799776511185
1	1	0.7666966085508405
1	1	0.9563612640417931
1	1	0.9552497796034825
0	0	0.1509420974959549
0	0	0.06311687632441793
1	1	0.8929830547638298
1	1	0.9081090861241374
1	1	0.8202839500613333
0	0	0.029593396018721248
1	1	0.6713502746796832
1	1	0.7686386045371628
0	0	0.2554020589542709
0	0	0.0317957493296183
0	0	0.03798164345059663
0	0	0.3393363989337084
0	0	0.06468391900249766
1	1	0.8929154415688622
1	1	0.7039046225563425
1	1	0.5978655684778966
0	0	0.25892266055047986
0	0	0.04883436700063615
1	1	0.5253220360538694
0	0	0.1178356929325631
1	1	0.8345859845557434
1	1	0.8422871972695934
0	0	0.1733562553494984
0	0	0.09514837065466937
1	1	0.8875227489603924
1	0	0.4838614031660025
1	1	0.9768241522961282
0	0	0.37964967544147343
0	0	0.201898899129195

0	0	0.10223045945493606
0	0	0.12381999033343088
1	1	0.9444643160354448
0	0	0.2733415973576823
0	0	0.12186160803029632
0	0	0.1380364246451344
1	0	0.437161368601008
1	1	0.5895908368867205
0	0	0.09871309565091298
1	1	0.7505743200061409
0	0	0.09744066707123539
1	1	0.8451331221718761
0	0	0.23794571183805485
1	1	0.9173680442229436
0	0	0.282218383827572
0	0	0.03180060750978536
1	1	0.8971743175088565
0	0	0.08928339699051825
0	0	0.13735470527512114
1	1	0.951703506315979
1	1	0.8935360827844301
1	1	0.5996295196778857
1	1	0.9512480453142376
0	1	0.6583687773630171
1	1	0.7967382726858601
1	1	0.5841720542454492
0	0	0.10015385920280986
1	0	0.4775663008833438
0	0	0.039357462929503345
1	1	0.7672010595866373
0	0	0.13980978245724346
0	0	0.13255962482790382
1	1	0.7360362939203021
0	0	0.22002007159889875
0	0	0.273098065995793
1	1	0.8952622882931117
0	0	0.2779014260229897
1	1	0.9960930514078746
0	0	0.0630379549796625
1	1	0.678660155737786
1	1	0.9350008090980513
0	0	0.035683527342807046
0	0	0.0709796242679508

1	1	0.7111424055899991
1	1	0.8887807378988071
1	1	0.8327431990963173
0	0	0.1682665665632413
0	0	0.14133059620288424
1	1	0.8910162347865284
1	1	0.661723359878695
1	1	0.8532094015206111
0	1	0.5476006232936443
1	1	0.8276020062178024
1	1	0.9268802398158407
0	0	0.39559319652093344
1	1	0.658799536019746
0	0	0.009525510825915498
0	0	0.3048249057129272
0	0	0.07066184617446723
1	1	0.9376322652163209
1	1	0.9184053909244314
0	0	0.40243473153047027
0	0	0.1350775192357476
0	0	0.1488456050004153
0	0	0.4297461464919018
1	1	0.9463599401753405
1	1	0.9591790683065731
0	0	0.13990944586864584
1	1	0.7965469435261882
0	0	0.03255359663245765
0	0	0.037156292767443984
1	1	0.8808476460409495
0	0	0.29547518997458333
1	1	0.9781382931156413
1	1	0.8112009696299792
0	1	0.5234113808804247
0	0	0.12197284374443117
0	0	0.4717554432671376
1	1	0.7527721673799102
1	1	0.5814995531987489
0	0	0.3181573306123646
0	0	0.053974034624609375
1	1	0.8196925626878265
0	0	0.09071150642216157
1	1	0.6904218830590771
0	0	0.105705415000190

0	0	0.04585246460763754
0	0	0.13986336087885554
0	0	0.12729460881307905
0	0	0.09853352066879166
1	1	0.7233131515880163
0	0	0.03613446398951262
1	1	0.899966667525178
1	1	0.7323553674448394
1	1	0.9534886724224465
1	1	0.721445990451829
1	1	0.7186877052545678
1	1	0.8443750388555314
1	1	0.7293137027183821
0	0	0.1260530305363004
0	0	0.012507962072440305
0	0	0.1584445325872711
0	0	0.046809466467109116
0	0	0.07293599808594406
0	0	0.27895685400919723
1	1	0.5060189953421169
0	0	0.30163390122977385
0	0	0.17215131735162353
1	1	0.8592634397405872
0	0	0.13493123140176874
1	1	0.7223839935332839
1	1	0.8461616546996933
0	0	0.48458774899718526
0	0	0.09376838414348927
1	1	0.9644464432726524
0	0	0.022662568513422004
1	1	0.9834518529393219
1	1	0.5334028063200961
1	1	0.9902216812708498
1	1	0.8200652236352376
0	0	0.2072486806991634
0	0	0.21743303383105034
0	0	0.05012060385983757
1	1	0.9066663017641065
0	0	0.14560046970691068
1	1	0.8996559494779974
0	0	0.4374866601266768
1	1	0.697866017006772
1	1	0.6649476938

0	0	0.05221367812489387
1	1	0.8934895190058225
1	1	0.7348866632323272
0	0	0.0772808721681599
0	0	0.19275208423474152
1	1	0.7732019082077333
1	1	0.9814970946738567
0	0	0.07001083788032308
1	1	0.6941587493261425
1	1	0.9649718784431478
1	1	0.6749622348189842
1	1	0.7623769337288097
1	1	0.7761663169499945
1	1	0.57491964765055
1	1	0.975585241063398
1	1	0.8275205108627048
1	1	0.8873680818309986
1	1	0.7626604786472481
0	0	0.1415846383813037
1	1	0.8848640521169002
0	0	0.08450983318167943
0	0	0.254752599501138
1	1	0.7610169916968976
1	1	0.9540508807692435
1	1	0.9906709489228533
0	0	0.3793152726692028
1	1	0.5194665359527855
0	0	0.14386187816332677
0	0	0.10291693914719259
1	1	0.9409988210075876
0	0	0.009291366852032969
0	0	0.03445178595452377
1	1	0.8144826283499031
1	0	0.4513654843837563
1	1	0.6944022112450055
1	1	0.9629932320351633
0	0	0.33109189685931073
0	0	0.4397186859749948
0	0	0.3506576493272103
0	0	0.1821492169006255
1	1	0.9606041295314052
1	1	0.7781942295734564
1	1	0.9814424361261824
0	1	0

0	0	0.08466744573577482
0	0	0.35226252927871654
0	0	0.0746434046787213
0	0	0.2582556752797114
1	1	0.6567920996618613
0	0	0.4651403739762959
0	0	0.04366234678504295
0	0	0.04641404732040604
0	0	0.07204353931539109
1	1	0.8316577818075306
1	1	0.9934371346224671
1	1	0.9592827473522209
0	0	0.12499609438886321
0	0	0.13272137184488894
0	0	0.14371175126304744
0	0	0.17750559623172038
1	1	0.9105137150000713
0	0	0.05811689188353077
1	1	0.554573457959541
1	1	0.6476764617760695
0	0	0.33660682481158166
0	0	0.4882390943022622
1	1	0.626990256143971
1	1	0.9167474101097727
1	1	0.739580534961211
0	0	0.008401473480319178
0	0	0.039306631591885015
0	0	0.33080950889124566
0	0	0.17916226325275744
0	0	0.02546270171213032
1	1	0.5200378464570756
1	1	0.8228246273294637
1	1	0.9563805307996976
1	1	0.9943727516662999
0	0	0.046288469568081976
0	0	0.053597709334732034
0	0	0.09519293791502935
1	1	0.6775171151021646
0	0	0.15012503184419218
0	0	0.03138105205927242
1	1	0.9970523108271728
1	1	0.5788746626613842
0	0	0.214467

1	1	0.9548261159127535
1	1	0.8491170905983345
0	0	0.0769769366423576
0	0	0.18021287042562956
1	1	0.9152525227631833
0	0	0.03813936641298999
0	0	0.30512498041538716
1	1	0.9525697096278759
1	1	0.7047806495385586
1	1	0.9978463691947723
1	1	0.895657081290623
0	0	0.05584133088377011
0	0	0.3342570332930492
0	0	0.27823659353310726
0	0	0.48972471315158334
0	0	0.31388489010662074
0	0	0.10367550603316693
0	0	0.3307704146485102
1	1	0.7936898532642085
0	0	0.12003581497806627
0	0	0.22702941709582755
1	1	0.8231968179902389
0	0	0.12662667185689372
1	1	0.9290766354289809
0	0	0.24187711555690156
1	1	0.8907040032906548
1	1	0.7500106217608175
0	0	0.3635565042322558
1	1	0.9593127743731319
0	0	0.15185008627214766
0	0	0.16641943136794346
1	1	0.9133254605640567
0	0	0.1660742376712696
0	1	0.868420482863875
0	0	0.08552180886849776
0	0	0.05846517158311253
0	0	0.489332741862519
1	1	0.693273639064686
0	0	0.12465531700020686
1	1	0.7172913647533744
0	0	0.14245650060473664
1	1	0.6975141906745462
0	0	0.23533238054064

### 77. 正解率の計測

In [14]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y, y_pred)

# Name the four cells once (first index: true class, second index: predicted
# class) so the metric formulas below read directly.
tn, fp = confusion[0, 0], confusion[0, 1]
fn, tp = confusion[1, 0], confusion[1, 1]

score = (tn + tp) / (tn + fp + fn + tp)   # accuracy
precision = tp / (fp + tp)
recall = tp / (fn + tp)
f1 = 2 * ((precision * recall) / (precision + recall))

print("精度: {}\t適合率: {}\n再現率: {}\tF1スコア: {}".format(score, precision, recall, f1))

精度: 0.9656724817107485	適合率: 0.9683078664402943
再現率: 0.9628587507034327	F1スコア: 0.9655756207674944


### 78. 5分割交差検証

In [15]:
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate

# Run the 5-fold cross-validation once and score all four metrics on the same
# fitted models, instead of refitting every fold separately for each metric.
cv_results = cross_validate(
    logreg, X, y, cv=5,
    scoring=("accuracy", "precision", "recall", "f1"),
    return_train_score=False,  # only the held-out fold scores are reported
)
scores = cv_results["test_accuracy"]
precisions = cv_results["test_precision"]
recalls = cv_results["test_recall"]
f1s = cv_results["test_f1"]

print("精度:\n{}".format(scores))
print("適合率:\n{}".format(precisions))
print("再現率:\n{}".format(recalls))
print("F1スコア:\n{}".format(f1s))

精度:
[0.74789128 0.75469043 0.76125704 0.74718574 0.75469043]
適合率:
[0.7474275  0.75115634 0.76701822 0.75071361 0.75162187]
再現率:
[0.74882849 0.76172608 0.75046904 0.74015009 0.76078799]
F1スコア:
[0.74812734 0.75640429 0.75865339 0.74539443 0.75617716]


### 79. 適合率-再現率グラフの描画

In [16]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision/recall pairs for every probability threshold on the fitted data.
# NOTE: this rebinds `precision` and `recall` to arrays.
precision, recall, thresholds = precision_recall_curve(y, logreg.predict_proba(X)[:, 1])

# The scores are probabilities, so the classifier's default operating point
# is the threshold closest to 0.5 -- not the one closest to 0, which would
# only make sense for decision_function scores.
close_default = np.argmin(np.abs(thresholds - 0.5))

plt.plot(precision[close_default], recall[close_default], "o", markersize=10,
         label="threshold 0.5", fillstyle="none", c="k", mew=2)
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
# Without a legend the two `label=` arguments above are never shown.
plt.legend(loc="best")
plt.show()

Text(0,0.5,'Recall')

# CountVectorizerを用いた素性抽出

In [18]:
# Reload the labelled sentences: X receives the sentence text (from the fourth
# character on) and y the two-character polarity label of each line.
with open("Outputs/sentiment.txt") as sentiText:
    labelled_lines = [raw.rstrip() for raw in sentiText]

X = [entry[3:] for entry in labelled_lines]
y = [entry[:2] for entry in labelled_lines]

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy

# Token pattern: runs of two or more word characters between word boundaries.
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load("en")
# Keep a handle on the pipeline's original tokenizer before replacing it.
old_tokenizer = en_nlp.tokenizer
# Swap in a tokenizer that splits text with the regex above and hands the
# resulting strings to the original tokenizer's tokens_from_list.
# NOTE(review): tokens_from_list may be deprecated or unavailable in newer
# spaCy releases -- confirm against the installed version.
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))
def custom_tokenizer(document):
    """Run ``document`` through ``en_nlp`` and return the lemma of every token.

    Args:
        document: raw text of one document.

    Returns:
        A list with ``token.lemma_`` for each token of the processed document.
    """
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

# NOTE(review): stop_words="english" is combined with a lemmatizing tokenizer;
# presumably the stop list is matched against the lemmas -- verify that the
# intended words are actually removed.
vect = CountVectorizer(tokenizer=custom_tokenizer, stop_words="english")
# Rebinds X to the result of fit_transform, so re-running this line would
# feed the transformed X back into the vectorizer.
X = vect.fit_transform(X)

In [20]:
# Convert the string labels to integers: "+1" becomes 1, any other label 0,
# then rebind y to the resulting NumPy array.
new_y = [1 if label == "+1" else 0 for label in y]
y = np.array(new_y)

In [21]:
# Fit a second logistic-regression model, this time on the CountVectorizer
# features, and display its score on the same data it was fitted on.
lr = LogisticRegression()
lr.fit(X, y)
lr.score(X, y)

0.949634214969049

In [22]:
from sklearn.model_selection import cross_validate

# Run the 5-fold cross-validation once and score all four metrics on the same
# fitted models, instead of refitting every fold separately for each metric.
cv_results = cross_validate(
    lr, X, y, cv=5,
    scoring=("accuracy", "precision", "recall", "f1"),
    return_train_score=False,  # only the held-out fold scores are reported
)
scores = cv_results["test_accuracy"]
precisions = cv_results["test_precision"]
recalls = cv_results["test_recall"]
f1s = cv_results["test_f1"]

print("精度:\n{}".format(scores))
print("適合率:\n{}".format(precisions))
print("再現率:\n{}".format(recalls))
print("F1スコア:\n{}".format(f1s))

精度:
[0.74507966 0.74906191 0.74812383 0.7293621  0.75093809]
適合率:
[0.74280409 0.75165877 0.74696545 0.72914714 0.74340309]
再現率:
[0.7497657  0.74390244 0.75046904 0.72983114 0.76641651]
F1スコア:
[0.74626866 0.74776049 0.74871315 0.72948898 0.75473441]
