In [1]:
import numpy as np
import pandas as pd
import janome
import matplotlib.pyplot as plt
import pickle
from janome.tokenizer import Tokenizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [2]:
class WordDivider:
    """Helper that splits text into words using janome tokenizers.

    Two tokenizers are created once in ``__init__`` and reused by every
    method:

    * ``self.wakati`` -- ``Tokenizer(wakati=True)``; its tokens are used
      directly as the words.
    * ``self.t`` -- a default ``Tokenizer()``; only the ``base_form``
      attribute of its tokens is read.
    """

    def __init__(self):
        self.wakati = Tokenizer(wakati=True)
        self.t = Tokenizer()

    def wakati_words(self, text):
        """Return the tokens produced by the wakati tokenizer as a list.

        Args:
            text: Text to split. Any falsy value (``""``, ``None``) yields
                an empty list without calling the tokenizer.

        Returns:
            list: The tokens, in the order the tokenizer yields them.
        """
        if not text:
            return []

        return list(self.wakati.tokenize(text))

    def surface_words(self, text):
        """Return the ``base_form`` of every token from the default tokenizer.

        Note that, despite the method name, the value collected is each
        token's ``base_form`` attribute, not its surface string.

        Args:
            text: Text to split. Any falsy value yields an empty list.

        Returns:
            list: One ``base_form`` per token, in tokenizer order.
        """
        if not text:
            return []

        return [token.base_form for token in self.t.tokenize(text)]

    def wakati_text(self, text):
        """Return ``text`` re-written as space-separated tokens.

        Every token is followed by a single space, so a non-empty result
        always ends with a trailing space (e.g. ``"a b "``).

        Args:
            text: Text to split. Any falsy value yields ``""``.

        Returns:
            str: The space-separated tokens. This is always a string;
            empty input returns ``""`` rather than a list so the result
            can be passed straight to a text vectorizer.
        """
        if not text:
            # Keep the return type consistent with the non-empty case.
            return ""

        return "".join(word + " " for word in self.wakati.tokenize(text))


In [3]:
from sklearn.ensemble import StackingClassifier

# Fixed seed shared by the stochastic base estimators so that a fresh
# "Restart Kernel -> Run All" reproduces the same cross-validation scores.
RANDOM_STATE = 42

# Build the ensemble (stacking): two linear base learners whose outputs
# are combined by a logistic-regression final estimator.
estimators = [
    ('PA', PassiveAggressiveClassifier(random_state=RANDOM_STATE)),
    ('linear SVM', SGDClassifier(loss='hinge', random_state=RANDOM_STATE))
]

clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

# Build the pipeline: TF-IDF features over 1- to 5-grams, then the
# stacked classifier.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
# of two or more word characters, so single-character tokens in the
# space-separated text are dropped -- confirm this is intended.
tfidf = TfidfVectorizer(ngram_range=(1, 5))
lr_tfidf = Pipeline([("vect", tfidf), ("clf", clf)])

# Load the labelled comments and show the class balance.
df = pd.read_csv("./experiment4.csv")
print(df["label"].value_counts())
x_train, y_train = df.loc[:, "comment_sentence"].tolist(), df.loc[:, "label"].tolist()

# Pre-tokenise every comment into a space-separated string so the
# vectorizer can split it into words.
wd = WordDivider()
x_train = [wd.wakati_text(text) for text in x_train]

0.0    422
1.0    422
Name: label, dtype: int64


In [4]:
# Number of cross-validation folds. Deliberately not named `iter`, which
# would shadow the built-in iter() for the rest of the kernel session.
n_splits = 10
# Nominal fraction of the data held out in each fold.
test_size = 1.0 / n_splits

# The same fold count drives both the evaluation and the report below,
# so the printed header cannot drift from what was actually run.
cvs = cross_val_score(lr_tfidf, x_train, y_train, cv=n_splits)
scores = list(cvs)

print('Cross Validation ( Iter = {0} )'.format(n_splits))
# The sizes are nominal (len * fraction, rounded for display); actual
# folds may differ by one sample when the data does not divide evenly.
print('-> Train data size : {:.0f}'.format(len(x_train) - test_size * len(x_train)))
print('-> Test data size  : {:.0f}'.format(len(x_train) * test_size))
print('------------------------')
for i, score in enumerate(scores):
    print('  k={0}: {1}'.format(i, score))
# Mean and standard deviation of the per-fold scores.
print('accuracy: %.3f ＋／ー %.3f' % (np.mean(scores), np.std(scores)))

Cross Validation ( Iter = 10 )
-> Train data size : 760
-> Test data size  : 84
------------------------
  k=0: 0.6352941176470588
  k=1: 0.6823529411764706
  k=2: 0.7647058823529411
  k=3: 0.6470588235294118
  k=4: 0.5952380952380952
  k=5: 0.6309523809523809
  k=6: 0.6666666666666666
  k=7: 0.6904761904761905
  k=8: 0.7261904761904762
  k=9: 0.7380952380952381
accuracy: 0.678 ＋／ー 0.051
