# 準備

In [14]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np 

%matplotlib inline
pd.options.display.max_columns = 32

In [15]:
PROJECT_ROOT_PATH = "../../"

datapath = PROJECT_ROOT_PATH + "lab_competition/data/"
outpath = PROJECT_ROOT_PATH + "lab_competition/output/01/"

# sudachiの小さい辞書をインポート
!pip install pyproject-toml
!pip install sudachipy sudachidict_full
!pip install scikit-learn

import numpy as np
import collections


[0m

In [16]:
import re

# 各データを読み込みリストに格納
def read_file(path):
    with open(path, mode="r") as f:
        result = f.read().splitlines()
    return result

def text_cleaning_re(textlist):
    result = []
    for l in textlist:
        tmp = re.sub(r'ww+','ww',l)
        result.append(tmp)
    return result

train_text = text_cleaning_re(read_file(datapath + "text.train.txt"))
dev_text = text_cleaning_re(read_file(datapath + "text.dev.txt"))
test_text = text_cleaning_re(read_file(datapath + "text.test.txt"))
train_label = np.loadtxt(datapath + "label.train.txt")
dev_label = np.loadtxt(datapath + "label.dev.txt")

In [17]:
from sudachipy import tokenizer
from sudachipy import dictionary
from itertools import chain
import re

tokenizer_obj = dictionary.Dictionary(dict="full").create()
mode = tokenizer.Tokenizer.SplitMode.C

# 前処理
def text_cleaning(textlist, mode, clear_part_of_speech_list, stopword_list):
    morphemelist = [tokenizer_obj.tokenize(text, mode) for text in textlist]
    result = []
    for morpheme in morphemelist:
        words = []
        for word in morpheme:
            if word.part_of_speech()[0] not in clear_part_of_speech_list[0]:
                if word.part_of_speech()[1] not in clear_part_of_speech_list[1]:
                    if word.normalized_form() not in stopword_list:
                        words.append(word.normalized_form())
        result.append(" ".join(words))
    return result

In [18]:
# 出現頻度が少ない単語をstopwordとする
def stopwords_occur(textlist, threshold):
    morphemelist = [tokenizer_obj.tokenize(text, mode) for text in textlist]
    words = []
    for morpheme in morphemelist:
        for word in morpheme:
            words.append(word.normalized_form())
    dic = collections.Counter(words)
    dic = {key:value for key, value in dic.items() if value<= threshold}
    return list(dic.keys())

clear_part_of_speech_list = [["助詞", "助動詞"],["数詞"]]

with open(datapath + "stopwords.txt") as f:
    stopword_list = f.read().splitlines()

stopword_occur = stopwords_occur(train_text, 2)

stopword_list.extend(stopword_occur)
print(stopword_list)

train_data = text_cleaning(train_text, mode, clear_part_of_speech_list, stopword_list)
dev_data = text_cleaning(dev_text, mode, clear_part_of_speech_list, stopword_list)
test_data = text_cleaning(test_text, mode, clear_part_of_speech_list, stopword_list)

['。', '、', '.', '為る', '成る', '居る', 'とこ', ':', '/', '_', '-', '〜', '(', ')', '私', '御', '」', '「', '人', '物', 'ー', '言う', 'こと', '見る', '行く', '・', 'さん', 'ちゃん', 'そう', 'よう', ';', '`', '分', '今', '今日', '日', '有る', '又', '来る', '思う', '此の', '時', '新体操', '表情筋', 'シャレード', 'モテキ', '小宮山', '夏樹', 'バッティングセンター', 'COWCOW', 'BIGBANG', '笑む', '立石', '生半可', '達者', 'たんと', 'しい', '外反母趾', '昔夢', 'チョコケーキ', '天地明察', '不確か', '連れ去る', '微か', '絡み付く', '投げ掛ける', '茶碗蒸し', 'シナモンロール', '́з', '2700', 'wy', '愛と誠', '在り来り', 'ミランダ', 'カー', 'お子様', '岩', '整骨院', 'テーピング', '丸腰', '整体', '釣れる', '自己否定', '自己肯定', '鬼太郎', '彦', '門', '豚骨', '茄子', '田楽', 'パトラッシュ', 'エビマヨ', '煮', 'ふじ', 'ピール', 'なか卯', 'ジャーナリスト', '祈り', 'さめる', 'メルヘン', '齧り付く', '同志', '求', '本が好き', '立ち読み', '流し読み', 'standardbookstore', 'ノーベル賞', '山中', '育む', '西田辺', '割り箸', 'ニーハイ', '歯痒い', '信託', '別口', '著作', '田中里奈', 'たなか', 'りな', 'ティーナカリーナ', 'キョンキョン', '悪の教典', 'ニモ', 'ケズ', '出家', 'がぶり', 'カブレ', 'たらふく', 'ホームパーティー', '祭りのあと', 'BARBEE BOYS', '差し込む', 'はだける', '生田斗真', 'てれび戦士', '精神年齢', '体年齢', '串カツ', '有耶無耶', '怪しむ', '挙式', '緩巻き', '冠

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,1))
train_tfidf = vectorizer.fit_transform(train_data)
dev_tfidf = vectorizer.transform(dev_data)
test_tfidf = vectorizer.transform(test_data)

train_vec = train_tfidf.toarray()
dev_vec = dev_tfidf.toarray()
test_vec = test_tfidf.toarray()

print("train = " + str(train_vec.shape))
print("trainlabel = " + str(train_label.shape))
print("dev = " + str(dev_vec.shape))
print("devlabel = " + str(dev_label.shape))
print("test = " + str(test_vec.shape))

train = (30000, 9798)
trainlabel = (30000,)
dev = (2500, 9798)
devlabel = (2500,)
test = (2500, 9798)


In [20]:
# 次元削減
# from sklearn.decomposition import TruncatedSVD

# trun_svd = TruncatedSVD(n_components=3000, n_iter=7, random_state=42)

# trun_svd.fit(train_vec)

# train_vec = trun_svd.transform(train_vec)
# dev_vec =  trun_svd.transform(dev_vec)
# test_vec =  trun_svd.transform(test_vec)

# print("train = " + str(train_vec.shape))
# print("trainlabel = " + str(train_label.shape))
# print("dev = " + str(dev_vec.shape))
# print("devlabel = " + str(dev_label.shape))
# print("test = " + str(test_vec.shape))

# 学習

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

params = [70, 130, 180, 300, 500]
best_param = 0
best_param_acc = 0

for param in params:
    model = RandomForestClassifier(random_state=0, n_estimators = 100, verbose=1, n_jobs=4, max_depth=450, max_features=param)
    model.fit(train_vec, train_label)
    dev_pred = model.predict(dev_vec)
    acc = accuracy_score(dev_pred, dev_label)
    k = cohen_kappa_score(dev_pred, dev_label, weights="quadratic")
    print("param = " + str(param) + " report start")
    print("正確率 = %.3f, Kappa = %.3f" % (acc, k))
    print(classification_report(dev_label, dev_pred))
    print("param = " + str(param) + " report end")
    if acc > best_param_acc:
        best_param_acc = acc
        best_param = param

print("タスク終了　best_param = " + str(best_param) + " acc = " + str(best_param_acc))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   56.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


param = 70 report start
正確率 = 0.328, Kappa = 0.276
              precision    recall  f1-score   support

        -2.0       0.38      0.08      0.14       310
        -1.0       0.24      0.11      0.15       415
         0.0       0.30      0.73      0.42       647
         1.0       0.44      0.32      0.37       837
         2.0       0.25      0.04      0.08       291

    accuracy                           0.33      2500
   macro avg       0.32      0.26      0.23      2500
weighted avg       0.34      0.33      0.28      2500

param = 70 report end


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  3.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


param = 130 report start
正確率 = 0.328, Kappa = 0.270
              precision    recall  f1-score   support

        -2.0       0.32      0.07      0.12       310
        -1.0       0.26      0.12      0.16       415
         0.0       0.30      0.71      0.42       647
         1.0       0.44      0.33      0.38       837
         2.0       0.22      0.04      0.07       291

    accuracy                           0.33      2500
   macro avg       0.31      0.25      0.23      2500
weighted avg       0.33      0.33      0.28      2500

param = 130 report end


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  4.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


param = 180 report start
正確率 = 0.330, Kappa = 0.267
              precision    recall  f1-score   support

        -2.0       0.30      0.07      0.11       310
        -1.0       0.26      0.14      0.18       415
         0.0       0.30      0.70      0.42       647
         1.0       0.44      0.34      0.38       837
         2.0       0.21      0.04      0.06       291

    accuracy                           0.33      2500
   macro avg       0.30      0.26      0.23      2500
weighted avg       0.33      0.33      0.29      2500

param = 180 report end


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  7.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


param = 300 report start
正確率 = 0.318, Kappa = 0.262
              precision    recall  f1-score   support

        -2.0       0.27      0.06      0.10       310
        -1.0       0.25      0.14      0.18       415
         0.0       0.29      0.66      0.40       647
         1.0       0.42      0.33      0.37       837
         2.0       0.25      0.05      0.09       291

    accuracy                           0.32      2500
   macro avg       0.30      0.25      0.23      2500
weighted avg       0.32      0.32      0.28      2500

param = 300 report end


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 23.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


param = 500 report start
正確率 = 0.320, Kappa = 0.251
              precision    recall  f1-score   support

        -2.0       0.28      0.06      0.10       310
        -1.0       0.25      0.16      0.19       415
         0.0       0.29      0.66      0.41       647
         1.0       0.43      0.33      0.38       837
         2.0       0.16      0.04      0.07       291

    accuracy                           0.32      2500
   macro avg       0.28      0.25      0.23      2500
weighted avg       0.31      0.32      0.28      2500

param = 500 report end
タスク終了　best_param = 180 acc = 0.3304


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{
        'n_estimators': [10, 50, 100]
        }]

clf = GridSearchCV(
    RandomForestClassifier(random_state=0), # 識別器
    tuned_parameters, # 最適化したいパラメータセット 
    cv=2, # 交差検定の回数
    n_jobs = 4,
    verbose = 3,
    scoring='accuracy') # モデルの評価関数の指定


In [None]:
clf.fit(train_vec, train_label) 

In [None]:
import pandas as pd

results = pd.DataFrame(clf.cv_results_)
best_estimator = clf.best_estimator_

display(results)
print(best_estimator)

In [None]:
from matplotlib import pyplot as plt

plt.bar([str(i) for i in results["param_C"]], results["mean_test_score"])
plt.title('GridSearch result for LogisticRegression(C=0.2, max_iter=100000, random_state=121) 0.345533)')
plt.xlabel("C")
plt.ylabel("accuracy")