# データ読み込み

In [20]:
# Directory the input text/label files are read from (absolute path).
datapath = "/workdir/DockerML_sandbox/lab_competition/data/"
# Directory the prediction file is written to (absolute path).
outpath = "/workdir/DockerML_sandbox/lab_competition/output/01/"


# sudachiの小さい辞書をインポート
!pip install pyproject-toml
!pip install sudachipy sudachidict_core
!pip install scikit-learn

import numpy as np
import collections

[0m

In [21]:
# Load each data file into a list of lines
def read_file(path, encoding="utf-8"):
    """Read a text file and return its lines without line terminators.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: One entry per line, as produced by ``str.splitlines``.
    """
    with open(path, mode="r", encoding=encoding) as f:
        return f.read().splitlines()

# Raw text of each split: one list of lines per file, in train/dev/test order.
text_files = ("text.train.txt", "text.dev.txt", "text.test.txt")
train_text, dev_text, test_text = [read_file(datapath + name) for name in text_files]

# Labels are loaded for train and dev only; np.loadtxt parses them into numeric arrays.
label_files = ("label.train.txt", "label.dev.txt")
train_label, dev_label = [np.loadtxt(datapath + name) for name in label_files]

In [31]:
from sudachipy import tokenizer
from sudachipy import dictionary
from itertools import chain

# Shared Sudachi tokenizer built from Dictionary() with no arguments;
# text_cleaning below reads it as a global.
tokenizer_obj = dictionary.Dictionary().create()
# Split mode handed to tokenize().
# NOTE(review): per the SudachiPy docs, mode C is the longest split unit — confirm.
mode = tokenizer.Tokenizer.SplitMode.C

# Preprocessing
def text_cleaning(textlist, mode, clear_part_of_speech_list):
    """Tokenize every text and keep the normalized form of the wanted morphemes.

    Args:
        textlist: Iterable of raw text strings.
        mode: Split mode passed to ``tokenizer_obj.tokenize``.
        clear_part_of_speech_list: Top-level part-of-speech tags to drop; a
            morpheme is discarded when ``part_of_speech()[0]`` is in this
            collection.

    Returns:
        list[list[str]]: For each input text, the normalized forms of the kept
        morphemes, in their original order.
    """
    def is_kept(morpheme):
        # Only the first element of the part-of-speech tuple is compared.
        return morpheme.part_of_speech()[0] not in clear_part_of_speech_list

    return [
        [m.normalized_form() for m in tokenizer_obj.tokenize(text, mode) if is_kept(m)]
        for text in textlist
    ]

# Build the vocabulary (word -> id)
def create_dictionary(text, threshold):
    """Assign an integer id to every word occurring at least ``threshold`` times.

    Args:
        text: Iterable of words (hashable tokens) to count.
        threshold: Minimum occurrence count for a word to be kept.

    Returns:
        dict: Maps each kept word to an id in ``0 .. n-1``, numbered in the
        order the words were first seen in ``text``.

    Side effects:
        Prints up to 20 of the most frequent kept words with their counts.
    """
    counts = collections.Counter(text)
    frequent = {}
    for word, count in counts.items():
        if count >= threshold:
            frequent[word] = count

    by_frequency = sorted(frequent.items(), key=lambda item: item[1], reverse=True)
    print(by_frequency[:20])

    # Number the kept words consecutively
    return {word: index for index, word in enumerate(frequent)}

In [32]:
# Top-level part-of-speech tags to drop (supplementary symbols and particles).
clear_part_of_speech_list = ["補助記号", "助詞"]

# Tokenize and filter every split with the same settings.
train_data, dev_data, test_data = [
    text_cleaning(texts, mode, clear_part_of_speech_list)
    for texts in (train_text, dev_text, test_text)
]

# The vocabulary is counted over train and dev tokens together and keeps words
# that occur at least 3 times.
# NOTE(review): dev tokens contribute to the vocabulary — confirm this is intended.
id_dictionary = create_dictionary(
    chain.from_iterable(chain(train_data, dev_data)), 3
)

[('だ', 25463), ('た', 23615), ('為る', 14526), ('てる', 8308), ('ない', 8190), (' ', 5122), ('成る', 4803), ('無い', 4749), ('居る', 4305), ('言う', 3871), ('良い', 3596), ('ます', 3513), ('たい', 3461), ('こと', 3417), ('有る', 3401), ('です', 3204), ('行く', 3120), ('見る', 3113), ('思う', 3029), ('来る', 2926)]


In [36]:
# Check the preprocessing result
def check_prepro(textdata, mode, clear_part_of_speech_list):
    """Write each original text next to its cleaned form for manual inspection.

    One line per text is written to ``datapath + "prepro_check.txt"`` in the
    form ``<original> => <kept tokens joined without a separator>``.

    Args:
        textdata: Raw text strings to preprocess.
        mode: Split mode forwarded to ``text_cleaning``.
        clear_part_of_speech_list: Part-of-speech tags forwarded to
            ``text_cleaning``.

    Returns:
        None. The result is the file written to disk.
    """
    clean_textdata = text_cleaning(textdata, mode, clear_part_of_speech_list)
    # Explicit UTF-8 so writing Japanese text does not depend on the locale encoding.
    with open(datapath + "prepro_check.txt", "w", encoding="utf-8") as f:
        for original, clean in zip(textdata, clean_textdata):
            f.write(original + " => " + "".join(clean) + "\n")

# Dump the original/cleaned pairs of the training texts to prepro_check.txt for inspection.
check_prepro(train_text, mode, clear_part_of_speech_list)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [24]:
def create_sentence_vec(textlist, id_dict):
    """Build bag-of-words count vectors for a list of tokenized texts.

    Args:
        textlist: Sequence of token lists, one per text.
        id_dict: Mapping from word to column index.

    Returns:
        numpy.ndarray: Float array of shape ``(len(textlist), len(id_dict))``
        where entry ``[i, j]`` is how often the word with id ``j`` occurs in
        ``textlist[i]``. Words that are not in ``id_dict`` are ignored.
    """
    n_rows, n_cols = len(textlist), len(id_dict)
    counts = np.zeros((n_rows, n_cols))
    # Iterating over the 2-D array yields row views, so updates land in `counts`.
    for row, words in zip(counts, textlist):
        for word in words:
            col = id_dict.get(word)
            if col is not None:
                row[col] += 1
    return counts

# Vectorize every split against the same vocabulary so the columns line up.
train_vec, dev_vec, test_vec = [
    create_sentence_vec(tokens, id_dictionary)
    for tokens in (train_data, dev_data, test_data)
]

# ロジスティック回帰

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Candidate values of LogisticRegression's C parameter to evaluate on the dev set.
C_list = [0.05]
best_c = 0
best_c_acc = 0
best_model = None

for c in C_list:
    model = LogisticRegression(random_state=0, max_iter=100000, C=c)
    model.fit(train_vec, train_label)
    dev_pred = model.predict(dev_vec)
    # scikit-learn metrics take (y_true, y_pred); same order as classification_report below.
    acc = accuracy_score(dev_label, dev_pred)
    print("C = " + str(c) + " report start")
    print("正確率 = %.3f" % (acc))
    print(classification_report(dev_label, dev_pred))
    print("C = " + str(c) + " report end")
    if acc > best_c_acc:
        best_c_acc = acc
        best_c = c
        best_model = model

# Leave `model` bound to the best-scoring estimator rather than the last
# candidate tried, so later cells that use `model` get the selected one.
if best_model is not None:
    model = best_model

print("タスク終了　best_c = " + str(best_c) + " acc = " + str(best_c_acc))

C = 0.05 report start
正確率 = 0.363
              precision    recall  f1-score   support

        -2.0       0.30      0.07      0.12       310
        -1.0       0.31      0.17      0.22       415
         0.0       0.32      0.66      0.43       647
         1.0       0.46      0.43      0.44       837
         2.0       0.43      0.08      0.13       291

    accuracy                           0.36      2500
   macro avg       0.36      0.28      0.27      2500
weighted avg       0.37      0.36      0.33      2500

C = 0.05 report end
タスク終了　best_c = 0.05 acc = 0.3632


In [29]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Candidate values of LinearSVC's C parameter to evaluate on the dev set.
C_list = [0.0042]
best_c = 0
best_c_acc = 0
best_model = None

for c in C_list:
    model = LinearSVC(random_state=0, max_iter=100000, C=c)
    model.fit(train_vec, train_label)
    dev_pred = model.predict(dev_vec)
    # scikit-learn metrics take (y_true, y_pred); same order as classification_report below.
    acc = accuracy_score(dev_label, dev_pred)
    print("C = " + str(c) + " report start")
    print("正確率 = %.3f" % (acc))
    print(classification_report(dev_label, dev_pred))
    print("C = " + str(c) + " report end")
    if acc > best_c_acc:
        best_c_acc = acc
        best_c = c
        best_model = model

# The test-prediction cell uses `model`; bind it to the best-scoring estimator
# instead of the last candidate tried so a multi-value C_list predicts with the
# selected model.
if best_model is not None:
    model = best_model

print("タスク終了　best_c = " + str(best_c) + " acc = " + str(best_c_acc))

C = 0.0042 report start
正確率 = 0.360
              precision    recall  f1-score   support

        -2.0       0.35      0.06      0.11       310
        -1.0       0.28      0.13      0.17       415
         0.0       0.32      0.69      0.43       647
         1.0       0.45      0.44      0.45       837
         2.0       0.57      0.04      0.08       291

    accuracy                           0.36      2500
   macro avg       0.39      0.27      0.25      2500
weighted avg       0.39      0.36      0.31      2500

C = 0.0042 report end
タスク終了　best_c = 0.0042 acc = 0.3604


In [30]:
# Write the test predictions
# NOTE(review): `model` is whichever estimator the most recently run training
# cell left bound; the file name below assumes that was the LinearSVC cell.
test_pred = model.predict(test_vec)

# One integer label per line.
with open(outpath + "v0.0.1_LinearSVC.txt", "w") as f:
    f.writelines(str(int(label)) + "\n" for label in test_pred)