# データ読み込み

In [1]:
datapath = "/workdir/DockerML_sandbox/lab_competition/data/"
outpath = "/workdir/DockerML_sandbox/lab_competition/output/01/"


# sudachiの小さい辞書をインポート
!pip install pyproject-toml
!pip install sudachipy sudachidict_core
!pip install scikit-learn

import numpy as np
import collections

[0m

In [2]:
# 各データを読み込みリストに格納
def read_file(path):
    with open(path, mode="r") as f:
        result = f.read().splitlines()
    return result

train_text = read_file(datapath + "text.train.txt")
dev_text = read_file(datapath + "text.dev.txt")
test_text = read_file(datapath + "text.test.txt")
train_label = np.loadtxt(datapath + "label.train.txt")
dev_label = np.loadtxt(datapath + "label.dev.txt")

In [3]:
from sudachipy import tokenizer
from sudachipy import dictionary
from itertools import chain

tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C

# 前処理
def text_cleaning(textlist, mode, clear_part_of_speech_list):
    morphemelist = [tokenizer_obj.tokenize(text, mode) for text in textlist]
    result = []
    for morpheme in morphemelist:
        words = []
        for word in morpheme:
            if word.part_of_speech()[0] not in clear_part_of_speech_list:
                words.append(word.normalized_form())
        result.append(words)
    return result

# キーのリストを作る
def create_dictionary(text, threshold):
    dic = collections.Counter(text)
    dic = {key:value for key, value in dic.items() if value >= threshold}
    print(sorted(dic.items(), key=lambda x:x[1], reverse=True)[:20])
    
    # 単語ごとにidをふる
    id_dic = dict(zip(dic.keys(), range(len(dic.keys()))))
    return id_dic

In [4]:
clear_part_of_speech_list = ["補助記号", "助詞"]

train_data = text_cleaning(train_text, mode, clear_part_of_speech_list)
dev_data = text_cleaning(dev_text, mode, clear_part_of_speech_list)
test_data = text_cleaning(test_text, mode, clear_part_of_speech_list)

id_dictionary = create_dictionary(chain.from_iterable(train_data + dev_data), 3)

[('だ', 25463), ('た', 23615), ('為る', 14526), ('てる', 8308), ('ない', 8190), (' ', 5122), ('成る', 4803), ('無い', 4749), ('居る', 4305), ('言う', 3871), ('良い', 3596), ('ます', 3513), ('たい', 3461), ('こと', 3417), ('有る', 3401), ('です', 3204), ('行く', 3120), ('見る', 3113), ('思う', 3029), ('来る', 2926)]


In [5]:
# 前処理結果の確認
def check_prepro(textdata, mode, clear_part_of_speech_list):
    original_textdata = textdata
    clean_textdata = text_cleaning(textdata, mode, clear_part_of_speech_list)
    with open(datapath + "prepro_check.txt", "w") as f:
        for original, clean in zip(original_textdata, clean_textdata):
            f.write(original + " => " + "".join(clean) + "\n")

check_prepro(train_text, mode, clear_part_of_speech_list)

In [6]:
def create_sentence_vec(textlist, id_dict):
    vec = np.zeros((len(textlist),len(id_dict)))
    for i, text in enumerate(textlist):
        for word in text:
            if word in id_dict:
                vec[i, id_dict[word]] += 1
    return vec

train_vec = create_sentence_vec(train_data, id_dictionary)
dev_vec = create_sentence_vec(dev_data, id_dictionary)
test_vec = create_sentence_vec(test_data, id_dictionary)

# ロジスティック回帰

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

C_list = [0.5, 0.25, 0.1, 0.05, 0.01]
best_c = 0
best_c_acc = 0

for c in C_list:
    model = LogisticRegression(random_state=0, max_iter=100000, C=c)
    model.fit(train_vec, train_label)
    dev_pred = model.predict(dev_vec)
    acc = accuracy_score(dev_pred, dev_label)
    print("C = " + str(c) + " report start")
    print("正確率 = %.3f" % (acc))
    print(classification_report(dev_label, dev_pred))
    print("C = " + str(c) + " report end")
    if acc > best_c_acc:
        best_c_acc = acc
        best_c = c

print("タスク終了　best_c = " + str(best_c) + " acc = " + str(best_c_acc))

C = 0.5 report start
正確率 = 0.352
              precision    recall  f1-score   support

        -2.0       0.29      0.16      0.20       310
        -1.0       0.26      0.23      0.24       415
         0.0       0.32      0.53      0.40       647
         1.0       0.47      0.41      0.44       837
         2.0       0.31      0.17      0.22       291

    accuracy                           0.35      2500
   macro avg       0.33      0.30      0.30      2500
weighted avg       0.36      0.35      0.34      2500

C = 0.5 report end
C = 0.25 report start
正確率 = 0.353
              precision    recall  f1-score   support

        -2.0       0.28      0.13      0.18       310
        -1.0       0.28      0.22      0.24       415
         0.0       0.32      0.57      0.41       647
         1.0       0.46      0.42      0.44       837
         2.0       0.31      0.13      0.18       291

    accuracy                           0.35      2500
   macro avg       0.33      0.29      0.29  

# 線形回帰

In [29]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

C_list = [0.0042]
best_c = 0
best_c_acc = 0

for c in C_list:
    model = LinearSVC(random_state=0, max_iter=100000, C=c)
    model.fit(train_vec, train_label)
    dev_pred = model.predict(dev_vec)
    acc = accuracy_score(dev_pred, dev_label)
    print("C = " + str(c) + " report start")
    print("正確率 = %.3f" % (acc))
    print(classification_report(dev_label, dev_pred))
    print("C = " + str(c) + " report end")
    if acc > best_c_acc:
        best_c_acc = acc
        best_c = c

print("タスク終了　best_c = " + str(best_c) + " acc = " + str(best_c_acc))

C = 0.0042 report start
正確率 = 0.360
              precision    recall  f1-score   support

        -2.0       0.35      0.06      0.11       310
        -1.0       0.28      0.13      0.17       415
         0.0       0.32      0.69      0.43       647
         1.0       0.45      0.44      0.45       837
         2.0       0.57      0.04      0.08       291

    accuracy                           0.36      2500
   macro avg       0.39      0.27      0.25      2500
weighted avg       0.39      0.36      0.31      2500

C = 0.0042 report end
タスク終了　best_c = 0.0042 acc = 0.3604


# 決定木

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

model = RandomForestClassifier()
model.fit(train_vec, train_label)
dev_pred = model.predict(dev_vec)
acc = accuracy_score(dev_pred, dev_label)
print("正確率 = %.3f" % (acc))
print(classification_report(dev_label, dev_pred))

正確率 = 0.323
              precision    recall  f1-score   support

        -2.0       0.23      0.06      0.10       310
        -1.0       0.24      0.13      0.16       415
         0.0       0.30      0.70      0.42       647
         1.0       0.44      0.32      0.37       837
         2.0       0.20      0.06      0.09       291

    accuracy                           0.32      2500
   macro avg       0.28      0.25      0.23      2500
weighted avg       0.32      0.32      0.28      2500



In [41]:
# test出力
test_pred = model.predict(test_vec)

print(test_pred)

np.savetxt(outpath + "v0.0.1_RandomForest.txt", test_pred, fmt="%.0f")

[1. 1. 1. ... 0. 1. 0.]


# LightGBM

In [7]:
!pip install lightgbm

[0m

In [16]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def change_label_range(label_list):
    return [r+2 for r in label_list]

train_dataset = lgb.Dataset(
    data=train_vec,
    label=change_label_range(train_label)
)

dev_dataset = lgb.Dataset(
    data=dev_vec,
    label=change_label_range(dev_label)
)

params = {
    "task": "train",
    "boosting_type": "gbdt",
    "num_class": 5,
    "objective": "multiclass",
    "metric": {"multi_error"},
    "learning_rate": 0.01,
    "num_leaves": 64,
    "min_data_in_leaf": 20,
    "num_iteration": 1000,
    "verbose": 1,
    "seed":10
}

bst = lgb.train(
    params = params,
    train_set = train_dataset,
    valid_sets = dev_dataset,
    early_stopping_rounds = 100
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6568
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 2177
[LightGBM] [Info] Start training from score -2.136224
[LightGBM] [Info] Start training from score -1.679682
[LightGBM] [Info] Start training from score -1.179063
[LightGBM] [Info] Start training from score -1.352215
[LightGBM] [Info] Start training from score -2.046136
[1]	valid_0's multi_error: 0.7412
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.7412
[3]	valid_0's multi_error: 0.7412
[4]	valid_0's multi_error: 0.7412
[5]	valid_0's multi_error: 0.7412
[6]	valid_0's multi_error: 0.7412
[7]	valid_0's multi_error: 0.7412
[8]	valid_0's multi_error: 0.7412
[9]	valid_0's multi_error: 0.7412
[10]	valid_0's multi_error: 0.7396
[11]	valid_0's multi_error: 0.7404
[12]	valid_0's multi_error: 0.7352
[13]	v

In [34]:
def restore_label_range(label_list):
    return [r-2 for r in label_list]

dev_pred_prob = bst.predict(dev_vec)
dev_pred = restore_label_range(np.argmax(dev_pred_prob, axis=1))
acc = accuracy_score(dev_pred, dev_label)
print("正確率 = %.3f" % (acc))
print(classification_report(dev_label, dev_pred))

正確率 = 0.361
              precision    recall  f1-score   support

        -2.0       0.41      0.08      0.14       310
        -1.0       0.27      0.15      0.19       415
         0.0       0.32      0.69      0.43       647
         1.0       0.48      0.41      0.44       837
         2.0       0.35      0.08      0.13       291

    accuracy                           0.36      2500
   macro avg       0.37      0.28      0.27      2500
weighted avg       0.38      0.36      0.32      2500



In [44]:
def restore_label_range(label_list):
    return [r-2 for r in label_list]

test_pred_prob = bst.predict(test_vec)
np.savetxt(outpath + "/preddata/v0.0.1_LightGBM_pred.txt", test_pred_prob)

test_pred = restore_label_range(np.argmax(test_pred_prob, axis=1))

np.savetxt(outpath + "v0.0.1_LightGBM.txt", test_pred, fmt="%.0f")

In [8]:
!pip install scipy

[0m

In [47]:
# scikit-lean実装

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp

lightgbm_model = lgb.LGBMClassifier(n_estimators=10000, verbose=-1)

param_lgb = {
    'objective'         : 'multiclass',
    'metric'       : ['multi_logloss'],
    'num_class'         : [5],
    "subsample"         : np.arange(0.5, 1.0, 0.1),
    "subsample_freq"    : [1, 2, 4, 8, 16, 32, 64, 128, 256],
    "reg_lambda"        : [1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 0],
    "learning_rate"     : [0.2 ,0.05, 0.01],
    "lambda_l1"         : sp.stats.uniform(1e-8, 10.0),
    "lambda_l2"         : sp.stats.uniform(1e-8, 10.0),
    "num_leaves"        : sp.stats.randint(2, 512),
    "feature_fraction"  : sp.stats.uniform(0.4, 0.6),
    "bagging_fraction"  : sp.stats.uniform(0.4, 0.6),
    "bagging_freq"      : sp.stats.randint(2, 10),
    "min_child_samples" : sp.stats.randint(5, 100)
}


randcv = RandomizedSearchCV(estimator=lightgbm_model, param_distributions=param_lgb, scoring="accuracy", n_jobs=4, verbose=3, cv=5, random_state = 47, n_iter=500)

fit_params = {"callbacks": [lgb.early_stopping(
                  stopping_rounds=100, # 学習時、評価指標がこの回数連続で改善しなくなった時点でストップ
                  verbose=3)],  # 学習中のコマンドライン出力
              "eval_metric": 'multiclass',  # early_stopping_roundsの評価指標
              "eval_set": [(dev_vec, dev_label)]  # early_stopping_roundsの評価指標算出用データ
              }

# ランダムサーチ実行（学習実行）
randcv.fit(train_vec, train_label, **fit_params)

# 最適パラメータの表示と保持
best_params = randcv.best_params_
best_score = randcv.best_score_
print(f"最適パラメータ {best_params}\nスコア {best_score}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[97]	valid_0's multi_logloss: 1.45777
[CV 4/5] END bagging_fraction=0.46809308313618975, bagging_freq=9, feature_fraction=0.9458023938331066, lambda_l1=8.234311750954802, lambda_l2=3.7206340463327505, learning_rate=0.2, metric=multi_logloss, min_child_samples=12, num_class=5, num_leaves=73, objective=u, reg_lambda=1e-05, subsample=0.7, subsample_freq=4;, score=0.340 total time=  15.0s
Early stopping, best iteration is:
[134]	valid_0's multi_logloss: 1.44631
Early stopping, best iteration is:
[117]	valid_0's multi_logloss: 1.45829
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 1.45386
[CV 3/5] END bagging_fraction=0.46809308313



Early stopping, best iteration is:
[1862]	valid_0's multi_logloss: 1.44735
Early stopping, best iteration is:
[1937]	valid_0's multi_logloss: 1.44098
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2180]	valid_0's multi_logloss: 1.44236
[CV 4/5] END bagging_fraction=0.9679332883898306, bagging_freq=2, feature_fraction=0.8850013821517814, lambda_l1=7.960548299094741, lambda_l2=2.7991926769347266, learning_rate=0.01, metric=multi_logloss, min_child_samples=49, num_class=5, num_leaves=404, objective=t, reg_lambda=0.0001, subsample=0.7, subsample_freq=32;, score=0.344 total time= 1.4min
[CV 3/5] END bagging_fraction=0.9679332883898306, bagging_freq=2, feature_fraction=0.8850013821517814, lambda_l1=7.960548299094741, lambda_l2=2.7991926769347266, learning_rate=0.01, metric=multi_logloss, min_child_samples=49, num_class=5, num_leaves=404, objective=t, reg_lambda=0.0001, subsample=0.7, subsample_freq=32;, score=0.313 total time= 1.5min
[CV 2/5

In [12]:
lightgbm_model = LGBMClassifier(boosting_type="gbdt", objective="multiclass",random_state=43,n_estimators=1000, bagging_fraction=0.9179530830715937, bagging_freq=2, feature_fraction=0.6504748612471201, lambda_l1=0.7847131481453321, lambda_l2=0.3349433426946305, learning_rate=0.05,metric=multi_logloss,min_child_samples=18, num_class= 5, num_leaves=17, 'objective': 'i', 'reg_lambda': 1e-06, 'subsample': 0.5, 'subsample_freq': 64)
lightgbm_model.fit(train_vec, train_label)

KeyboardInterrupt: 

# テスト出力

In [30]:
# test出力
test_pred = model.predict(test_vec)

np.savetxt(outpath + "v0.0.1_LightGBM.txt", test_pred, fmt="%.0f")