In [1]:
import json
import os

import numpy as np
import tensorflow as tf

import dataset
import model

def build_model(name, skip_lines, limit_lines, lstm, run_steps):
    """Train one classifier on the development set, export it and print test metrics.

    Everything is written to the directory ``model_<name>``: the Estimator's
    own files, ``vocab.metadata.tsv`` (one vocabulary entry per line) and
    ``params.json`` (the hyper-parameters passed to the Estimator).

    Args:
        name: Suffix of the output directory ``model_<name>``.
        skip_lines: Forwarded to ``dataset.preprocess_dataset``.
        limit_lines: Forwarded to ``dataset.preprocess_dataset``.
        lstm: Forwarded to ``model.get_param``; this notebook passes 1 for
            the "lstm_*" models and 0 for the "fasttext_*" models.
        run_steps: Total number of training steps. Training runs in rounds
            of at most 1000 steps, each followed by an evaluation on the
            held-out split; a final shorter round covers any remainder, so
            the value does not have to be a multiple of 1000.
    """
    steps_per_round = 1000
    output_dir = "model_{name}".format(name=name)

    # Load and parse the development data.
    dev_dataset_raw = dataset.preprocess_dataset(
        "dataset/*.csv", skip_lines=skip_lines, limit_lines=limit_lines)
    dev_dataset_raw = dataset.parse_dataset(dev_dataset_raw)

    # Split with ratio 0.8 and build the word index from the training part only.
    train_dataset_raw, test_dataset_raw = dataset.split_dataset(dev_dataset_raw, 0.8)
    vocabulary, word_index, _ = dataset.create_word_index(train_dataset_raw)

    # Only the training part is upsampled; the test part is left as split.
    train_dataset_raw = dataset.upsampling(train_dataset_raw)
    train_dataset = dataset.convert_word_to_index(train_dataset_raw, word_index)
    test_dataset = dataset.convert_word_to_index(test_dataset_raw, word_index)

    params = model.get_param(
        vocab_size=len(vocabulary),
        category_size=len(dataset.category_names),
        lstm=lstm)

    # Model
    classifier = tf.estimator.Estimator(
        model_fn=model.model_fn,
        model_dir=output_dir,
        params=params)

    # Training data
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"sequences": train_dataset.sequences},
        y=train_dataset.labels,
        num_epochs=20,
        batch_size=50,
        shuffle=True)

    # Test data
    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"sequences": test_dataset.sequences},
        y=test_dataset.labels,
        num_epochs=1,
        shuffle=False)

    # Alternate training and evaluation. The last round may be shorter than
    # steps_per_round so that no requested step is silently dropped.
    remaining_steps = max(run_steps, 0)
    while remaining_steps > 0:
        steps = min(steps_per_round, remaining_steps)

        # Train
        classifier.train(
            input_fn=train_input_fn,
            steps=steps)

        # Evaluate
        classifier.evaluate(
            input_fn=test_input_fn)

        remaining_steps -= steps

    # Make sure the output directory exists before writing artifacts into it.
    os.makedirs(output_dir, exist_ok=True)

    words = ["{}\n".format(x) for x in vocabulary]
    with open(output_dir + "/vocab.metadata.tsv", "w", encoding="utf-8") as f:
        f.writelines(words)

    with open(output_dir + "/params.json", "w", encoding="utf-8") as f:
        json.dump(params, f)

    # Run inference on the held-out split.
    predictions = classifier.predict(
        input_fn=test_input_fn)

    # Collect true labels and predicted classes.
    labels = test_dataset_raw.labels
    predicts = [p["classes"] for p in predictions]

    confusion = tf.confusion_matrix(
        labels=labels,
        predictions=predicts,
        num_classes=len(dataset.category_names))

    with tf.Session() as sess:
        confusion_matrix = sess.run(confusion)
    print(confusion_matrix)

    # Recall normalises by row sums (axis=1), precision by column sums
    # (axis=0); both are averaged over the classes.
    print("accuracy:", np.trace(confusion_matrix) / np.sum(confusion_matrix))
    print("recall:", np.mean(np.diag(confusion_matrix) / np.sum(confusion_matrix, axis=1)))
    print("precision:", np.mean(np.diag(confusion_matrix) / np.sum(confusion_matrix, axis=0)))

In [None]:
# build_model() has no default for skip_lines, so it must be passed
# explicitly. 0 matches the "_0_" segment of each model name, which
# test_on_final_dataset() later parses back out as skip_lines.
for run_name, limit, use_lstm in [
        ("lstm_0_1_1", 1, 1),
        ("fasttext_0_1_1", 1, 0),
        ("lstm_0_10_1", 10, 1),
        ("fasttext_0_10_1", 10, 0)]:
    build_model(
        run_name,
        skip_lines=0,
        limit_lines=limit,
        lstm=use_lstm,
        run_steps=8000)

In [None]:
# fastText sweep (lstm=0, 5000 steps each): one run with limit_lines=1,
# then limit_lines=10 with skip_lines = 0..10, 15 and 20.
fasttext_runs = [("fasttext_0_1", 0, 1)]
fasttext_runs += [
    ("fasttext_{}_10".format(n), n, 10)
    for n in list(range(11)) + [15, 20]]

for run_name, skip, limit in fasttext_runs:
    build_model(
        run_name,
        skip_lines=skip,
        limit_lines=limit,
        lstm=0,
        run_steps=5000)

In [None]:
# LSTM sweep (lstm=1, 5000 steps each): one run with limit_lines=1,
# then limit_lines=10 with skip_lines = 0..10, 15 and 20.
lstm_runs = [("lstm_0_1", 0, 1)]
lstm_runs += [
    ("lstm_{}_10".format(n), n, 10)
    for n in list(range(11)) + [15, 20]]

for run_name, skip, limit in lstm_runs:
    build_model(
        run_name,
        skip_lines=skip,
        limit_lines=limit,
        lstm=1,
        run_steps=5000)

In [None]:
import numpy as np
import tensorflow as tf

def test_on_final_dataset(name):
    """Evaluate the saved model ``model_<name>`` on ``final_dataset/*.csv``.

    The word index and hyper-parameters are restored from the files that
    build_model() wrote into the model directory. ``skip_lines`` and
    ``limit_lines`` are taken from the second and third "_"-separated
    fields of ``name`` (e.g. "lstm_3_10" -> skip 3, limit 10).

    Prints the confusion matrix and the accuracy.

    Returns:
        list: The dicts yielded by ``classifier.predict``, one per example.
    """
    output_dir = "model_{name}".format(name=name)

    # Rebuild word -> index from the vocabulary file: index = line number.
    word_index = {}
    with open(output_dir + "/vocab.metadata.tsv", "r", encoding="utf-8") as f:
        for position, line in enumerate(f):
            word_index[line.strip("\n")] = position

    with open(output_dir + "/params.json", "r", encoding="utf-8") as f:
        params = json.load(f)

    # Recover the preprocessing settings encoded in the model name.
    name_fields = name.split("_")
    skip_lines = int(name_fields[1])
    limit_lines = int(name_fields[2])

    final_dataset_raw = dataset.parse_dataset(
        dataset.preprocess_dataset(
            "final_dataset/*.csv", skip_lines=skip_lines, limit_lines=limit_lines))
    final_dataset = dataset.convert_word_to_index(final_dataset_raw, word_index)

    # Model (restored from output_dir)
    classifier = tf.estimator.Estimator(
        model_fn=model.model_fn,
        model_dir=output_dir,
        params=params)

    # Test data
    final_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"sequences": final_dataset.sequences},
        y=final_dataset.labels,
        num_epochs=1,
        shuffle=False)

    # Evaluate
    classifier.evaluate(input_fn=final_input_fn)

    # Run inference and materialise the results so they can be returned.
    predictions = list(classifier.predict(input_fn=final_input_fn))

    predicted_classes = [prediction["classes"] for prediction in predictions]
    confusion = tf.confusion_matrix(
        labels=final_dataset_raw.labels,
        predictions=predicted_classes,
        num_classes=len(dataset.category_names))

    with tf.Session() as sess:
        confusion_matrix = sess.run(confusion)
    print(confusion_matrix)
    print("accuracy:", np.trace(confusion_matrix) / np.sum(confusion_matrix))

    return predictions

def test_all(names):
    predictions = []
    for name in names:
        each_predictions = np.array([p["probabilities"] for p in test_on_final_dataset(name)])
        with open("ensemble_result/{name}".format(name=name), "wb") as f:
            np.save(f, each_predictions)

In [None]:
# Every trained model, in this order: the skip 0..10 sweeps, the skip 15/20
# runs, then the "<kind>_0_<limit>_1" variants (fastText before LSTM in
# each group).
names = []
for skips in (range(11), [15, 20]):
    for kind in ("fasttext", "lstm"):
        names.extend("{}_{}_10".format(kind, skip) for skip in skips)
for kind in ("fasttext", "lstm"):
    names.extend("{}_0_{}_1".format(kind, limit) for limit in (1, 10))

test_all(names)

In [30]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import dataset

def draw_confusion_matrix(confusion_matrix, name):
    """Save a row-normalised confusion matrix as a PNG heat map.

    Every row is divided by its own total, so each cell shows the fraction
    of that row's samples that fell into the column's class. Normalising
    per label gives recall; normalising per prediction would give precision
    instead. A row whose total is zero is drawn as all zeros rather than
    raising ZeroDivisionError.

    Args:
        confusion_matrix: 2-D array of counts (anything np.asarray accepts).
        name: Output path handed to ``plt.savefig``; written in PNG format.
    """
    counts = np.asarray(confusion_matrix, dtype=float)

    # Row-wise normalisation; rows without any sample stay at 0.
    row_totals = counts.sum(axis=1, keepdims=True)
    recall_matrix = np.divide(
        counts, row_totals,
        out=np.zeros_like(counts),
        where=row_totals > 0)

    # Discard any drawing state left over from earlier plots (once, not
    # once per matrix row).
    plt.clf()

    # Prepare the plot.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)

    # Draw the coloured cells (recall values).
    res = ax.imshow(
        recall_matrix,
        cmap=plt.cm.YlOrBr,
        interpolation='nearest')

    n_rows, n_cols = counts.shape

    # Write the numeric value into every cell. imshow puts the column index
    # on the x axis and the row index on the y axis.
    for row in range(n_rows):
        for col in range(n_cols):
            ax.annotate(
                "{:.02f}".format(recall_matrix[row][col]),
                xy=(col, row),
                horizontalalignment='center',
                verticalalignment='center')

    # Colour bar
    fig.colorbar(res)

    # Axis ticks: x follows the columns, y follows the rows.
    plt.xticks(range(n_cols), dataset.category_names[:n_cols])
    plt.yticks(range(n_rows), dataset.category_names[:n_rows])

    # Save, then release all figures.
    plt.savefig(name, format='png')
    plt.close('all')

In [31]:
import numpy as np
import dataset

def show_example(sequences, labels, predictions):
    """Print the most and least confidently classified documents per category.

    For every category, the examples whose true label is that category are
    ranked by the probability the model assigned to it. The 50 highest are
    printed first, followed by the 50 lowest (lowest first).

    Args:
        sequences: Per-example word lists.
        labels: Per-example true category indices.
        predictions: Per-example dicts with a "probabilities" entry.
    """
    top_n = 50
    category_names = dataset.category_names

    def rank_category(category):
        # Examples labelled `category`, highest probability for it first.
        members = [
            example
            for example in zip(sequences, labels, predictions)
            if example[1] == category]
        return sorted(
            members,
            key=lambda example: example[2]["probabilities"][category],
            reverse=True)

    # Rank everything first, then print.
    highers = []
    lowers = []
    for category in range(len(category_names)):
        ranked = rank_category(category)
        highers.append(ranked[:top_n])
        lowers.append(list(reversed(ranked[-top_n:])))

    def print_sequence(sequence, label, predict):
        # Show at most the first 50 words, marking truncation with "...".
        ending = ["...。"] if len(sequence) > 50 else ["。"]
        text = "".join(sequence[:50] + ending)
        probabilities = predict["probabilities"]
        print("{label}: {sequence} ({probability} {prob2} {maxp})".format(
            label=category_names[label],
            sequence=text,
            probability=float(probabilities[label]),
            prob2=category_names[np.argmax(probabilities)],
            maxp=np.max(probabilities)
            ))

    for category, category_name in enumerate(category_names):
        print("{}らしい文書".format(category_name))
        for example in highers[category]:
            print_sequence(*example)
        print()
        print("{}らしからぬ文書".format(category_name))
        for example in lowers[category]:
            print_sequence(*example)
        print()

In [56]:
def predict_ensemble(names, ensemble_name):
    """Average the cached per-model probabilities into ensemble predictions.

    Args:
        names: Model names whose arrays are loaded from
            ``ensemble_result/<name>``.
        ensemble_name: Not used by this function.

    Returns:
        tuple: ``(sequences, labels, predictions)`` where sequences and
        labels come from ``final_dataset/*.csv`` (parsed with
        ``limit_lines=1``) and predictions is a list of dicts with the
        averaged "probabilities" and their argmax as "classes".
    """
    final_dataset_raw = dataset.parse_dataset(
        dataset.preprocess_dataset("final_dataset/*.csv", limit_lines=1))
    sequences, labels = final_dataset_raw.sequences, final_dataset_raw.labels

    # Ensemble: stack one probability array per model, average across models.
    per_model = []
    for model_name in names:
        with open("ensemble_result/{name}".format(name=model_name), "rb") as f:
            per_model.append(np.load(f))
    averaged = np.mean(np.array(per_model), axis=0)

    # Package the result in the same dict shape as a single model's output.
    ensemble_predictions = []
    for probabilities in averaged:
        ensemble_predictions.append({
            "probabilities": probabilities,
            "classes": np.argmax(probabilities),
        })

    return sequences, labels, ensemble_predictions

def run_test_ensemble(names, model_name):
    """Score an ensemble on the final dataset and write a report plus a plot.

    The report is printed and also saved to ``result/model_<model_name>.txt``;
    the confusion-matrix image goes to
    ``confusion_matrix/model_<model_name>.png``. Both output directories are
    created if they do not exist yet.

    Metric definitions, as computed below:
      * The micro recall and precision are both trace / total, i.e. they
        equal the accuracy.
      * The macro recall and precision are the class-wise means of
        diagonal / row-sum and diagonal / column-sum respectively.
      * Each F1 is the harmonic mean of the corresponding precision and
        recall (the macro F1 is therefore not the mean of per-class F1).

    Args:
        names: Model names whose cached predictions are averaged.
        model_name: Label for this ensemble; used in the report header and
            in both output file names.
    """
    # Predict
    _, labels, predicts = predict_ensemble(names, model_name)

    # Confusion matrix
    classes = [p["classes"] for p in predicts]
    confusion = tf.confusion_matrix(
        labels=labels,
        predictions=classes,
        num_classes=len(dataset.category_names))
    with tf.Session() as sess:
        confusion_matrix = sess.run(confusion)

    correct_per_class = np.diag(confusion_matrix)
    total = np.sum(confusion_matrix)

    # Accuracy
    accuracy = np.trace(confusion_matrix) / total

    # Micro-averaged metrics
    micro_recall = np.sum(correct_per_class) / total
    micro_precision = np.sum(correct_per_class) / total
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)

    # Macro-averaged metrics
    macro_recall = np.mean(correct_per_class / np.sum(confusion_matrix, axis=1))
    macro_precision = np.mean(correct_per_class / np.sum(confusion_matrix, axis=0))
    macro_f1 = 2 * (macro_precision * macro_recall) / (macro_precision + macro_recall)

    # One ordered list drives both the console and the file report so the
    # two cannot drift apart.
    metrics = [
        ("accuracy", accuracy),
        ("micro_recall", micro_recall),
        ("micro_precision", micro_precision),
        ("micro_f1", micro_f1),
        ("macro_recall", macro_recall),
        ("macro_precision", macro_precision),
        ("macro_f1", macro_f1),
    ]
    header = "[model {name}]".format(name=model_name)

    # Console output
    print(header)
    print(confusion_matrix)
    for metric_name, value in metrics:
        print(metric_name + ":", value)

    # File output (same content; no space after the colon, as before)
    os.makedirs("result", exist_ok=True)
    with open("result/model_{name}.txt".format(name=model_name), "w",
              encoding="utf-8") as f:
        f.write(header + "\n")
        f.write(str(confusion_matrix) + "\n")
        for metric_name, value in metrics:
            f.write("{}:{}\n".format(metric_name, value))

    # Confusion matrix image
    os.makedirs("confusion_matrix", exist_ok=True)
    draw_confusion_matrix(confusion_matrix,
                          "confusion_matrix/model_{name}.png".format(name=model_name))

In [63]:
# Score each single fastText model (skip_lines 0..10) on its own.
for i in range(11):
    names = ["fasttext_%d_10" % i]
    print(names)
    run_test_ensemble(names, model_name="fasttext_%d" % i)

['fasttext_0_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model fasttext_0]
[[1826  125  136    2   43   46  140   40]
 [  68 1675  170   14   67  176  170  127]
 [  87  131  967    4   20   30   67   80]
 [  12   15   15   57    2    3    1    4]
 [  17   57   14    2  460   38   23    6]
 [  28  129   33    0   34 1146   57   62]
 [ 123  229  129    1   19   62 2295  410]
 [  39  179  105    0   11   86  381 2274]]
accuracy: 0.724490486831
micro_recall: 0.724490486831
micro_precision: 0.724490486831
micro_f1: 0.724490486831
macro_recall: 0.703867241896
macro_precision: 0.71639192718
macro_f1: 0.710074359497
['fasttext_1_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model fasttext_1]
[[1789  144  126    3   59   42  160   35]
 [  84 1549  176    7   64  189  228  170]
 [ 125  179  824    4   25   42   75  112]
 [  20   22   16   4

In [64]:
# Score each single LSTM model (skip_lines 0..10) on its own.
for i in range(11):
    names = ["lstm_%d_10" % i]
    print(names)
    run_test_ensemble(names, model_name="lstm_%d" % i)

['lstm_0_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model lstm_0]
[[1805  135  123    3   46   57  157   32]
 [  67 1663  179   14   63  178  192  111]
 [  89  136  945    5   23   28   78   82]
 [   9   16   13   59    5    3    2    2]
 [  17   53    9    3  455   48   27    5]
 [  30  132   27    0   33 1142   63   62]
 [ 104  242  119    0   15   62 2374  352]
 [  40  176  102    0   10   94  403 2250]]
accuracy: 0.724016521091
micro_recall: 0.724016521091
micro_precision: 0.724016521091
micro_f1: 0.724016521091
macro_recall: 0.703152789669
macro_precision: 0.714701294508
macro_f1: 0.708880010463
['lstm_1_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model lstm_1]
[[1778  117  133    3   55   53  179   40]
 [  86 1470  197    4   64  210  243  193]
 [ 100  155  873    4   24   37   74  119]
 [  15   19   18   45    3    1    

In [65]:
# Pair the fastText and LSTM models that share the same skip_lines value.
for i in range(11):
    names = ["%s_%d_10" % (kind, i) for kind in ("fasttext", "lstm")]
    print(names)
    run_test_ensemble(names, model_name="fasttext_lstm_%d" % i)

['fasttext_0_10', 'lstm_0_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model fasttext_lstm_0]
[[1824  124  127    3   44   51  147   38]
 [  69 1668  177   15   66  177  176  119]
 [  92  130  967    4   20   27   69   77]
 [  11   17   13   59    3    2    2    2]
 [  18   53   10    2  459   45   25    5]
 [  23  129   30    0   33 1150   61   63]
 [ 109  231  120    0   15   62 2356  375]
 [  38  173  103    0    9   86  393 2273]]
accuracy: 0.728282212743
micro_recall: 0.728282212743
micro_precision: 0.728282212743
micro_f1: 0.728282212743
macro_recall: 0.708125899597
macro_precision: 0.719662386104
macro_f1: 0.713847535618
['fasttext_1_10', 'lstm_1_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model fasttext_lstm_1]
[[1798  129  127    2   57   46  162   37]
 [  74 1522  183    6   61  203  234  184]
 [ 113  170  852    4   23

In [67]:
# Growing ensemble: the first i fastText models followed by the first i
# LSTM models, for i = 1..10.
for i in range(1, 11):
    names = [
        "%s_%d_10" % (kind, j)
        for kind in ("fasttext", "lstm")
        for j in range(i)]
    print(names)
    run_test_ensemble(names, model_name="ensemble_0-%d" % i)

['fasttext_0_10', 'lstm_0_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model ensemble_0-1]
[[1824  124  127    3   44   51  147   38]
 [  69 1668  177   15   66  177  176  119]
 [  92  130  967    4   20   27   69   77]
 [  11   17   13   59    3    2    2    2]
 [  18   53   10    2  459   45   25    5]
 [  23  129   30    0   33 1150   61   63]
 [ 109  231  120    0   15   62 2356  375]
 [  38  173  103    0    9   86  393 2273]]
accuracy: 0.728282212743
micro_recall: 0.728282212743
micro_precision: 0.728282212743
micro_f1: 0.728282212743
macro_recall: 0.708125899597
macro_precision: 0.719662386104
macro_f1: 0.713847535618
['fasttext_0_10', 'fasttext_1_10', 'lstm_0_10', 'lstm_1_10']
preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model ensemble_0-2]
[[1873  125  109    2   45   39  136   29]
 [  59 1682  174    7   53  172  191  129]
 [

In [61]:
# Ensemble of the skip_lines=0, limit_lines=10 models and their "_1" variants.
names = [
    kind + suffix
    for kind in ("fasttext", "lstm")
    for suffix in ("_0_10", "_0_10_1")]
run_test_ensemble(names, model_name="first_line_only_ensemble")

preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model first_line_only_ensemble]
[[1835  126  117    3   42   49  150   36]
 [  61 1664  167   14   61  164  207  129]
 [  87  143  942    5   21   33   75   80]
 [   9   17   14   57    2    3    4    3]
 [  20   56   11    0  454   42   25    9]
 [  36  134   31    0   29 1125   61   73]
 [ 107  213  119    1   14   56 2403  355]
 [  35  171   80    0   10   74  436 2269]]
accuracy: 0.727808247004
micro_recall: 0.727808247004
micro_precision: 0.727808247004
micro_f1: 0.727808247004
macro_recall: 0.702481521887
macro_precision: 0.721713502008
macro_f1: 0.711967659977


In [62]:
# Ensemble of every trained model, in this order: the skip 0..10 sweeps,
# the skip 15/20 runs, then the "<kind>_0_<limit>_1" variants (fastText
# before LSTM in each group).
names = []
for skips in (range(11), [15, 20]):
    for kind in ("fasttext", "lstm"):
        names.extend("{}_{}_10".format(kind, skip) for skip in skips)
for kind in ("fasttext", "lstm"):
    names.extend("{}_0_{}_1".format(kind, limit) for limit in (1, 10))

run_test_ensemble(names, model_name="ensemble_all")

preproces line:  0
preproces line:  10000
preproces line:  14769
parse line:  0
parse line:  10000
parse line:  14769
[model ensemble_all]
[[1917  111  107    2   40   42  108   31]
 [  40 1753  188    3   39  164  158  122]
 [  68   82 1099    2   10   23   34   68]
 [   8   22   17   53    1    3    2    3]
 [  14   51    5    0  488   36   18    5]
 [  15   96   18    0   19 1241   49   51]
 [  62  144  122    0   11   59 2574  296]
 [  14  100   66    0    8   57  357 2473]]
accuracy: 0.785293520211
micro_recall: 0.785293520211
micro_precision: 0.785293520211
micro_f1: 0.785293520211
macro_recall: 0.752369875886
macro_precision: 0.793452682419
macro_f1: 0.772365357183
