In [1]:
# -*- coding: utf-8 -*-
# 設定ファイル

import sys
import os
import collections
import MeCab
import numpy as np
from gensim.models import word2vec
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data


In [1]:


# Locations of the pretrained word2vec model and of the feature vocabulary.
# Kept as named constants so they can be changed in one place.
W2V_MODEL_PATH = "/ssd/local-politics/word2vec/word2vec.model"
FEATURES_PATH = "features.txt"

w2v_model = word2vec.Word2Vec.load(W2V_MODEL_PATH)

# One feature per line; only the last tab-separated column is used.
# The file is opened in a ``with`` block so the handle is closed, and the
# encoding is explicit so reading does not depend on the process locale.
# NOTE(review): assumes features.txt is UTF-8 (as the file header suggests) - confirm.
with open(FEATURES_PATH, "r", encoding="utf-8") as features_file:
    features = [line.strip().split("\t")[-1] for line in features_file]


# Split a sentence into words.
# Args: sentence (utf-8)
# Returns: list of words (utf-8)
def get_tango(sen):
    """Tokenize ``sen`` with MeCab and return the surface forms that are kept.

    Tokens whose first feature field is "記号" (symbol) or whose second
    feature field is "数" (number) are dropped.

    Args:
        sen: Sentence to tokenize.

    Returns:
        List of surface-form strings, in sentence order.
    """
    # Building a Tagger is expensive, so it is created once and cached on the
    # function object instead of being rebuilt for every sentence.
    tagger = getattr(get_tango, "_tagger", None)
    if tagger is None:
        # The option string is kept exactly as in the original code.
        tagger = get_tango._tagger = MeCab.Tagger("aa")

    word_list = []
    # ``parse`` may return None on failure; treat that as "no tokens".
    parsed = tagger.parse(sen) or ""
    for word_line in parsed.split("\n"):
        if word_line.strip() == "EOS":
            break
        # Each token line is "<surface>\t<comma-separated features>".
        word, tab, feature = word_line.partition("\t")
        if not tab:
            # Blank or malformed line: nothing to classify, skip it.
            continue
        fields = feature.split(",")
        if fields[0] == "記号":
            continue
        if len(fields) > 1 and fields[1] == "数":
            continue
        word_list.append(word)
    return word_list
    
    
# Take a sentence and build the network input from the feature vocabulary.
# Args: sentence (utf-8)
# Returns: input vectors for the NN
def sentence2inputs(sentence):
    """Convert a sentence into a list of word vectors.

    The sentence is tokenized with ``get_tango``. A word contributes its
    vector only if it is listed in the module-level ``features`` and is
    present in the module-level ``w2v_model``; a feature word the model does
    not know is skipped instead of raising ``KeyError``.

    Args:
        sentence: Sentence to convert.

    Returns:
        List with one vector per kept word, in sentence order (may be empty).
    """
    # Newer gensim releases expose the vectors on ``model.wv`` and deprecate
    # indexing the model itself; fall back to the model object when ``wv``
    # is not available so older releases keep working.
    vectors = getattr(w2v_model, "wv", w2v_model)
    return [
        vectors[word]
        for word in get_tango(sentence)
        if word in features and word in vectors
    ]


# Read sentences from a file and turn them into model-ready data.
# Args: file path, feature flag
# Returns: tuple of (feature list, tag list)
def getSvmData(fname, flag):
    """Load a tab-separated data file and vectorize its sentences.

    Each non-blank line is split on tabs: the first column is parsed as an
    integer tag and the last column is the sentence, which is converted with
    ``sentence2inputs``. Sentences that produce no vectors are dropped
    together with their tag. Blank lines are skipped.

    Args:
        fname: Path of the data file.
        flag: Not used by this function; kept so existing callers still work.

    Returns:
        Tuple ``(svmdata, tags)`` of two equally long lists.

    Raises:
        ValueError: If the first column of a non-blank line is not an integer.
    """
    # NOTE(review): assumes the data file is UTF-8 - confirm.
    with open(fname, "r", encoding="utf-8") as data_file:
        lines = [line.strip() for line in data_file]

    svmdata = []
    tags = []
    # Single pass: keep a (vectors, tag) pair only when the sentence yielded
    # at least one vector, instead of repeatedly searching for empty entries.
    for line in lines:
        if not line:
            continue
        columns = line.split("\t")
        tag = int(columns[0])
        inputs = sentence2inputs(columns[-1])
        if not inputs:
            continue
        svmdata.append(inputs)
        tags.append(tag)
    return (svmdata, tags)

# Vectorize the two labelled data files; each call yields (vectors, tags).
# NOTE(review): the training split is read from a file named testData.txt -
# confirm that this is the intended file.
train, trainTag = getSvmData('../data/testData.txt', False)
test, testTag = getSvmData('../data/additionalTestData.txt', False)


# Custom Dataset class; the inputs are expected to be word2vec vectors.
class MyDataset(torch.utils.data.Dataset):
    """Dataset of zero-padded vector sequences with one tag per sequence.

    ``data`` is a sequence of sentences, each a sequence of equal-length
    vectors. All sentences are copied into a single NumPy array of shape
    ``(len(tags), longest_sentence, vector_length)``; positions past the end
    of a sentence stay zero.
    """

    def __init__(self, data, tags):
        """Pad ``data`` into one array and keep a reference to ``tags``."""
        super(MyDataset, self).__init__()
        assert len(data) == len(tags)
        # Convert to NumPy, padding shorter sentences with zeros.
        longest = max(len(sentence) for sentence in data)
        vector_length = len(data[0][0])
        padded = np.zeros((len(tags), longest, vector_length))
        for row, sentence in enumerate(data):
            if len(sentence):
                padded[row, :len(sentence)] = sentence
        self.data = padded
        self.tags = tags

    def __len__(self):
        """Return the number of samples."""
        return len(self.tags)

    def __getitem__(self, index):
        """Return the ``(padded_vectors, tag)`` pair stored at ``index``."""
        return self.data[index], self.tags[index]

In [2]:
class LSTM(nn.Module):
    """Single-layer (optionally bidirectional) LSTM binary classifier.

    The final hidden state of the LSTM (both directions concatenated when
    bidirectional) goes through two linear layers and a log-softmax over
    two classes.

    Args:
        input_size: Length of each input vector.
        hidden_dim: Size of the LSTM hidden state per direction.
        bidirectional: Whether the LSTM runs in both directions.
        batch_size: Batch size used by ``init_hidden``; callers update
            ``self.batch_size`` before calling ``init_hidden`` when the
            batch size changes.
    """

    def __init__(self, input_size, hidden_dim, bidirectional=False, batch_size=1):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.input_size = input_size
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size, hidden_dim, bidirectional=bidirectional)
        # fc0 consumes one final hidden state per direction, so its width
        # depends on the direction count (previously always hidden_dim * 2,
        # which made the unidirectional configuration unusable).
        num_directions = 2 if bidirectional else 1
        self.fc0 = nn.Linear(hidden_dim * num_directions, 40)
        self.fc1 = nn.Linear(40, 2)
        self.hidden = self.init_hidden()

    def forward(self, inputs, lengths):
        """Return log-probabilities of shape ``(batch, 2)``.

        Args:
            inputs: Padded batch of shape ``(batch, seq_len, input_size)``,
                ordered by decreasing sequence length.
            lengths: True sequence lengths, in the same order as ``inputs``.
        """
        # Swap the batch and time axes: the LSTM expects (seq_len, batch, dim).
        inputs = inputs.transpose(0, 1)
        pack = torch.nn.utils.rnn.pack_padded_sequence(inputs, lengths)
        # Make sure the stored state lives on the same device as the input.
        hidden = tuple(state.to(inputs.device) for state in self.hidden)
        _, self.hidden = self.lstm(pack, hidden)
        final_h = self.hidden[0]
        if self.bidirectional:
            # The last two entries are the final states of the two directions.
            summary = torch.cat([final_h[-1], final_h[-2]], 1)
        else:
            summary = final_h[-1]
        y = self.fc0(summary)
        y = self.fc1(torch.tanh(y))
        # Normalize over the class axis explicitly.
        return F.log_softmax(y, dim=1)

    def init_hidden(self):
        """Return a fresh zero ``(hidden_state, cell_state)`` tuple.

        Each tensor has shape ``(num_directions, batch_size, hidden_dim)``
        and is created on the same device and dtype as the parameters.
        """
        num_directions = 2 if self.bidirectional else 1
        shape = (num_directions, self.batch_size, self.hidden_dim)
        reference = next(self.parameters())
        return (reference.new_zeros(shape), reference.new_zeros(shape))


# Size the input layer from the training vectors instead of a hard-coded
# width, so the network always matches the loaded word2vec model.
net = LSTM(len(train[0][0]), 100, True)
# Use the GPU when one is present.
if torch.cuda.is_available():
    net.cuda()
print(net)

LSTM(
  (lstm): LSTM(10, 100, bidirectional=True)
  (fc0): Linear(in_features=200, out_features=40, bias=True)
  (fc1): Linear(in_features=40, out_features=2, bias=True)
)


In [5]:
dataset = MyDataset(train, trainTag)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True, num_workers=8)

import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=0.1)
# NLLLoss is the counterpart of the log-softmax the network outputs.
criterion = nn.NLLLoss()

# Send every batch to whichever device the model parameters live on.
device = next(net.parameters()).device

for epoch in range(200):  # loop over the dataset multiple times

    # Sum of the per-batch losses over the whole epoch.
    epoch_loss = 0.0

    for inputs, labels in dataloader:
        # Padding rows are all zeros, so a time step counts as real when the
        # first component of its vector is non-zero.
        lens = (inputs[:, :, 0] != 0).long().sum(dim=1)
        # pack_padded_sequence expects the batch in decreasing length order.
        lens, idx = lens.sort(0, descending=True)
        inputs = inputs[idx].float().to(device)
        labels = labels[idx].to(device)

        # The last batch may be smaller, so rebuild the zero state per batch.
        net.batch_size = len(labels)
        optimizer.zero_grad()
        net.hidden = net.init_hidden()

        output = net(inputs, lens.tolist())

        loss = criterion(output, labels)
        # .item() extracts the Python float from the 0-dim loss tensor.
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Report the loss.
    print("===> Epoch[{}]: Loss: {:.4f}".format(epoch, epoch_loss))

===> Epoch[0]: Loss: 1.8081
===> Epoch[1]: Loss: 1.6085
===> Epoch[2]: Loss: 1.4709
===> Epoch[3]: Loss: 1.4013
===> Epoch[4]: Loss: 1.3998
===> Epoch[5]: Loss: 1.3114
===> Epoch[6]: Loss: 1.3363
===> Epoch[7]: Loss: 1.2934
===> Epoch[8]: Loss: 1.2239
===> Epoch[9]: Loss: 1.1971
===> Epoch[10]: Loss: 1.1496
===> Epoch[11]: Loss: 1.2017
===> Epoch[12]: Loss: 1.1313
===> Epoch[13]: Loss: 1.0645
===> Epoch[14]: Loss: 1.0932
===> Epoch[15]: Loss: 1.0192
===> Epoch[16]: Loss: 0.9817
===> Epoch[17]: Loss: 0.9589
===> Epoch[18]: Loss: 0.8938
===> Epoch[19]: Loss: 0.8337
===> Epoch[20]: Loss: 0.9431
===> Epoch[21]: Loss: 0.8259
===> Epoch[22]: Loss: 0.8045
===> Epoch[23]: Loss: 0.7304
===> Epoch[24]: Loss: 0.7186
===> Epoch[25]: Loss: 0.6778
===> Epoch[26]: Loss: 0.6763
===> Epoch[27]: Loss: 0.5879
===> Epoch[28]: Loss: 0.6312
===> Epoch[29]: Loss: 0.5570
===> Epoch[30]: Loss: 0.5553
===> Epoch[31]: Loss: 0.5821
===> Epoch[32]: Loss: 0.5140
===> Epoch[33]: Loss: 0.4342
===> Epoch[34]: Loss: 0.

In [6]:
dataset = MyDataset(test, testTag)
# Evaluation does not need shuffling; keeping the file order makes the
# predictions line up with testTag.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=False, num_workers=8)

# Send every batch to whichever device the model parameters live on.
device = next(net.parameters()).device

result = []     # predicted class index per sample
labelList = []  # gold label per sample, in the same order as result
# No gradients are needed for inference; this avoids keeping autograd graphs
# alive for every stored prediction.
with torch.no_grad():
    for inputs, labels in dataloader:
        # Padding rows are all zeros, so a time step counts as real when the
        # first component of its vector is non-zero.
        lens = (inputs[:, :, 0] != 0).long().sum(dim=1)
        # pack_padded_sequence expects the batch in decreasing length order.
        lens, idx = lens.sort(0, descending=True)
        inputs = inputs[idx].float().to(device)
        labels = labels[idx]

        net.batch_size = len(labels)
        net.hidden = net.init_hidden()
        output = net(inputs, lens.tolist())

        # Store the labels next to the predictions.
        labelList.extend(labels.tolist())
        result.extend(output.max(1)[1].tolist())

# Labels are treated as 0/1, so summing counts the positives.
select = float(sum(result))   # samples predicted positive
zenbu = float(sum(testTag))   # samples that are actually positive
seikai = float(sum(1 for a, b in zip(result, labelList) if a == 1 and b == 1))

# Guard every ratio so an empty prediction or label set reports 0 instead of
# raising ZeroDivisionError.
pre = seikai / select if select else 0.0
recall = seikai / zenbu if zenbu else 0.0
f1 = 2 * pre * recall / (pre + recall) if (pre + recall) else 0.0
print("Precision: {} / {} = {}".format(seikai, select, pre))
print("Recall: {} / {} = {}".format(seikai, zenbu, recall))
print("F-measure: {}".format(f1))

Precision: 57.0 / 67.0 = 0.8507462686567164
Recall: 57.0 / 78.0 = 0.7307692307692307
F-measure: 0.786206896551724
