In [1]:

import pandas as pd
import os


In [2]:
def load_PN_noun_data(pn_path="~/Documents/MMI/corpus/PN/"):
    """Load the noun polarity dictionary and encode its labels as integers.

    Reads the tab-separated file ``noun_pn`` located in ``pn_path`` (no header
    row; the columns are named ``word``, ``label`` and ``desc``), keeps only
    the rows labelled ``p``, ``e`` or ``n`` and maps those labels to 2, 1 and 0
    respectively.

    Args:
        pn_path: Directory that contains ``noun_pn``. A leading ``~`` is
            expanded to the user's home directory. Defaults to the location
            this notebook has always used.

    Returns:
        A new ``pandas.DataFrame`` with the columns ``word``, ``label``
        (integer codes) and ``desc``. The row index of the kept rows is the
        one assigned by ``read_csv`` (it is not reset).

    Side effects:
        Sets the pandas option ``display.unicode.east_asian_width`` to True.
    """
    label_to_id = {'p': 2, 'e': 1, 'n': 0}

    pd.set_option('display.unicode.east_asian_width', True)
    csv_path = os.path.join(os.path.expanduser(pn_path), "noun_pn")
    df = pd.read_csv(csv_path, names=['word', 'label', 'desc'], delimiter="\t")

    # Take an explicit copy of the filtered rows so that the assignment below
    # writes to an independent frame instead of a slice of the raw table
    # (which is what used to trigger SettingWithCopyWarning).
    df = df[df["label"].isin(list(label_to_id))].copy()
    # Every remaining label is a key of the mapping, so map() yields integers.
    df["label"] = df["label"].map(label_to_id)

    return df

In [3]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor



In [4]:
# Noun polarity table restricted to the p/e/n labels, with labels encoded as integers
# (see load_PN_noun_data).
df_noun = load_PN_noun_data()

In [5]:
from gensim.models import KeyedVectors

# Directory holding the pretrained word vectors, given relative to the working directory.
w2v_path = "../../corpus/w2v/"
# Load the vectors stored in word2vec format. The file name suggests
# 300-dimensional vectors -- TODO confirm against the file itself.
w2v_model = KeyedVectors.load_word2vec_format(w2v_path+ "dep-ja-300dim")

In [6]:
# Single-character prefixes treated as negation markers.
# Written out explicitly so the value does not depend on how a mix of
# half-width and full-width spaces is split.
denial_list = ["無", "非", "未", "不"]


def is_in_denimal(word):
    """Tell whether ``word`` begins with one of the prefixes in ``denial_list``.

    Args:
        word: The word to inspect (a string).

    Returns:
        True when the first character of ``word`` is listed in
        ``denial_list``; False otherwise, including for an empty ``word``
        (which has no first character to inspect).
    """
    # Guard against the empty string: indexing word[0] would raise IndexError.
    return bool(word) and word[0] in denial_list


print(denial_list)

['無', '非', '未', '不']


In [7]:
def make_Xy_pn(df, model=None):
    """Pair every known word of ``df`` with its vector and collect the labels.

    Args:
        df: Object exposing ``word`` and ``label`` attributes that can be
            iterated in parallel (here: the noun polarity DataFrame).
        model: Mapping-like word-vector store supporting ``word in model``
            and ``model[word]``. When omitted, the notebook-level
            ``w2v_model`` is used, as before.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of ``(word, vector)`` pairs
        and ``y`` is the list of labels for the same rows, in input order.
        Rows whose word is not present in ``model`` are skipped.
    """
    if model is None:
        # Fall back to the globally loaded vectors to stay backward-compatible.
        model = w2v_model

    X = []
    y = []
    for word, label in zip(df.word, df.label):
        # Out-of-vocabulary words have no vector and cannot be used.
        if word not in model:
            continue
        X.append((word, model[word]))
        y.append(label)

    return X, y

In [8]:
# X: list of (word, vector) pairs; y: the matching labels.
# Words that are missing from w2v_model are dropped by make_Xy_pn.
X, y = make_Xy_pn(df_noun)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

In [10]:
class Datasets(torch.utils.data.Dataset):
    """Minimal map-style dataset that serves ``(feature, label)`` pairs by index.

    Attributes:
        X_data: Indexable collection of features.
        y_data: Indexable collection of labels, aligned with ``X_data``.
        datanum: Number of samples (``len(X_data)``).
    """

    def __init__(self, X_data, y_data):
        """Store the two aligned collections.

        Args:
            X_data: Indexable, sized collection of features.
            y_data: Indexable, sized collection of labels.

        Raises:
            ValueError: If ``X_data`` and ``y_data`` differ in length; without
                this check a mismatch would only surface later as an
                IndexError or as silently ignored labels.
        """
        if len(X_data) != len(y_data):
            raise ValueError(
                "X_data and y_data must have the same length "
                "(got {0} and {1})".format(len(X_data), len(y_data))
            )

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        """Return the number of samples."""
        return self.datanum

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at position ``idx``."""
        return self.X_data[idx], self.y_data[idx]

In [26]:
# Hold out 20% of the (word, vector) pairs for testing; the fixed random_state
# makes the split repeatable across runs.
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

In [27]:
# Print the training-set size followed by each of its divisors, i.e. the
# batch sizes that split the data into equally sized batches.
leng = len(y_train)
print(leng)
for divisor in range(1, leng + 1):
    if leng % divisor == 0:
        print(divisor, end=", ")

5890
1, 2, 5, 10, 19, 31, 38, 62, 95, 155, 190, 310, 589, 1178, 2945, 5890, 

In [28]:
# Each element of X_*_str is a (word, vector) pair; only the vector is kept.
# The vectors are stacked into one ndarray first: passing a list of separate
# ndarrays straight to torch.tensor is the slow path and emits a UserWarning.
X_train = torch.tensor(np.array([vec for _, vec in X_train_str]))
X_test = torch.tensor(np.array([vec for _, vec in X_test_str]))
# as_tensor returns an existing tensor unchanged, so re-running this cell does
# not trip the "copy construct from a tensor" warning that torch.tensor raises.
y_train = torch.as_tensor(y_train)
y_test = torch.as_tensor(y_test)

In [35]:
# 155 is one of the divisors of the training-set size (5890) listed earlier in
# the notebook -- presumably chosen so that every batch is full.
BATCH_SIZE = 155
# Number of passes over the training data performed by the training loop.
epoch_ = 1000
trainset = Datasets(X_train, y_train)
# Reshuffle the samples every epoch; batches are prepared by two worker processes.
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [36]:
class PNModel(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a log-softmax output.

    The hidden widths are derived from ``embedding_dim``: the first hidden
    layer is twice as wide, the second half as wide (integer division).

    Attributes:
        embedding_dim: Size of one input vector.
        hid1: Width of the first hidden layer.
        hid2: Width of the second hidden layer.
        fc1, fc2: The two hidden linear layers.
        hidden2tag: Final linear layer producing one score per class.
    """

    def __init__(self, embedding_dim, tagset_size):
        """Create the layers.

        Args:
            embedding_dim: Number of features in each input vector.
            tagset_size: Number of output classes.
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hid1 = embedding_dim * 2
        self.hid2 = embedding_dim // 2

        # Layers are created in the order input -> hid1 -> hid2 -> classes.
        self.fc1 = nn.Linear(embedding_dim, self.hid1)
        self.fc2 = nn.Linear(self.hid1, self.hid2)
        self.hidden2tag = nn.Linear(self.hid2, tagset_size)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of input vectors.

        Args:
            x: Batch of inputs whose last dimension is ``embedding_dim``;
                the softmax is taken over dimension 1.

        Returns:
            Tensor of log-probabilities with one column per class.
        """
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        return F.log_softmax(self.hidden2tag(hidden), dim=1)

In [37]:
# Size of one input word vector.
EMBEDDING_DIM = 300
# NOTE: HIDDEN_DIM is only printed below; PNModel is constructed from
# EMBEDDING_DIM and OUTPUT_DIM alone and derives its own hidden widths.
HIDDEN_DIM = 300*6
# One output per label value (0, 1, 2).
OUTPUT_DIM = 3
# seq_len = length
print(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

300 1800 3


In [38]:
# Build the classifier and move it to the GPU only when one is available.
model = PNModel(EMBEDDING_DIM, OUTPUT_DIM)
if torch.cuda.is_available():
   model.cuda()
# NLLLoss expects log-probabilities, which is what PNModel.forward returns
# (it ends with log_softmax).
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)

In [39]:
# Summed batch loss of every epoch, in order.
losses = []
# Intended early-stopping threshold; the loop below does not use it.
loss_border = 0.0001

# Send each batch to whatever device the model lives on. The model is moved to
# the GPU only when CUDA is available, so an unconditional .cuda() on the data
# would crash on CPU-only machines.
device = next(model.parameters()).device
model.train()

for epoch in range(epoch_):
    all_loss = 0
    for X_batch, y_batch in trainloader:
        X_t_tensor = X_batch.to(device)
        y_t_tensor = y_batch.to(device)

        # The optimizer owns every model parameter, so clearing the gradients
        # through it is sufficient.
        optimizer.zero_grad()
        score = model(X_t_tensor)
        loss_ = loss_function(score, y_t_tensor)
        loss_.backward()
        optimizer.step()

        # .item() yields a plain Python float, so no graph is kept alive.
        all_loss += loss_.item()
    losses.append(all_loss)
    # Report progress every 50 epochs.
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
print("done")

epoch 50 	 loss 22.01689076423645
epoch 100 	 loss 16.792958468198776
epoch 150 	 loss 12.68260794878006
epoch 200 	 loss 9.59611365199089
epoch 250 	 loss 7.275226570665836
epoch 300 	 loss 5.419858634471893
epoch 350 	 loss 4.00537133961916
epoch 400 	 loss 2.9183077178895473
epoch 450 	 loss 2.014422671869397
epoch 500 	 loss 1.3555453978478909
epoch 550 	 loss 0.8976700995117426
epoch 600 	 loss 0.5572715671733022
epoch 650 	 loss 0.3495019224938005
epoch 700 	 loss 0.2509289081208408
epoch 750 	 loss 0.17405779287219048
epoch 800 	 loss 0.12703944626264274
epoch 850 	 loss 0.09387034183600917
epoch 900 	 loss 0.10083987005054951


KeyboardInterrupt: 

In [None]:
# Quick check of whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [40]:
# Run inference on the device the model lives on instead of a hard-coded
# 'cuda:0', so the cell also works on CPU-only machines.
device = next(model.parameters()).device
model.eval()
with torch.no_grad():
    # as_tensor converts dtype/device without re-wrapping an existing tensor,
    # which avoids the UserWarning that torch.tensor(tensor) emits.
    X_tensor = torch.as_tensor(X_test, dtype=torch.float, device=device)
    y_tensor = torch.as_tensor(y_test, dtype=torch.long, device=device)
    # Inference: the predicted class is the index of the highest log-probability.
    y_pred = model(X_tensor).cpu().numpy().argmax(axis=1)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# y_test holds the true labels of the held-out split, y_pred the predicted class indices.
# In scikit-learn's confusion matrix, rows are true labels and columns are predictions.
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[331  94  37]
 [ 91 378 121]
 [ 33 124 264]]
accuracy =  0.6605566870332654
