In [37]:
import os
import json
import copy
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [38]:
import sys
sys.dont_write_bytecode = True
sys.path.append('../datatools')
from maneger import DataManager
from preproc import Preprocessor
# from utterance.feature import Feature
from feature import Feature

In [39]:
pre = Preprocessor()

300


In [40]:
import spacy
nlp = spacy.load('ja_ginza')

def get_POS(sen):
    pos_list = []
    if isinstance(sen, str):
        doc = nlp(sen)
        texts = [str(s)  for s in doc.sents]

    elif isinstance(sen, list):
        texts = []
        docs = list(nlp.pipe(sen, disable=['ner']))
            # return [ self.get_POS(sen_) for sen_ in sen]
        for doc in docs:
            texts.extend( [str(s) for s in doc.sents] )

    else:
        return None
    docs = list(nlp.pipe(texts, disable=['ner']))
    for doc in docs:
        pos_list.append([ token.tag_ for token in doc ])
        
    return pos_list

def noun2normal(sen):
    normalize_sen = []
    docs = list(nlp(sen).sents)
    # docs = list( nlp.pipe( list(nlp(sen).sents), disable=['ner'] ) )
    # print(docs)
    for doc in docs:
        normalize_sen.append( [ token.tag_ if "名詞" in token.tag_ else token.lemma_ for token in doc ] )
    return normalize_sen

independent_words = set("名詞 代名詞 動詞 形容詞 副詞 接続詞 感動詞 連体詞 形状詞".split() )
def independent2normal(sen):
    normalize_sen = []
    docs = list(nlp(sen).sents)
    # docs = list( nlp.pipe( list(nlp(sen).sents), disable=['ner'] ) )
    # print(docs)
    for doc in docs:
        words = []
        for token in doc:
            tag = token.tag_.split("-")[0]
            if tag in independent_words:
                    # print(token.lemma_)
                words.append(token.tag_)
            else:
                words.append(token.lemma_)
        normalize_sen.append(words)
    return normalize_sen

In [1]:
# 学習データ準備
def json2data(path):
    cols = ["text", "label", "subLabel"]
    df = pd.DataFrame(index=[], columns=cols)
    files = os.listdir(path)
    for cop in files:
        if "." not in cop:
            continue
        with open(path+cop, "r") as f:
            json_data = json.load(f)
            mode = cop.split(".")[0]
            max_ = 300
            for i, data in enumerate( json_data[mode] ) :
                if i > max_:
                    break
                text = data["data"]
                label = data["label"][0]
                # if label == "plain":
                #     break
                subLabel = ""
                df = df.append(pd.DataFrame([text, label, subLabel], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

label_list = "YN WH please proposal plain".split()
label_dict = dict( zip(label_list, range(len(label_list))) )

def extract_X_y(df):
    X = []
    y = []
    for te, la in zip(df.text, df.label):
        X.append(te)
        y.append( torch.tensor( label_dict[la], dtype=torch.long ) )
        # y.append(torch.eye(len(label_dict), dtype=torch.long)[label_dict[la]])
    return X, y

In [42]:
corpus_root = "../../corpus"
# name = "question/short"
name = "question"
data_path = "/".join([corpus_root, name]) + "/"
data_path

'../../corpus/question/'

In [43]:
df = json2data(data_path)
df

Unnamed: 0,text,label,subLabel
0,メニューを見せていただけますか？,please,
1,おいでいただけますか？,please,
2,マッシュポテトをもらえますか？,please,
3,伝言を預かっていただけますか？,please,
4,ご一緒しませんか？,please,
...,...,...,...
1236,メンバーになりたいかい？,YN,
1237,明日スタートなさいますか？,YN,
1238,トレイがいりますか？,YN,
1239,2階も見たいですか？,YN,


In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

In [45]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [46]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, batch_size):
        # 親クラスのコンストラクタ。決まり文句
        super(LSTMClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。    
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim,  padding_idx=0)
        # self.lstm = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True, bidirectional=True )
        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        # LSTMの出力を受け取って全結合してsoftmaxに食わせるための１層のネットワーク
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # softmaxのLog版。dim=0で列、dim=1で行方向を確率変換。
        # self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # self = self.to(self.device)
        self.softmax = nn.LogSoftmax()
    
    def forward(self, x):
        #embeds.size() = (batch_size × len(sentence) × embedding_dim)
        embeds = self.word_embeddings(x)
        # batch_size, seq_len = x.shape[0], x.shape[1]
        # _, lstm_out = self.lstm(embeds.view(len(x), 1, -1))
        _, lstm_out = self.lstm(embeds)
        # print(hidden_layer)
        # bilstm_out = torch.cat([lstm_out[0][0], lstm_out[0][1]], dim=1)
        tag_space = self.hidden2tag(lstm_out[0])
        # y = self.hidden2tag(hidden_layer[0].view(batch_size, -1))

        # y = self.hidden2tag(bilstm_out)
        y =self.softmax(tag_space.squeeze())
        return y

In [47]:
pos_preset = [
    "名詞-普通名詞-一般",
    "名詞-普通名詞-サ変可能" ,
    "名詞-普通名詞-形状詞可能" ,
    "名詞-普通名詞-サ変形状詞可能" ,
    "名詞-普通名詞-副詞可能",
    "名詞-普通名詞-助数詞可能",
    "名詞-固有名詞-一般",
    "名詞-固有名詞-人名-一般"
    "名詞-固有名詞-人名-姓",
    "名詞-固有名詞-人名-名",
    "名詞-固有名詞-地名-一般",
    "名詞-固有名詞-地名-国",
    "名詞-数詞",
    "名詞-助動詞語幹",
    "代名詞",

    "形状詞-一般",
    "形状詞-タリ",
    "形状詞-助動詞語幹",

    "連体詞",
    "副詞",
    "接続詞",

    "感動詞-一般" ,
    "感動詞-フィラー" ,

    "動詞-一般" ,
    "動詞-非自立可能",

    "形容詞-一般",
    "形容詞-非自立可能",

    "助動詞",

    "助詞-格助詞",
    "助詞-副助詞",
    "助詞-係助詞",
    "助詞-接続助詞",
    "助詞-終助詞",
    "助詞-準体助詞",

    "接頭辞",
    "接頭辞",
    "接尾辞-名詞的-サ変可能",
    "接尾辞-名詞的-形状詞可能",
    "接尾辞-名詞的-サ変形状詞可能",
    "接尾辞-名詞的-副詞可能",
    "接尾辞-名詞的-助数詞",
    "接尾辞-形状詞的",
    "接尾辞-動詞的",
    "接尾辞-形容詞的",

    "記号-一般",
    "記号-文字",

    "補助記号-一般",
    "補助記号-句点",
    "補助記号-読点",
    "補助記号-括弧開",
    "補助記号-括弧閉",
    "補助記号-ＡＡ-一般",
    "補助記号-ＡＡ-顔文字",
    "空白",
]

def make_word_dict(X):
    word_dict = {"pad": 0}
    for p in pos_preset:
        word_dict[p] = len(word_dict)

    for x in X:
        normal = noun2normal(x)
        for nor in normal:
            for word in nor:
                if word in word_dict.keys():
                    continue
                word_dict[word] = len(word_dict)
    return word_dict
    

In [48]:
def sentence2number(sen, word_dict):
    normal = sum( noun2normal(sen),[] )
    pos = sum( get_POS(sen),[] )
    return [ word_dict[w] if w in word_dict else word_dict[p] for w, p in zip(normal, pos)]    

In [49]:
import torch.nn.utils.rnn as rnn
def make_Xseq(X, word_dict):
    max_len = 0
    Xseq = [ np.array( sentence2number(x, word_dict) ) for x in X ]
    Xseq = [ torch.tensor( xseq[:, None] ) for xseq in Xseq]
    Xseq = rnn.pad_sequence(Xseq, batch_first=True)
    Xseq = [ torch.flatten(xseq) for xseq in Xseq ]
    return Xseq

In [50]:
X, y = extract_X_y(df)

In [51]:
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [52]:
word_dict = make_word_dict(X_train_str)

In [53]:
X_train = make_Xseq(X_train_str, word_dict)

In [54]:
leng = len(y_train)
print(leng)
for i, v in enumerate(y_train):
    if leng %(i+1) == 0:
        print(i+1, end=", ")

868
1, 2, 4, 7, 14, 28, 31, 62, 124, 217, 434, 868, 

In [55]:
BATCH_SIZE = 62
epoch_ = 600
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [56]:
Vocab_size = len(word_dict)
EMBEDDING_DIM = 100
HIDDEN_DIM = EMBEDDING_DIM*2
OUTPUT_DIM = len(label_dict)

In [57]:
model = LSTMClassifier(Vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, BATCH_SIZE)
if torch.cuda.is_available():
   model.cuda()
# loss_function = nn.NLLLoss()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)

In [60]:
losses = []
loss_border = 0.0001
# print("error[{0}]".format(error_types[error_i]))
for epoch in range(epoch_):  # again, normally you would NOT do 300 epochs, it is toy data
    all_loss = 0
    for data in trainloader:
        X_t_tensor = torch.tensor(data[0], device='cuda:0')
        print(X_t_tensor.shape , y_t_tensor.view(-1,1).shape)
        # y_t_tensor = torch.tensor(data[1].reshape(batch_size, 1), device='cuda:0').float()
        y_t_tensor = torch.tensor(data[1], device='cuda:0')
        optimizer.zero_grad()
        model.zero_grad()
        # print(X_t_tensor.shape)

        score = model(X_t_tensor)
        print(X_t_tensor.shape, score.view(-1,5).shape, y_t_tensor.view(-1,1).shape)
        loss_ = loss_function(score.view(-1,5),  y_t_tensor)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
        del score
        del loss_
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
    # if all_loss <= loss_border:
    #     print("loss was under border(={0}) : train end".format(loss_border))
    #     break
print("done")

  import sys
  if __name__ == '__main__':


RuntimeError: mat1 dim 1 must match mat2 dim 0

In [None]:
from matplotlib import pyplot as plt
def plot_history(losses):
    fig, ax = plt.subplots()

    epochs = np.arange(1, len(losses) + 1)

    # 損失の推移
    ax.set_title("Loss")
    ax.plot(epochs, losses)
    ax.set_xlabel("Epoch")

    plt.show()
plot_history(losses)

In [None]:
X_test = make_Xseq(X_test_str, word_dict)

In [None]:
# X_test = make_Xseq(X_test_str, word_dict)
with torch.no_grad():
    # X_tensor = torch.tensor( X_test, device='cuda:0')
    X_tensor = [t.to('cpu').detach().numpy().copy() for t in X_test ]
    X_tensor = torch.tensor( X_tensor, device='cuda:0')
#     y_tensor = torch.tensor(y_test, dtype=torch.long, device='cuda:0')
            # 推論
    y_pred= np.array(model(X_tensor).cpu()).argmax(axis=1)

In [None]:
metrics.accuracy_score(y_test, y_pred)