In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [4]:
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [27]:
from gensim.models import KeyedVectors

w2v_path = "../../corpus/w2v/"
# fasttext
# https://qiita.com/Hironsan/items/513b9f93752ecee9e670
w2v_name =  "dep-ja-300dim"
w2v_name =  "model.vec"
w2v_model = KeyedVectors.load_word2vec_format(w2v_path+w2v_name)

In [48]:
wsv_dim = w2v_model["あ"].shape[0]
add_keys = ["FOS", "EOS", "[SEP]", "[UNK]", "[NONE]"]
add_weights = [np.random.randn(wsv_dim) for _ in range(len(add_keys))]
add_weights = [ v/np.linalg.norm(v) for v in add_weights ]
SYMBOL_w2v = dict(zip(add_keys, add_weights))

In [49]:
symbol_path = "../models/context_topic/"
symbol_name = "toyoshima_symbol.pickle"
symbolM = DataManager(symbol_path)
symbolM.save_data(symbol_name, SYMBOL_w2v)

success save : ../models/context_topic/toyoshima_symbol.pickle


In [50]:
def toyoshima_Xy_str(convs):
    errors = ["Topic transition error"]
    X_str = []
    y = []
    for conv in convs:
        for i, ut in enumerate( conv ) :
            if ut.is_system() and not ut.is_utt_level_error():
                X_str.append( [conv[i-1].utt, ut.utt]  )
                if ut.is_error_included(errors):
                    y.append(1)
                else:
                    y.append(0)
    return X_str, y

In [51]:
X_str, y = toyoshima_Xy_str(convs)

In [108]:
toyoshima_set = set("NOUN PROPN VERB ADJ".split())

def w2v(word, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    if word in SYMBOL_w2v:
        vector = SYMBOL_w2v[word]
    elif word in w2v_model:
        vector = w2v_model[word]
    else:
        vector = SYMBOL_w2v["[UNK]"]
    return vector

def filtering(doc, filter_set):
    left = []
    for token in doc:
        if token.pos_ in filter_set:
            left.append(token.lemma_)
    return left if len(left)>0 else ["[NONE]"]

def doc2vec(doc, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    left = filtering(doc, toyoshima_set)
    return np.mean([ w2v(w, w2v_model, SYMBOL_w2v) for w in left], axis=0)

# 副詞など，ほぼすべて
def doc2vec2(doc, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    left = filtering(doc, independent_set)
    return np.mean([ w2v(w, w2v_model, SYMBOL_w2v) for w in left], axis=0)

def sentence2formated(sen, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    docs = sentence2docs(sen, sents_span=False)
    vector = []
    for i, doc in enumerate(docs):
        if i==0:
            prev_vector = doc2vec2(doc, w2v_model, SYMBOL_w2v)
        else:
            current_vector = doc2vec2(doc, w2v_model, SYMBOL_w2v)
            diff_vec = np.abs(prev_vector-current_vector)
            norm = np.linalg.norm(diff_vec)
            if norm==0:
                norm = 1            
            vector.append( diff_vec/norm )
            prev_vector = current_vector
    return vector


In [109]:
from tqdm import tqdm
X = []
for x_str in tqdm( X_str ):
    feature = sentence2formated(x_str, w2v_model, SYMBOL_w2v)[0]
    X.append(feature)

100%|██████████| 1584/1584 [00:48<00:00, 32.83it/s]


In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.30, random_state=5, stratify=y)

In [125]:
from sklearn import svm
# clf = AdaBoostClassifier()
clf = svm.SVC(kernel='rbf', gamma =0.0001, C=100000)

In [126]:
clf.fit(X_train, y_train)

SVC(C=100000, gamma=0.0001)

In [127]:
y_pred = clf.predict(X_test)

In [128]:
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[397  29]
 [ 48   2]]
accuracy =  0.8382352941176471
precision =  0.06451612903225806
recall =  0.04
f1 score =  0.04938271604938271


In [120]:
y.count(1)

166