In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [2]:
# Location of the SentenceTransformer model that is loaded into `sbert` below.
# NOTE(review): `model_path` is rebound near the end of the notebook to the
# directory where the trained classifier is saved, so re-running cells out of
# order can load the wrong path.
model_path = "../../corpus/sbert_stair2"
# data_name = "hate_labeled.csv"

In [3]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
# Sentence encoder shared by every `text2vec` call in this notebook.
sbert = SentenceTransformer(model_path)

[5100] 2022-01-12 20:48:04,318 Info sentence_transformers.SentenceTransformer :Load pretrained SentenceTransformer: ../../corpus/sbert_stair2
[5100] 2022-01-12 20:48:07,127 Info sentence_transformers.SentenceTransformer :Use pytorch device: cuda


In [4]:
def text2vec(text):
    """Encode one sentence or a collection of sentences with the global `sbert`.

    Args:
        text: a single string, or a list/tuple of strings.

    Returns:
        Whatever `sbert.encode` returns for the resulting list of sentences.
        Any other input type is encoded as an empty list of sentences, which
        keeps the original fallback behavior.
    """
    if isinstance(text, str):
        sentences = [text]
    elif isinstance(text, (list, tuple)):
        # Tuples used to fall through to the empty fallback and were silently
        # encoded as "no sentences"; treat them like lists instead.
        sentences = list(text)
    else:
        sentences = []

    return sbert.encode(sentences)


In [5]:
# Collapses the four original label values into two classes:
# labels 0 and 1 become class 0, labels 2 and 3 become class 1.
la2la = {0:0,1:0, 2:1, 3:1}

In [6]:
def make_dataset(df, mode="All", seed=None):
    """Build the text/label lists used to train the binary classifier.

    Every row of `df` contributes its `txt` as a sample and its `label`,
    collapsed through the global `la2la` mapping, as the target. The set is
    then padded with extra class-0 samples drawn at random from the user
    (non-system) utterances of the conversations under ``../hand_labeled/``.

    Args:
        df: frame exposing `label` and `txt` columns.
        mode: only "All" builds a dataset; any other value returns two empty
            lists (unchanged from the original behavior).
        seed: optional seed for the negative sampling. When None the global
            `random` state is used, exactly as before; pass an int to make the
            sampled negatives reproducible.

    Returns:
        (X, y): list of texts and list of integer class labels, same length.
    """
    import random  # kept function-local so the cell does not depend on a notebook-level import

    path = "../hand_labeled/"
    datalist = ['DCM', 'DIT', 'IRS']
    convs = read_conv(path, datalist)

    # Pool of cleaned user utterances used as additional class-0 examples.
    usr_utt = [
        clean_text(ut.utt)
        for conv in convs
        for ut in conv
        if not ut.is_system()
    ]

    X = []
    y = []
    if mode == "All":
        for la, txt in zip(df.label, df.txt):
            X.append(txt)
            y.append(la2la[la])
        print("length of X", len(X))

        # Add more class-0 samples: one extra utterance per three labeled rows.
        # Clamp to the pool size so a small conversation corpus no longer makes
        # random.sample raise ValueError.
        n_extra = min(len(X) // 3, len(usr_utt))
        rng = random if seed is None else random.Random(seed)
        for sample in rng.sample(usr_utt, n_extra):
            X.append(sample)
            y.append(0)
        print("added length of X", len(X))

    return X, y

In [7]:
# Load the labeled corpus CSV; make_dataset reads its `label` and `txt` columns.
corpus_path = "../../corpus/twitter/"
# data_name = "hate_labeled.csv"
data_name = "impolite.csv"
df = pd.read_csv(corpus_path+data_name)


In [8]:
# Keep only the first `max_` rows of the corpus.
# NOTE(review): this rebinds `df`, so the full frame is gone until the load cell is re-run.
max_ = 1100
df = df[:max_]

In [9]:
# Build texts and binary labels (labeled rows plus randomly sampled class-0 utterances).
X, y = make_dataset(df)

length of X 1100
added length of X 1466


In [10]:
# 70/30 train/test split on the raw strings; random_state is fixed so the split is repeatable.
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [11]:
# Encode the training sentences with the sentence encoder.
X_train = text2vec(X_train_str)

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

In [13]:
class Datasets(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature item with its label by index."""

    def __init__(self, X_data, y_data):
        """Store the two indexable containers and cache the sample count.

        Args:
            X_data: indexable container of features; its length defines the dataset size.
            y_data: indexable container of labels, indexed in parallel with `X_data`.
        """
        self.X_data, self.y_data = X_data, y_data
        # The size is taken once, from the feature container only.
        self.datanum = len(X_data)

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.datanum

    def __getitem__(self, idx):
        """Return the `(feature, label)` pair stored at position `idx`."""
        return self.X_data[idx], self.y_data[idx]

In [14]:
class SocialModel(nn.Module):
    """Three-layer feed-forward classifier that emits per-class log-probabilities."""

    def __init__(self, embedding_dim, tagset_size):
        """Create the layers.

        Args:
            embedding_dim: width of each input vector.
            tagset_size: number of output classes.
        """
        # Standard nn.Module initialization boilerplate.
        super().__init__()
        self.embedding_dim = embedding_dim
        # Hidden widths derive from the input width: widen to 2x, then narrow to half.
        self.hid1 = 2 * embedding_dim
        self.hid2 = embedding_dim // 2
        self.fc1 = nn.Linear(embedding_dim, self.hid1)
        self.fc2 = nn.Linear(self.hid1, self.hid2)
        self.hidden2tag = nn.Linear(self.hid2, tagset_size)

    def forward(self, x):
        """Map a batch `x` (classes on dim 1 of the output) to log-softmax scores."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        return F.log_softmax(self.hidden2tag(hidden), dim=1)

In [15]:
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn import svm
# # clf = AdaBoostClassifier()
# clf = svm.SVC(kernel='rbf', gamma =0.0001, C=1000)

In [16]:
# clf.fit(X_train, y_train)

In [17]:
# Training hyper-parameters and the shuffled mini-batch loader over the encoded training set.
BATCH_SIZE = 64
epoch_ = 300  # number of passes over the training set
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [18]:
# Model dimensions.
# NOTE(review): EMBEDDING_DIM presumably has to equal the width of the vectors
# returned by sbert.encode; confirm against the loaded model.
EMBEDDING_DIM = 768
# HIDDEN_DIM is only printed here; SocialModel derives its own hidden sizes from embedding_dim.
HIDDEN_DIM = EMBEDDING_DIM*2
OUTPUT_DIM = 2  # two classes, matching the values produced by la2la
# seq_len = length
print(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

768 1536 2


In [19]:
# Build the classifier and move it to the GPU when one is present.
model = SocialModel(EMBEDDING_DIM, OUTPUT_DIM)
if torch.cuda.is_available():
    model = model.cuda()
# NLLLoss is the counterpart of the log_softmax output of SocialModel.forward.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)

In [21]:
# Train for `epoch_` epochs, recording the summed mini-batch loss of each epoch in `losses`.
losses = []
loss_border = 0.0001  # threshold for the early-stopping check, which is currently disabled
# Send batches to wherever the model's parameters live. The model is only moved
# to the GPU when CUDA is available, so an unconditional .cuda() on the batches
# would fail on CPU-only machines.
device = next(model.parameters()).device
for epoch in range(epoch_):
    all_loss = 0
    for data in trainloader:
        X_t_tensor = data[0].to(device)
        y_t_tensor = data[1].to(device)
        # The optimizer owns every model parameter, so one zero_grad call is enough.
        optimizer.zero_grad()
        score_ = model(X_t_tensor)
        loss_ = loss_function(score_, y_t_tensor)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
    # Early stopping (disabled):
    # if all_loss <= loss_border:
    #     print("loss was under border(={0}) : train end".format(loss_border))
    #     break
print("done")

epoch 50 	 loss 0.4861138644628227
epoch 100 	 loss 0.03653045691316947
epoch 150 	 loss 0.009939842857420444
epoch 200 	 loss 0.003859727649796696
epoch 250 	 loss 0.002347189947613515
epoch 300 	 loss 0.0009911896995618008
done


In [22]:
# Sanity check: show the first five training sentences with their labels.
print(X_train_str[:5])
print(y_train[:5])

['が、こけた早苗ちゃんと同類のようにね!', '起きてるなら付き合え', 'ラインの無料漫画を読んでいます', 'ウザ', 'おいも片付けていけや']
[1, 1, 0, 1, 1]


In [23]:
# Encode the held-out test sentences with the same encoder used for training.
X_test = text2vec(X_test_str)

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [None]:
# y_pred = clf.predict(X_test)

In [24]:
# Classify the held-out test embeddings with the trained model.
# Use the device the model's parameters live on instead of a hard-coded
# 'cuda:0', so this cell also runs when the model stayed on the CPU.
device = next(model.parameters()).device
with torch.no_grad():
    X_tensor = torch.tensor(X_test, device=device).float()
    # Inference: the predicted class is the index of the largest log-probability.
    y_pred = model(X_tensor).cpu().numpy().argmax(axis=1)

In [25]:
# Eyeball the first 30 test predictions, printed as "<predicted class> : <sentence>".
for y_p, x_s in zip(y_pred[:30], X_test_str[:30]):
    print("{0} : {1}".format(y_p, x_s))

0 : 最近俺で遊びすぎWW
1 : おみげのディスやべーやん
1 : まとめてぶっ飛ばす
1 : 寧ろ地獄を見るのは安倍晋三と与党の屑、ぢゃい
0 : やってませんよ。でもジャズも好きです。
1 : お前のせいだからな
1 : 調べるとフラウンダーを鰈とする所もあって、鰈と鮃区別つけてないやろってなってる
0 : 空耳では?
0 : 沖縄行きたいです!
0 : どーしたの．同じくらい好きだよ
1 : 絶対半年に0回しか遊んでくれん
1 : ずるだったん知らんかった
0 : へー。最近のお気に入りは煮干し系ラーメンです。
1 : お前が落ち着きなぁ!
1 : お前が金出せよ。
1 : がんばれ
1 : なんて!幸せになれよ!クソ野郎!
1 : 私に怒って当たるな!!お前のせいだろうが!!
0 : 観光地の食べ物は高いですもんね。
1 : ええて
0 : 盛大にやってんなW
0 : 海いいですね
0 : 今日の予定は?
0 : 君の方が大事だから死ぬなよ…
0 : 巻き込むなよ
1 : 食えない俺より、一々そんな反応するの貧乏臭さの方が滑稽だわ
0 : 夜もバロすんの?
1 : お前のせいなんだぞッ!!
1 : プリ撮ったんなら見せろ
1 : 落ち着けって


In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Quick look at the test-set confusion matrix and accuracy.
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[190  25]
 [ 16 209]]
accuracy =  0.9068181818181819


In [28]:
def score(test, pred):
    """Print classification metrics for `pred` against the ground truth `test`.

    The confusion matrix and accuracy are always printed. Precision, recall
    and F1 use sklearn's default binary averaging, so they are printed only
    when at most two distinct labels occur across BOTH `test` and `pred`.
    The check used to look at `pred` alone, so multiclass ground truth paired
    with two or fewer predicted classes crashed inside `precision_score`.

    Args:
        test: iterable of true labels (hashable scalars).
        pred: iterable of predicted labels, same length as `test`.
    """
    print('confusion matrix = \n', confusion_matrix(y_true=test, y_pred=pred))
    print('accuracy = ', accuracy_score(y_true=test, y_pred=pred))

    distinct_labels = set(test) | set(pred)
    if len(distinct_labels) <= 2:
        print('precision = ', precision_score(y_true=test, y_pred=pred))
        print('recall = ', recall_score(y_true=test, y_pred=pred))
        print('f1 score = ', f1_score(y_true=test, y_pred=pred))

In [29]:

# Full metric report for the feed-forward classifier on the test split.
score(y_test, y_pred)

confusion matrix = 
 [[190  25]
 [ 16 209]]
accuracy =  0.9068181818181819
precision =  0.8931623931623932
recall =  0.9288888888888889
f1 score =  0.9106753812636166


-  SVM 

        confusion matrix = 
        [[185  30]
        [ 34 191]]
        accuracy =  0.8545454545454545

- FFN

        confusion matrix = 
        [[190  25]
        [ 16 209]]
        accuracy =  0.9068181818181819
        precision =  0.8931623931623932
        recall =  0.9288888888888889
        f1 score =  0.9106753812636166

In [41]:
import copy
def convert_bool(p):
    """Return a deep copy of `p` with every element binarized.

    Elements `<= 1` become 0 and all others become 1. `p` itself is left
    untouched, and the result keeps the container type of `p`.
    """
    result = copy.deepcopy(p)
    for idx, value in enumerate(p):
        result[idx] = 0 if value <= 1 else 1
    return result

def convert_bool2(p):
    """Return an independent deep copy of `p`; no values are converted."""
    return copy.deepcopy(p)

In [42]:
# convert_bool2 only deep-copies, so these are plain copies of the test labels and predictions.
# NOTE(review): `y_pred_` is reused later for the predictions on the evaluation dialogues.
y_test_ = convert_bool2(y_test)
y_pred_ = convert_bool2(y_pred)

In [43]:
# Same five metrics that score() prints, computed on the copied labels and predictions.
print('confusion matrix = \n', confusion_matrix(y_true=y_test_, y_pred=y_pred_))
print('accuracy = ', accuracy_score(y_true=y_test_, y_pred=y_pred_))
print('precision = ', precision_score(y_true=y_test_, y_pred=y_pred_))
print('recall = ', recall_score(y_true=y_test_, y_pred=y_pred_))
print('f1 score = ', f1_score(y_true=y_test_, y_pred=y_pred_))

confusion matrix = 
 [[185  30]
 [ 34 191]]
accuracy =  0.8545454545454545
precision =  0.8642533936651584
recall =  0.8488888888888889
f1 score =  0.8565022421524665


In [31]:
# Hand-written probe sentences for a qualitative check of the classifier.
tests = ["知らんのかい", "お前馬鹿だろ", "センスないね", "雑魚がよ", "俺が馬鹿だった","馬鹿みたいに笑ったw", "消えろカス", "お前頭が悪いな", "あの国のやつらはクズ"]

In [32]:
# Embed the hand-written probe sentences and classify them with the trained model.
vec = text2vec(tests)
# Use the device the model's parameters live on instead of a hard-coded
# 'cuda:0', so this cell also runs when the model stayed on the CPU.
device = next(model.parameters()).device
with torch.no_grad():
    X_tensor = torch.tensor(vec, device=device).float()
    # Inference: the predicted class is the index of the largest log-probability.
    test_res = model(X_tensor).cpu().numpy().argmax(axis=1)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
# Print each probe sentence next to its predicted class.
for y_p, x_s in zip(test_res, tests):
    print("{0} : {1}".format(y_p, x_s))

1 : 知らんのかい
1 : お前馬鹿だろ
0 : センスないね
1 : 雑魚がよ
1 : 俺が馬鹿だった
1 : 馬鹿みたいに笑ったw
1 : 消えろカス
1 : お前頭が悪いな
1 : あの国のやつらはクズ


In [None]:
# From here on: test on the labeled dialogue corpus

In [35]:
# Load the labeled evaluation conversations for the three listed sub-corpora.
path = "../eval_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [36]:

# Collect every system utterance that carries at least one error annotation,
# and label it 1 when the annotation includes the target error type, else 0.
error = "Lack of sociality"
sys_utt = []
y_ = []
for conv in convs:
    for ut in conv:
        # Skip user turns and system turns without any annotated error.
        if not (ut.is_system() and ut.is_exist_error()):
            continue
        sys_utt.append(ut.utt)
        y_.append(1 if ut.is_error_included(error) else 0)


In [37]:
# Report how many utterances were collected and how many carry the target error.
print("len of y:{0}, error '{1}' counts:{2}".format(len(y_), error, y_.count(1)))

len of y:1386, error 'Lack of sociality' counts:7


In [38]:
# Embed the collected system utterances and classify them with the trained model.
vec = text2vec(sys_utt)

# Use the device the model's parameters live on instead of a hard-coded
# 'cuda:0', so this cell also runs when the model stayed on the CPU.
device = next(model.parameters()).device
with torch.no_grad():
    X_tensor = torch.tensor(vec, device=device).float()
    # Inference: the predicted class is the index of the largest log-probability.
    y_pred_ = model(X_tensor).cpu().numpy().argmax(axis=1)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

In [39]:
# Metrics on the evaluation dialogues. Positives are very rare here (see the
# count printed above), so accuracy alone says little; read precision/recall.
print('confusion matrix = \n', confusion_matrix(y_true=y_, y_pred=y_pred_))
print('accuracy = ', accuracy_score(y_true=y_, y_pred=y_pred_))
print('precision = ', precision_score(y_true=y_, y_pred=y_pred_))
print('recall = ', recall_score(y_true=y_, y_pred=y_pred_))
print('f1 score = ', f1_score(y_true=y_, y_pred=y_pred_))

confusion matrix = 
 [[1329   50]
 [   3    4]]
accuracy =  0.9617604617604618
precision =  0.07407407407407407
recall =  0.5714285714285714
f1 score =  0.13114754098360656


In [46]:
# Print the true positives: utterances labeled with the target error and also predicted as class 1.
for utt, a, b in zip(sys_utt, y_,  y_pred_):
    if a==1 and b==1:
        print(utt)

ようよう
分かったからそう急かすな
おはよん。
なんで？


In [47]:
# Persist the trained classifier through the project's DataManager.
# NOTE(review): this rebinds `model_path`, which earlier held the SentenceTransformer
# location; re-running the encoder-loading cell after this one would pick up the
# wrong path.
# NOTE(review): the ".pickle" name suggests the whole model object is pickled;
# confirm in DataManager.save_data, and only load such files from trusted sources.
model_path = "../models/social/"
model_name = "impolite.pickle"
modelM = DataManager(model_path)
modelM.save_data(model_name, model)

success save : ../models/social/impolite.pickle


- 検出は不可能でしょこれは

        confusion matrix = 
        [[1314   34]
        [   1    0]]
        accuracy =  0.9740548554484804
        precision =  0.0
        recall =  0.0
        f1 score =  0.0