In [3]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [4]:
# Location of the sentence-embedding model directory passed to SentenceTransformer.
model_path = "../../corpus/sbert_stair"
# data_name = "hate_labeled.csv"

In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
# Load the embedding model from `model_path`; `text2vec` uses this global to encode text.
sbert = SentenceTransformer(model_path)

In [6]:
def text2vec(text):
    """Encode one sentence or a collection of sentences with the global ``sbert`` model.

    Args:
        text: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...). ``None`` is treated as an empty collection.

    Returns:
        Whatever ``sbert.encode`` returns for the resulting list of sentences.

    Raises:
        TypeError: If ``text`` is neither a string, ``None`` nor an iterable.
    """
    if isinstance(text, str):
        # A bare string is a single sentence, not an iterable of characters.
        sentences = [text]
    elif text is None:
        sentences = []
    else:
        # Previously only `list` was accepted and every other sequence type
        # was silently replaced by an empty list; materialise any iterable.
        sentences = list(text)

    return sbert.encode(sentences)


In [7]:
def make_dataset(df, mode="All", path="../hand_labeled/", datalist=None, seed=None):
    """Build parallel sentence/label lists from a labeled frame plus sampled label-0 text.

    Every row of ``df`` contributes its ``txt`` as a sample and its ``label``
    as the target. The set is then extended with user (non-system) utterances
    read via ``read_conv`` and cleaned with ``clean_text``; each of those is
    given label 0. The number of added utterances is one third of the number
    of rows in ``df``, capped at the number of available user utterances.

    Args:
        df: Object exposing ``label`` and ``txt`` columns of equal length.
        mode: Only ``"All"`` is supported.
        path: Directory handed to ``read_conv``.
        datalist: Corpus names handed to ``read_conv``; defaults to
            ``['DCM', 'DIT', 'IRS']``.
        seed: Optional seed for the utterance sampling. ``None`` keeps the
            previous behaviour of drawing from the global ``random`` state.

    Returns:
        Tuple ``(X, y)`` of sentences and their labels.

    Raises:
        ValueError: If ``mode`` is not supported (previously this silently
            produced an empty dataset).
    """
    import random

    # Fail fast, before any corpus is read from disk.
    if mode != "All":
        raise ValueError("unsupported mode: {0!r} (expected 'All')".format(mode))

    if datalist is None:
        datalist = ['DCM', 'DIT', 'IRS']
    convs = read_conv(path, datalist)

    # Pool of cleaned user-side utterances used as additional label-0 samples.
    usr_utt = [
        clean_text(ut.utt)
        for conv in convs
        for ut in conv
        if not ut.is_system()
    ]

    X = list(df.txt)
    y = list(df.label)
    print("length of X", len(X))

    # Increase the number of label-0 samples. The request is capped so that
    # random.sample cannot fail when fewer user utterances are available.
    n_extra = min(len(X) // 3, len(usr_utt))
    # Without a seed, keep using the module-level generator so that a caller's
    # earlier random.seed(...) still governs the draw.
    rng = random if seed is None else random.Random(seed)
    sampled = rng.sample(usr_utt, n_extra)
    X.extend(sampled)
    y.extend([0] * len(sampled))
    print("added length of X", len(X))

    return X, y

In [8]:
# Read the labeled CSV and turn it into sentence/label lists via make_dataset.
corpus_path = "../../corpus/twitter/"
data_name = "hate_labeled.csv"
df = pd.read_csv(corpus_path+data_name)
X, y = make_dataset(df)

length of X 1138
added length of X 1517


In [9]:
# Hold out 30% of the samples for testing; random_state is fixed so the split is repeatable.
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [10]:
# Embed the training sentences with the sentence-embedding model.
X_train = text2vec(X_train_str)

In [11]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
# RBF-kernel SVM is the active classifier; the AdaBoost line is a disabled alternative.
# clf = AdaBoostClassifier()
clf = svm.SVC(kernel='rbf', gamma =0.0001, C=1000)

In [12]:
# Train the classifier on the sentence embeddings and their labels.
clf.fit(X_train, y_train)

SVC(C=1000, gamma=0.0001)

In [13]:
# Sanity check: show the first five training sentences and their labels.
print(X_train_str[:5])
print(y_train[:5])

['実際あったら最高に頭悪いマザボだな', 'チャットをどれくらいしますか', 'たまに画質ゴミゴミになるのなに……', 'それなー、運営も頭悪い', '喧嘩に関しちゃあ……馬鹿じゃねえんだな']
[2, 0, 0, 2, 0]


In [14]:
# Embed the held-out sentences the same way as the training sentences.
X_test = text2vec(X_test_str)

In [15]:
# Predict a label for every held-out sentence.
y_pred = clf.predict(X_test)

In [16]:
# Show the first 30 held-out sentences next to their predicted label.
preview_pairs = list(zip(y_pred, X_test_str))[:30]
for predicted_label, sentence in preview_pairs:
    print("{0} : {1}".format(predicted_label, sentence))

1 : 昨日飲み過ぎて頭悪いの
2 : サムスンは死ね
1 : なんで俺馬鹿にされてるん？
3 : へをこいてはよう死ね
0 : 誰のシングルですか？
2 : 塾終わったらまた行きたいけど親多分いるあー死ね
2 : マスクゴミ動画量産女たちは韓国の整形美女軍団見習ってほしい
2 : 後払いなら減給じゃなくて草あいつ馬鹿だろ
3 : 死ね！
0 : このメンバーの飲み会馬鹿笑いできるから楽しい
0 : 君はお酒飲むのかい？
0 : こんにちは。私は野球観戦が好きですね。
0 : ボードですか、楽しそうですね
0 : 久々に絡みたかったので馬鹿にしてみました
3 : 馬鹿だが?
0 : 夏と冬どっちがいい？
2 : 本当山口俊って頭悪いんだよね
0 : 羨ましいです。私は行ったことが無いんで行ってみたいです。
2 : きめえんだよゴミコンクリート流し込んだろか
3 : 死ね沖田
0 : 最近は特保コーラにハマってます。特保飲料は飲みますか
0 : 他にどんな食べ物が好きですか？
1 : 馬鹿にされちゃう！！！！！！
0 : 馬鹿左翼必死やなぁ
1 : いやいや言い方変えると自分はあの時無謀で馬鹿な人です
0 : すごいですね！
0 : そうでしたっけ。バラエティー番組は見ていて楽しいから好きなんですよ。
3 : んな馬鹿な
0 : ありがとうございます 私達飼い主も親馬鹿ですがかわいいと思ってます
1 : あ、馬鹿だ。ツイ消ししちゃった…馬鹿だ…


In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Multi-class evaluation on the held-out split (scikit-learn's matrix has true labels as rows, predictions as columns).
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[146   6   9   5]
 [  4  24  10   7]
 [  9   5  75  19]
 [ 10   6  24  97]]
accuracy =  0.75


In [18]:
import copy
def convert_bool(p, threshold=1):
    """Collapse a sequence of numeric labels into binary 0/1 labels.

    Elements that are ``<= threshold`` become 0, all others become 1. The
    input is left untouched; the result is a deep copy of ``p`` with its
    elements overwritten, so it has the same container type as ``p``
    (e.g. list in, list out; ndarray in, ndarray out).

    Args:
        p: Integer-indexable, mutable sequence of comparable values.
        threshold: Largest value that still maps to 0. Defaults to 1,
            which reproduces the previously hard-coded cut-off.

    Returns:
        A copy of ``p`` holding only 0 and 1.
    """
    new_p = copy.deepcopy(p)
    for i, v in enumerate(p):
        new_p[i] = 0 if v <= threshold else 1
    return new_p

In [19]:
# Collapse true and predicted labels to binary: labels <= 1 become 0, larger labels become 1.
y_test_ = convert_bool(y_test)
y_pred_ = convert_bool(y_pred)

In [20]:
# Binary (collapsed-label) evaluation on the held-out split.
print('confusion matrix = \n', confusion_matrix(y_true=y_test_, y_pred=y_pred_))
binary_metrics = (
    ('accuracy = ', accuracy_score),
    ('precision = ', precision_score),
    ('recall = ', recall_score),
    ('f1 score = ', f1_score),
)
for caption, score_fn in binary_metrics:
    print(caption, score_fn(y_true=y_test_, y_pred=y_pred_))

confusion matrix = 
 [[180  31]
 [ 30 215]]
accuracy =  0.8662280701754386
precision =  0.8739837398373984
recall =  0.8775510204081632
f1 score =  0.8757637474541752


In [21]:
# Hand-written probe sentences for a quick qualitative check of the classifier.
tests = ["知らんのかい", "お前馬鹿だろ", "センスないね", "雑魚がよ", "俺が馬鹿だった","馬鹿みたいに笑ったw", "消えろカス", "お前頭が悪いな", "あの国のやつらはクズ"]

In [22]:
# Embed the probe sentences, classify them, and collapse the predictions to binary.
vec = text2vec(tests)
test_res = convert_bool( clf.predict(vec) ) 

In [21]:
# Print the binary verdict next to each probe sentence.
probe_results = zip(test_res, tests)
for verdict, sentence in probe_results:
    print("{0} : {1}".format(verdict, sentence))

0 : 知らんのかい
1 : お前馬鹿だろ
0 : センスないね
0 : 雑魚がよ
0 : 俺が馬鹿だった
0 : 馬鹿みたいに笑ったw
1 : 消えろカス
1 : お前頭が悪いな
1 : あの国のやつらはクズ


In [None]:
# From here on: test on the training data (the hand-labeled conversations loaded below)

In [2]:
# Load the conversations of the listed corpora from the hand-labeled directory.
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [29]:

error = "Lack of sociality"

# Keep only system utterances that carry at least one annotated error.
flagged_utts = [
    ut
    for conv in convs
    for ut in conv
    if ut.is_system() and ut.is_exist_error()
]

# Raw utterance text, and 1/0 depending on whether `error` is among its annotations.
sys_utt = [ut.utt for ut in flagged_utts]
y_ = [1 if ut.is_error_included(error) else 0 for ut in flagged_utts]


In [30]:
# Report how many utterances were collected and how many carry the target error label.
print("len of y:{0}, error '{1}' counts:{2}".format(len(y_), error, y_.count(1)))

len of y:1349, error 'Lack of sociality' counts:1


In [31]:
# Classify the collected system utterances and collapse the predictions to binary.
# NOTE: this rebinds `vec` and `y_pred_`, which earlier cells used for other data.
vec = text2vec(sys_utt)
y_pred_ = convert_bool( clf.predict(vec) ) 

In [32]:
# Binary evaluation of the classifier against the error annotations in `y_`.
print('confusion matrix = \n', confusion_matrix(y_true=y_, y_pred=y_pred_))
corpus_metrics = (
    ('accuracy = ', accuracy_score),
    ('precision = ', precision_score),
    ('recall = ', recall_score),
    ('f1 score = ', f1_score),
)
for caption, score_fn in corpus_metrics:
    print(caption, score_fn(y_true=y_, y_pred=y_pred_))

confusion matrix = 
 [[1314   34]
 [   1    0]]
accuracy =  0.9740548554484804
precision =  0.0
recall =  0.0
f1 score =  0.0


In [34]:
# for utt, a, b in zip(sys_utt, y_,  y_pred_):
#     if a==0 and b==1:
#         print(utt)

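- 検出は不可能でしょこれは: 'Lack of sociality' の正例は 1349 件中 1 件しかなく、その 1 件も検出できていない (recall = 0)。正例が 1 件では precision / recall / F1 による評価に意味がない。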

        confusion matrix = 
        [[1314   34]
        [   1    0]]
        accuracy =  0.9740548554484804
        precision =  0.0
        recall =  0.0
        f1 score =  0.0