In [1]:
import os


import pprint
import random
import numpy as np
import pandas as pd

import spacy
import sys
sys.path.append("../")
from datatools.analyzer import *
# from datatools.analyzer import clean_text
from error_tools import *



In [2]:
from nltk.lm        import Vocabulary
# from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated
from nltk.util      import ngrams

In [3]:
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
error_types = ['Unclear intention', 'Wrong information',
 'Ignore question', 'Topic transition error', 
 'Lack of information', 'Repetition', 
 'Contradiction', 'Self-contradiction',
  'Lack of common sense', 'Semantic error',
   'Grammatical error', 'Ignore proposal', 
   'Ignore offer', 'Lack of sociality', 
   'Uninterpretable', 'Ignore greeting', 
   'No-Err']


In [4]:
convs = read_conv(path, datalist)

In [5]:
# usr_utt = []
# for conv in convs:
#     for i, ut in enumerate(conv):
#         if not ut.is_system():
#             usr_utt.append(clean_text(ut.utt))
            

In [6]:
from tqdm import tqdm
def extract_utt_nucc(path):
    files = os.listdir(path)
    nucc_convs = []
    for filename in tqdm(files):
        if ".json" not in filename:
            continue
        # name = filename.split(".")[0]
        with open(path+filename, "r") as f:
            data  = json.load(f)
            for conv in data["turns"]:
                utt = conv["utterance"]
                if len(nlp(utt)) < 2:
                    # print(utt)
                    continue
                nucc_convs.append(utt)
    return nucc_convs

In [49]:
# nuccデータ
nucc_path = "../../corpus/nucc/conv2/"
nucc_convs = extract_utt_nucc(nucc_path)

100%|██████████| 91/91 [08:17<00:00,  5.47s/it]


In [8]:
# usr_utt += nucc_convs

In [50]:
# mode phrase

phrase_data = "../../corpus/gogakuru/phrases.csv"
df = pd.read_csv(phrase_data)
corpus = list( df["phrase"].values ) + nucc_convs 


In [51]:
len(corpus)

109758

In [88]:
# filled_normal = fill_SYMBOL( sentence2normalize_nv(corpus) )
# filled_normal = fill_SYMBOL(sentence2normalize_independent(corpus) )
filled_normal = fill_SYMBOL( sentence2normalize_noun(corpus) )

In [89]:
def create_language_model(sentences, N):
    vocab = Vocabulary([word for sent in sentences for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sentences]
    lm = KneserNeyInterpolated(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm

In [107]:
n=3
lm = create_language_model(filled_normal, N=n)

In [109]:
prob_list = []
for word in lm.context_counts(lm.vocab.lookup(context)): # 文脈に続く単語一覧の取得
    prob_list.append((word, lm.score(word, context))) # 単語のその出現する確率を格納

prob_list.sort(key=lambda x: x[1], reverse=True) # 出現確率順にソート
for word, prob in prob_list:
    print('\t{:s}: {:f}'.format(word, prob))

In [125]:
import math
import pprint
def sentence2score(sentence, l, N):
    # filled = fill_SYMBOL( sentence2normalize_nv(sentence) )
    filled = fill_SYMBOL( sentence2normalize_noun(sentence) )
    # filled = fill_SYMBOL( sentence2normalize_independent(sentence) )
    filled_pos = fill_SYMBOL( sentence2pos(sentence) )
    print(filled)
    print(filled_pos)
    ngram_text = []
    ngram_pos = []

    function_score = 0
    # デフォルトで1
    function_num = 1

    for L, P in zip(filled,filled_pos):
        for i in range(len(L)-N+1):
            # print(L[i:i+N])
            ngram_text.append(L[i:i+N])
            ngram_pos.append(P[i:i+N])
    # pprint.pprint(ngram_text)
    all_score = 0
    for ngram, pgram in zip(ngram_text, ngram_pos):
        context = (ngram[:-1])
        context_pos = pgram[:-1]
        # print(context)
        # for word in lm.context_counts(lm.vocab.lookup(context)): # 文脈に続く単語一覧の取得
            
        score = lm.score(ngram[-1], context) + 1e-4
        
        log_score = math.log2(score)
        if "助動詞" in context_pos[1] or "助詞" in context_pos[1] or "助動詞" in context_pos[0] or "助詞" in context_pos[0]:
            print("\tcontext : {0}| ->".format(context), log_score)
            function_score += log_score
            function_num += 1
    # print(all_score/len(ngram_text))
    return function_score/function_num
    

In [127]:

# n=4
sentence = "最近とても暑いですから。"
sentence = "ご存知ですいます"
# sentence = "はい、そうですよ"
sentence2score(sentence, lm, N=n)

[['FOS', 'FOS', '名詞-普通名詞-一般', 'です', 'い', 'ます', 'EOS', 'EOS']]
[['FOS', 'FOS', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '助動詞', 'EOS', 'EOS']]
	context : ['名詞-普通名詞-一般', 'です']| -> -13.285736490564451
	context : ['です', 'い']| -> -12.3053448696396
	context : ['い', 'ます']| -> -10.39782365643794
	context : ['ます', 'EOS']| -> -0.01302851172990395


-7.20038670567438

In [112]:
sentence = "はい、そうですよ。"
sentence2score(sentence, lm, N=n)

[['FOS', 'FOS', 'はい', '、', '副詞', 'です', 'よ', '。', 'EOS', 'EOS']]
[['FOS', 'FOS', '感動詞-一般', '補助記号-読点', '副詞', '助動詞', '助詞-終助詞', '補助記号-句点', 'EOS', 'EOS']]
	context : ['副詞', 'です']| -> -12.3053448696396
	context : ['です', 'よ']| -> -1.0935841497164953
	context : ['よ', '。']| -> -0.005359257833236664


-3.3510720692973326

In [113]:
filled = fill_SYMBOL( sentence2normalize_noun(sentence) )

In [114]:
ngram_text = []
for L in filled:
    for i in range(len(L)-n+1):
            # print(L[i:i+N])
        ngram_text.append(L[i:i+n])
print(ngram_text)

[['FOS', 'FOS', 'はい'], ['FOS', 'はい', '、'], ['はい', '、', 'そう'], ['、', 'そう', 'です'], ['そう', 'です', 'よ'], ['です', 'よ', '。'], ['よ', '。', 'EOS'], ['。', 'EOS', 'EOS']]


In [115]:
lm.perplexity(ngram_text)

8.452979569439512

In [116]:
from datatools.maneger import DataManager

In [117]:
modelM = DataManager("../models/utterance/")
# model_name = "KLM_phrase_n={0}.pickle".format(n)
# model_name = "KLM_phrase_nucc_n={0}.pickle".format(n)
model_name = "KLM_phrase_nucc_n={0}_noun.pickle".format(n)

In [118]:

modelM.save_data(model_name, lm)

success save : ../models/utterance/KLM_phrase_nucc_n=3_noun.pickle


In [119]:
lm = modelM.load_data(model_name)

success load : ../models/utterance/KLM_phrase_nucc_n=3_noun.pickle


In [120]:
# errors = ['Grammatical error', "Uninterpretable"]

# y = []
# n=3
# y_pred = []
# for conv in convs:
#     for ut in conv:
#         if not ut.is_system():
#             continue
#         # エラーなら1
#         if ut.is_error_included(errors):
#             # print(ut)
#             y.append(1)
#         else:
#             y.append(0)
#         #LM 判定
#         # エラーなら1
#         if sentence2score(ut.utt, lm, N=n) < -5.6:
#             y_pred.append(1)
#         else:
#             y_pred.append(0)

        

In [121]:
# from sklearn import metrics
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred))
# print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred))

# print('EM:', metrics.accuracy_score(y, y_pred))
# print('F-measure: ', metrics.f1_score(y, y_pred))

In [122]:
# for conv in convs:
#     for ut in conv:
#         if sentence2score(ut.utt, lm, N=3) < -5.5:
#             # print(ut.utt)
#             pass
#         else:
#             # print(ut.utt)
#             pass

In [123]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true=y, y_pred=y_pred)
sns.heatmap(cm, square=True, cbar=True, annot=True, cmap='Blues')
plt.savefig('sklearn_confusion_matrix.png')

NameError: name 'y' is not defined

In [None]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 951263 ngrams>
