In [2]:
import os


import pprint
import random
import numpy as np
import pandas as pd

import spacy
import sys
sys.path.append("../")
from datatools.analyzer import *
# from datatools.analyzer import clean_text
from error_tools import *

In [3]:
from nltk.lm        import Vocabulary
# from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated
from nltk.util      import ngrams

In [4]:
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
error_types = ['Unclear intention', 'Wrong information',
 'Ignore question', 'Topic transition error', 
 'Lack of information', 'Repetition', 
 'Contradiction', 'Self-contradiction',
  'Lack of common sense', 'Semantic error',
   'Grammatical error', 'Ignore proposal', 
   'Ignore offer', 'Lack of sociality', 
   'Uninterpretable', 'Ignore greeting', 
   'No-Err']


In [5]:
convs = read_conv(path, datalist)

In [6]:
conv_utt = []
for conv in convs:
    for i, ut in enumerate(conv):
        if not ut.is_exist_error():
            conv_utt.append(clean_text(ut.utt))
            

In [7]:
from tqdm import tqdm
def extract_utt_nucc(path):
    files = os.listdir(path)
    nucc_convs = []
    for filename in tqdm(files):
        if ".json" not in filename:
            continue
        # name = filename.split(".")[0]
        with open(path+filename, "r") as f:
            data  = json.load(f)
            for conv in data["turns"]:
                utt = conv["utterance"]
                if len(nlp(utt)) < 2:
                    # print(utt)
                    continue
                nucc_convs.append(clean_text(utt))
    return nucc_convs

In [8]:
# nuccデータ
nucc_path = "../../corpus/nucc/conv2/"
nucc_convs = extract_utt_nucc(nucc_path)

100%|██████████| 91/91 [08:48<00:00,  5.81s/it]


In [9]:
# mode phrase

# phrase_data = "../../corpus/gogakuru/phrases.csv"
# df = pd.read_csv(phrase_data)
# corpus = list( df["phrase"].values ) + nucc_convs 


In [10]:
def load_utt_ntt():
    ntt_path = "../../corpus/NTT/"

    # with open(ntt_path+"empathetic.json", "r") as f:
    #     empathetic = json.load(f)
    # with open(ntt_path+"persona.json", "r") as f:
    #     persona = json.load(f)

    utt_list = []

    for file_ in os.listdir(ntt_path):
        if not "json" in file_:
            continue 
        with open(ntt_path+file_, "r",  encoding="utf-8") as f:
            convs = json.load(f)
            for did in convs["convs"]:
                dids = list( did.keys() )[0]
                conv = did[dids]
                # conv = did[dids][3::3]
                utt_list.extend( [ clean_text(utt)  for utt in conv])
    
    print(len(utt_list))
    return utt_list


In [11]:
ntt_utt = load_utt_ntt()

141777


In [12]:
corpus = conv_utt + nucc_convs + ntt_utt

In [13]:
print("corpus:{0}, conv:{1}, nucc:{2}, ntt:{3}".format(len(corpus), len(conv_utt), len(nucc_convs), len(ntt_utt)))

corpus:188013, conv:2851, nucc:43385, ntt:141777


In [14]:
# filled_normal = fill_SYMBOL( sentence2normalize_nv(corpus) )
# filled_normal = fill_SYMBOL(sentence2normalize_independent(corpus) )
filled_normal = fill_SYMBOL( sentence2normalize_noun(corpus) )
# filled_normal  = fill_SYMBOL( sentence2morpheme(corpus) )

In [15]:
def create_language_model(sentences, N):
    vocab = Vocabulary([word for sent in sentences for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sentences]
    lm = KneserNeyInterpolated(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm

In [16]:
n=4
lm = create_language_model(filled_normal, N=n)

In [17]:
# prob_list = []
# for word in lm.context_counts(lm.vocab.lookup(context)): # 文脈に続く単語一覧の取得
#     prob_list.append((word, lm.score(word, context))) # 単語のその出現する確率を格納

# prob_list.sort(key=lambda x: x[1], reverse=True) # 出現確率順にソート
# for word, prob in prob_list:
#     print('\t{:s}: {:f}'.format(word, prob))

In [18]:
import math
import pprint
def sentence2score(sentence, l, N):
    # filled = fill_SYMBOL( sentence2normalize_nv(sentence) )
    filled = fill_SYMBOL( sentence2normalize_noun(sentence) )
    # filled = fill_SYMBOL( sentence2morpheme(sentence) )
    # filled = fill_SYMBOL( sentence2normalize_independent(sentence) )
    filled_pos = fill_SYMBOL( sentence2pos(sentence) )
    print(filled)
    print(filled_pos)
    ngram_text = []
    ngram_pos = []

    function_score = 0
    # デフォルトで1
    function_num = 1

    for L, P in zip(filled,filled_pos):
        for i in range(len(L)-N+1):
            # print(L[i:i+N])
            ngram_text.append(L[i:i+N])
            ngram_pos.append(P[i:i+N])
    # pprint.pprint(ngram_text)
    all_score = 0
    for ngram, pgram in zip(ngram_text, ngram_pos):
        context = (ngram[:-1])
        context_pos = pgram[:-1]
        # print(context)
        # for word in lm.context_counts(lm.vocab.lookup(context)): # 文脈に続く単語一覧の取得
            
        score = lm.score(ngram[-1], context) + 1e-10
        
        log_score = math.log2(score)
        if "助動詞" in context_pos[1] or "助詞" in context_pos[1] or "助動詞" in context_pos[0] or "助詞" in context_pos[0]:
            print("\tcontext : {0}| ->".format(context), log_score)
            function_score += log_score
            function_num += 1
    print(all_score/len(ngram_text))
    # return function_score/function_num
    

In [19]:

# n=4
sentence = "最近とても暑いですから。"
sentence = "ご存知ですいます"
# sentence = "はい、そうですよ"
sentence2score(sentence, lm, N=n)

[['FOS', 'FOS', '名詞-普通名詞-一般', 'です', 'い', 'ます', 'EOS', 'EOS']]
[['FOS', 'FOS', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '助動詞', 'EOS', 'EOS']]
	context : ['名詞-普通名詞-一般', 'です', 'い']| -> -0.15199125175478748
	context : ['です', 'い', 'ます']| -> -17.046528996520383
	context : ['い', 'ます', 'EOS']| -> -0.0009431763729459997
0.0


In [20]:
sentence = "はい、そうですよ。"
sentence2score(sentence, lm, N=n)

[['FOS', 'FOS', 'はい', '、', 'そう', 'です', 'よ', '。', 'EOS', 'EOS']]
[['FOS', 'FOS', '感動詞-一般', '補助記号-読点', '副詞', '助動詞', '助詞-終助詞', '補助記号-句点', 'EOS', 'EOS']]
	context : ['そう', 'です', 'よ']| -> -3.4860704153263287
	context : ['です', 'よ', '。']| -> -0.015371320551957559
	context : ['よ', '。', 'EOS']| -> -5.082264860742065e-06
0.0


In [21]:
filled = fill_SYMBOL( sentence2normalize_noun(sentence) )

In [22]:
ngram_text = []
for L in filled:
    for i in range(len(L)-n+1):
            # print(L[i:i+N])
        ngram_text.append(L[i:i+n])
print(ngram_text)

[['FOS', 'FOS', 'はい', '、'], ['FOS', 'はい', '、', 'そう'], ['はい', '、', 'そう', 'です'], ['、', 'そう', 'です', 'よ'], ['そう', 'です', 'よ', '。'], ['です', 'よ', '。', 'EOS'], ['よ', '。', 'EOS', 'EOS']]


In [23]:
lm.perplexity(ngram_text)

3.9443869758379804

In [24]:
from datatools.maneger import DataManager

In [25]:
modelM = DataManager("../models/utterance/")
# model_name = "KLM_phrase_n={0}.pickle".format(n)
# model_name = "KLM_phrase_nucc_n={0}.pickle".format(n)
# model_name = "KLM_phrase_nucc_n={0}_orth.pickle".format(n)
model_name = "KLM_phrase_nucc_n={0}_noun2.pickle".format(n)

In [26]:

modelM.save_data(model_name, lm)

success save : ../models/utterance/KLM_phrase_nucc_n=4_noun2.pickle


In [27]:
lm = modelM.load_data(model_name)

success load : ../models/utterance/KLM_phrase_nucc_n=4_noun2.pickle


In [28]:
# errors = ['Grammatical error', "Uninterpretable"]

# y = []
# n=3
# y_pred = []
# for conv in convs:
#     for ut in conv:
#         if not ut.is_system():
#             continue
#         # エラーなら1
#         if ut.is_error_included(errors):
#             # print(ut)
#             y.append(1)
#         else:
#             y.append(0)
#         #LM 判定
#         # エラーなら1
#         if sentence2score(ut.utt, lm, N=n) < -5.6:
#             y_pred.append(1)
#         else:
#             y_pred.append(0)

        

In [29]:
# from sklearn import metrics
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred))
# print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred))

# print('EM:', metrics.accuracy_score(y, y_pred))
# print('F-measure: ', metrics.f1_score(y, y_pred))

In [30]:
# for conv in convs:
#     for ut in conv:
#         if sentence2score(ut.utt, lm, N=3) < -5.5:
#             # print(ut.utt)
#             pass
#         else:
#             # print(ut.utt)
#             pass

In [31]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true=y, y_pred=y_pred)
sns.heatmap(cm, square=True, cbar=True, annot=True, cmap='Blues')
plt.savefig('sklearn_confusion_matrix.png')

NameError: name 'y' is not defined

In [None]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 951263 ngrams>
