In [None]:
import pandas as pd
import MeCab
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Create the MeCab tagger object used by the tokenizer; the "-d" option points
# MeCab at the mecab-ipadic-neologd dictionary directory.
tagger = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
# Regular expression that matches strings consisting only of hiragana.
kana_re = re.compile("^[ぁ-ゖ]+$")


def mecab_tokenizer(text):
    """Tokenize Japanese text with MeCab and return the kept tokens joined by spaces.

    Each morpheme is replaced by its base form (feature field 6) when the
    dictionary provides one, otherwise its surface form is used. Only nouns,
    verbs and adjectives are kept; tokens are lower-cased, and tokens that are
    written purely in hiragana or that contain a digit are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A single string with the surviving tokens separated by one space
        (an empty string when nothing survives).
    """
    # Parts of speech to keep: noun, verb, adjective.
    target_pos = ("名詞", "動詞", "形容詞")

    token_list = []
    for line in tagger.parse(text).split("\n"):
        # Morpheme lines look like "<surface>\t<comma separated features>".
        # The trailing "EOS" marker and blank lines carry no tab and are skipped.
        surface, tab, feature = line.partition("\t")
        if not tab:
            continue

        fields = feature.split(",")
        if fields[0] not in target_pos:
            continue

        # Fall back to the surface form when the base form is missing or "*".
        base = fields[6] if len(fields) > 6 else "*"
        token = (base if base != "*" else surface).lower()

        # Drop hiragana-only tokens.
        if kana_re.match(token):
            continue
        # Drop tokens containing a digit anywhere (search, not just a leading digit).
        if re.search(r"\d", token):
            continue

        token_list.append(token)

    return " ".join(token_list)


In [None]:
from scipy.spatial import distance
import MeCab
import numpy as np

def calc_similarity(topic1, topic2, n_components=5, random_state=None):
    """Compare two groups of utterances through their LDA topic distributions.

    A bag-of-words vocabulary and an LDA model are fitted on every utterance of
    both groups. Each group is then joined into a single document, projected
    onto the LDA topics, and the two resulting distributions are compared with
    cosine similarity.

    Args:
        topic1: List of utterance strings forming the first group.
        topic2: List of utterance strings forming the second group.
        n_components: Number of LDA topics.
        random_state: Optional seed forwarded to LatentDirichletAllocation.
            The default (None) keeps the previous unseeded behaviour; pass an
            int to make the result reproducible.

    Returns:
        The cosine similarity (1 - cosine distance) of the two topic
        distributions.

    Raises:
        ValueError: Propagated from scikit-learn if fitting fails, e.g. when
            no token survives tokenization and the vocabulary is empty.
    """
    # Convert text to bag-of-words counts; the token pattern also keeps
    # single-character tokens.
    tf_vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
    lda = LatentDirichletAllocation(
        n_components=n_components,
        random_state=random_state,
    )

    # Fit the vocabulary and the LDA model on the individual utterances.
    tokenized_utterances = [mecab_tokenizer(utt) for utt in topic1 + topic2]
    lda.fit(tf_vectorizer.fit_transform(tokenized_utterances))

    def topic_distribution(utterances):
        # Treat the whole group as one document.
        document = mecab_tokenizer(' '.join(utterances))
        doc_topics = lda.transform(tf_vectorizer.transform([document]))
        # transform() returns one row per document; distance.cosine() expects
        # 1-D vectors, so hand back the single row rather than the 2-D array.
        return doc_topics[0]

    return 1 - distance.cosine(topic_distribution(topic1), topic_distribution(topic2))

In [None]:
# Demo: a question about going to concerts vs. a reply that mentions concerts and movies.
calc_similarity(['コンサートとかには行きますか？'], ['コンサートは行かないですけど、映画とかは好きですね'])

In [None]:
# Demo: the same concert question vs. a two-utterance group (the reply plus a follow-up question about a film).
calc_similarity(['コンサートとかには行きますか？'], ['コンサートは行かないですけど、映画とかは好きですね', '映画といえば、閃光のハサウェイは見ましたか？'])

In [None]:
# Demo: the concert question vs. an utterance on a different subject (Oda Nobunaga).
calc_similarity(['コンサートとかには行きますか？'], ['織田信長の野望'])

In [None]:
# Demo: two utterances that both concern Sengoku-era warlords (Toyotomi Hideyoshi / Oda Nobunaga).
calc_similarity(['豊臣秀吉の狙いはなんだったのか？'], ['織田信長の野望'])

In [None]:
def topic_generator(text_gen, threshold=0.3, max_utterances=50, min_utterances=10, delay=3, n_components=5):
    """Split a stream of utterances into consecutive topic segments.

    Utterances are accumulated in a buffer. Once the buffer holds at least
    ``min_utterances + delay`` items, the buffer minus its last ``delay``
    utterances is compared (via ``calc_similarity``) with those last ``delay``
    utterances plus the incoming one. The older part is emitted as a finished
    topic when the similarity drops below ``threshold`` or when the buffer has
    reached ``max_utterances``; the delayed tail and the incoming utterance
    then seed the next topic.

    Args:
        text_gen: Iterable (generator, list, ...) of utterance strings.
        threshold: Similarity below which a topic boundary is declared.
        max_utterances: Buffer size at which a boundary is forced.
        min_utterances: Minimum number of utterances in an emitted topic
            (the final, flushed topic may be shorter).
        delay: Number of most recent utterances kept back from the emitted
            topic and carried over to the next one. Expected to be >= 0.
        n_components: Number of LDA topics passed to ``calc_similarity``.

    Yields:
        Lists of utterances, one list per detected topic. Whatever remains in
        the buffer when the input is exhausted is yielded as the last topic.
    """
    text_iter = iter(text_gen)
    try:
        cur_topic = [next(text_iter)]
    except StopIteration:
        # Empty input: finish quietly. Letting StopIteration escape a
        # generator body would be turned into RuntimeError (PEP 479).
        return

    for text in text_iter:
        # Explicit split index so that delay=0 works ("[:-0]" would be empty)
        # and buffers shorter than `delay` end up entirely in the tail.
        split_at = max(len(cur_topic) - delay, 0)
        head, tail = cur_topic[:split_at], cur_topic[split_at:]

        # Check the cheap length conditions first; the similarity requires
        # fitting an LDA model and is only computed when it can matter.
        long_enough = len(cur_topic) >= min_utterances + delay
        if long_enough and (
            len(cur_topic) >= max_utterances
            or calc_similarity(head, tail + [text], n_components=n_components) < threshold
        ):
            yield head
            cur_topic = tail + [text]
        else:
            cur_topic.append(text)

    # Flush the last segment so the trailing utterances are not lost.
    if cur_topic:
        yield cur_topic

In [None]:
import jsonlines
import unicodedata
import re

def transcript_generator(path):
    """Lazily yield the preprocessed transcript of each record in a JSON Lines file.

    Args:
        path: Location of the JSON Lines file; every record must provide a
            'transcript' entry.

    Yields:
        The result of ``preprocess_transcript`` for each record, in file order.
    """
    with jsonlines.open(path) as records:
        yield from (
            preprocess_transcript(record['transcript'])
            for record in records
        )

def preprocess_transcript(transcript):
    """Normalize a raw transcript and strip parenthesized annotations.

    The text is NFKC-normalized (so full-width parentheses and bars become
    their ASCII forms), every '|' is replaced by a space, and each
    parenthesized span such as "(laughs)" is removed.

    Args:
        transcript: Raw transcript string.

    Returns:
        The cleaned transcript string.
    """
    transcript = unicodedata.normalize('NFKC', transcript)
    transcript = transcript.replace('|', ' ')

    # Remove each annotation on its own. A greedy r'\(.+\)' would swallow all
    # speech between the first "(" and the last ")" of a line. Innermost spans
    # are removed first and the pass repeats until stable, so nested
    # annotations disappear as well. Empty "()" is left untouched.
    previous = None
    while previous != transcript:
        previous = transcript
        transcript = re.sub(r'\([^()\n]+\)', '', transcript)
    return transcript

In [None]:
# Build the topic segmentation pipeline for one transcript file. Both
# functions are generators, so nothing is read or computed until the loop
# below starts consuming `topic_gen`. threshold=0.2 overrides the function
# default of 0.3; the other arguments restate the defaults explicitly.
topic_gen = topic_generator(
    transcript_generator('data/golden_transcripts/1911F2002.jsonl'),
    threshold=0.2,
    max_utterances=50,
    min_utterances=10,
    delay=3,
    n_components=5
)

# Print every detected topic (a list of utterances), each followed by a blank line.
for topic in topic_gen:
    print(topic, '\n')
    