# トピック分析


In [1]:
import os
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm
from book import *
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.models.ldamodel import CoherenceModel
from labMTsimple.storyLab import *
from collections import defaultdict


In [2]:
DATA = 'data/'
DIST = 'dist/'
TMP = 'tmp/'
TEXT = 'text/'
PICKLE = 'pickle/'
MODEL = 'model/'

SRC = 'sources.txt'
TEST = 'sources-test.txt'
MIN_TEXT = 'texts-min.txt'

NUM_TOPICS = 16


In [3]:
sns.set(style='darkgrid')


In [4]:
labMT, labMTvector, labMTwordlist = emotionFileReader(returnVector=True)


In [5]:
def load_array(file, delim=None, cnt=None):
    with open(file, 'r') as f:
        if cnt:
            return [[elem.strip() for elem in line.split(delim, cnt)]
                    for line in f.readlines()]
        else:
            return [[elem.strip() for elem in line.split(delim)]
                    for line in f.readlines()]


def fprint_array(file_name, array):
    with open(file_name, 'w') as f:
        for vector in array:
            for elem in vector:
                f.write('{} '.format(elem))
            f.write('\n')


In [6]:
infos = load_array(DATA + SRC, ',', 2)
pprint(infos[:10])


[['12', '12-cleaned.txt', 'Through the Looking-Glass'],
 ['16', '16-cleaned.txt', 'Peter Pan'],
 ['20', '20-cleaned.txt', 'Paradise Lost'],
 ['21', '21-cleaned.txt', 'Aesop’s Fables'],
 ['24', '24-cleaned.txt', 'O Pioneers!'],
 ['32', '32-cleaned.txt', 'Herland'],
 ['35', '35-cleaned.txt', 'The Time Machine'],
 ['36', '36-cleaned.txt', 'The War of the Worlds'],
 ['41', '41-cleaned.txt', 'The Legend of Sleepy Hollow'],
 ['45', '45-cleaned.txt', 'Anne of Green Gables']]


In [7]:
def calc_happiness(window):
    return emotion(window, labMT)


In [8]:
books = {}
for book_id, file_name, title in tqdm(infos):
    FILE = TMP + PICKLE + '{}.pickle'.format(book_id)
    book = Book(book_id, title, file_name)
    if not os.path.exists(FILE):
        try:
            book.load()
            book.windowed()
            book.calc_happiness(calc_happiness)
            books[int(book_id)] = book
            with open(FILE, 'wb') as f:
                pickle.dump(book, f)
        except BookLoadingException as err:
            pass
    else:
        with open(FILE, 'rb') as f:
            books[int(book_id)] = pickle.load(f)


100%|██████████| 1254/1254 [08:05<00:00,  2.58it/s]


In [9]:
print(len(books))


1159


In [10]:
def get_min_window(book):
    happinesses = book.happinesses()
    i = happinesses.index(min(happinesses))
    local_array = []
    local_array.append(i)
    for word in book.windows()[i].split():
        local_array.append(word)
    return local_array


In [11]:
MIN_TEXT_PATH = TMP + MIN_TEXT
if not os.path.exists(MIN_TEXT_PATH):
    min_windows = [get_min_window(books[book_id]) for book_id in books]
    fprint_array(MIN_TEXT_PATH, min_windows)
else:
    min_windows = load_array(MIN_TEXT_PATH)
min_windows = [array[1:] for array in min_windows]


In [12]:
print(len(min_windows))
print(len(min_windows[0]))


1159
10000


In [13]:
DICTIONARY_PATH = TMP + 'dictionary.dict'
if os.path.exists(DICTIONARY_PATH):
    dictionary = Dictionary.load(DICTIONARY_PATH)
else:
    dictionary = Dictionary(min_windows)
    dictionary.filter_extremes(no_below=2, no_above=0.75)
    dictionary.save(DICTIONARY_PATH)


In [14]:
MODEL_FOLDER = TMP + MODEL
if not os.path.exists(MODEL_FOLDER):
    os.makedirs(MODEL_FOLDER)

corpus = [dictionary.doc2bow(text) for text in min_windows]
models = {}

for topic_num in tqdm(range(1, NUM_TOPICS)):
    MODEL_PATH = MODEL_FOLDER + 'topic-{}.model'.format(topic_num)
    if not os.path.exists(MODEL_PATH):
        lda_model = LdaModel(corpus=corpus, num_topics=topic_num, id2word=dictionary)
        lda_model.save(MODEL_PATH)
    else:
        lda_model = LdaModel.load(MODEL_PATH)
    models[topic_num] = lda_model


100%|██████████| 15/15 [03:46<00:00, 15.08s/it]


## テストデータで分析


In [15]:
TEST_PATH = DATA + TEST
test_infos = load_array(TEST_PATH, ',', 2)
pprint(test_infos[:10])


[['37332', '37332-cleaned.txt', 'A Little Princess'],
 ['43', '43-cleaned.txt', 'The Strange Case Of Dr. Jekyll And Mr. Hyde'],
 ['11', '11-cleaned.txt', 'Alice’s Adventures in Wonderland'],
 ['98', '98-cleaned.txt', 'A Tale of Two Cities'],
 ['1974', '1974-cleaned.txt', 'Poetics']]


In [16]:
test_books = {}
for book_id, file_name, title in test_infos:
    FILE = TMP + PICKLE + '{}.pickle'.format(book_id)
    if not os.path.exists(FILE):
        try:
            book = Book(book_id, title, file_name)
            book.load()
            book.windowed()
            book.calc_happiness(calc_happiness)
            test_books[int(book_id)] = book
            with open(FILE, 'wb') as f:
                pickle.dump(book, f)
        except BookLoadingException as err:
            pass
            
    else:
        with open(FILE, 'rb') as f:
            test_books[int(book_id)] = pickle.load(f)
print(len(test_books))


5


In [17]:
test_windows = []
labels = []
for book_id in test_books:
    book = test_books[book_id]
    window = get_min_window(book)[1:]
    test_windows.append(window)
    labels.append(book.title())
test_dictionary = Dictionary(test_windows)
test_dictionary.filter_extremes(no_below=2, no_above=0.75)
test_corpus = [test_dictionary.doc2bow(text) for text in test_windows]


In [18]:
for max_topic in range(1, NUM_TOPICS):
    model = models[max_topic]
    score_by_topic = defaultdict(int)
    topic_array = [[0.0 for j in range(max_topic)] for i in range(len(labels))]
    for index, unseen_doc in enumerate(test_corpus):
        for topic, score in model[unseen_doc]:
            topic_array[index][int(topic)] = float(score)
    df = pd.DataFrame(topic_array)
    df.to_excel(DIST + '{}.xlsx'.format(max_topic))


## 評価


### Perplexityによる評価


In [19]:
perplexities = []
for max_topic in range(1, NUM_TOPICS):
    model = models[max_topic]
    perwordbound = model.log_perplexity(test_corpus)
    perplexities.append([max_topic, np.exp2(-perwordbound)])
fprint_array(DIST + 'perplexity.txt', perplexities)


### Coherenceによる評価


In [20]:
coherences = []
for i in range(1, NUM_TOPICS):
    lda = models[i]

    cm = CoherenceModel(model=lda, 
                        texts=test_windows, 
                        dictionary=test_dictionary, 
                        coherence='c_v')
    coherence = cm.get_coherence()
    coherences.append([i, coherence])
fprint_array(DIST + 'coherence.txt', coherences)


KeyError: 1567

### 可視化する


In [None]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.plot([a[1] for a in coherences], color='r')
ax2.plot([perplexity[1] for perplexity in perplexities], color='b')

plt.show()
