# 物語の感情曲線の抽出とその極値に着目したトピックの分析


In [None]:
import os
import pickle
import pyLDAvis.gensim
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pprint import pprint
from wordcloud import WordCloud
from tqdm import *
from labMTsimple.storyLab import *
from book import *
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.decomposition import PCA
from mvpa2.suite import SimpleSOMMapper
from pyvis.network import Network
from collections import defaultdict

sns.set(style='darkgrid')


In [11]:
# Path fragments. Directory names end with '/' because paths are built by
# plain string concatenation (e.g. TMP + PICKLE + '<id>.pickle').
DIST = 'dist/'  # not referenced in the visible cells
DATA = 'data/'  # folder that holds the book list (DATA + SRC)
TMP = 'tmp/'  # root of every on-disk cache written by this notebook
PICKLE = 'pickle/'  # TMP + PICKLE: pickled Book objects
CLUSTERS = 'clusters/'  # TMP + CLUSTERS: clusters.txt / texts.txt
TF_IDF = 'tf_idf/'  # TMP + TF_IDF: per-book TF-IDF CSV files
TF = 'tf/'  # TMP + TF_IDF + TF: per-window term counts
IDF = 'idf/'  # TMP + TF_IDF + IDF: per-window document frequencies
SRC = 'sources-mini.txt'  # book list actually read below
TEST = 'sources-test.txt'  # not referenced in the visible cells
WORDS_IN_WINDOW = 'words-window.txt'  # TMP + WORDS_IN_WINDOW: cached word lists
WORD_DICT = 'word-dict.dict'  # not referenced in the visible cells
WORDS_CORPUS = 'words-corpus.mm'  # not referenced in the visible cells
# Fraction of the ranked vocabulary cut from each end by remove_general_words.
EXCLUDE_RATE = 0.1
NUM_TOPICS = 500  # not referenced in the visible cells
# labMT is the lexicon passed to emotion() in calc_happiness; the other two
# return values are not used in the visible cells.
labMT, labMTvector, labMTwordList = emotionFileReader(returnVector=True)


In [12]:
# Read the book list (DATA + SRC): one "book_id,path,title" record per line.
infos = []
books = []
with open(DATA + SRC, 'r') as source_file:
    for line in source_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the three-way unpacking below.
            continue
        # maxsplit=2 keeps commas that belong to the title intact.
        book_id, path, title = line.split(',', 2)
        # Stored as [book_id, title, path]; the order differs from the file.
        infos.append([book_id, title, path])
# Preview the first few records.
for info in infos[:5]:
    print(info)


['15718', 'How To Write Special Feature Articles', '15718-cleaned.txt']
['16751', "McGuffey's Sixth Eclectic Reader", '16751-cleaned.txt']
['42474', '1000 Mythological Characters Briefly Described', '42474-cleaned.txt']
['51079', 'Ned, Bob and Jerry at Boxwood Hall', '51079-cleaned.txt']
['28617', 'Astounding Stories of Super-Science February 1930', '28617-cleaned.txt']


In [13]:
def calc_happiness(window):
    """Score one text window against the module-level ``labMT`` lexicon.

    Thin one-argument adapter around ``emotion`` so that it can be handed
    to ``Book.calc_happiness`` as a callback.

    :param window: text of a single window.
    :return: whatever ``emotion(window, labMT)`` returns (presumably the
        window's happiness score -- confirm against labMTsimple).
    """
    score = emotion(window, labMT)
    return score


In [14]:
# Fill `books`: reuse the pickled Book when one is cached, otherwise load the
# text, split it into windows, score every window and cache the result.
pickle_dir = TMP + PICKLE
if not os.path.exists(pickle_dir):
    # Without this the first run fails when the cache file is opened for
    # writing, because nothing else creates tmp/pickle/.
    os.makedirs(pickle_dir)

progress = tqdm(infos)
for info in progress:
    PICKLE_FILE = pickle_dir + info[0] + '.pickle'
    if os.path.exists(PICKLE_FILE):
        # NOTE(review): pickle.load can execute arbitrary code; only keep
        # cache files in this folder that were produced by this notebook.
        with open(PICKLE_FILE, 'rb') as cache_in:
            books.append(pickle.load(cache_in))
        continue

    book = Book(info[0], info[1], info[2])
    try:
        book.load()
        book.windowed()
        book.calc_happiness(calc_happiness)
    except BookLoadingException:
        # The book is skipped. The message reads "<title> does not exceed
        # 12000 characters"; it stays in the progress bar description until
        # the next skipped book replaces it.
        progress.set_description('{}は12000字を超えていません'.format(book.title()))
    else:
        # Only cache and keep books that were processed completely.
        with open(PICKLE_FILE, 'wb') as cache_out:
            pickle.dump(book, cache_out)
        books.append(book)




  0%|          | 0/100 [00:00<?, ?it/s][A[A

  1%|          | 1/100 [00:00<00:32,  3.03it/s][A[A

  2%|▏         | 2/100 [00:00<00:31,  3.09it/s][A[A

  3%|▎         | 3/100 [00:00<00:29,  3.32it/s][A[A

  4%|▍         | 4/100 [00:01<00:26,  3.67it/s][A[A

  5%|▌         | 5/100 [00:01<00:25,  3.73it/s][A[A

  6%|▌         | 6/100 [00:01<00:26,  3.58it/s][A[A

  7%|▋         | 7/100 [00:01<00:25,  3.62it/s][A[A

  8%|▊         | 8/100 [00:02<00:25,  3.62it/s][A[A

  9%|▉         | 9/100 [00:02<00:24,  3.74it/s][A[A

The Adventure of the Cardboard Boxは12000字を超えていません:   9%|▉         | 9/100 [00:02<00:24,  3.74it/s][A[A

The Adventure of the Cardboard Boxは12000字を超えていません:  11%|█         | 11/100 [00:02<00:21,  4.12it/s][A[A

The Adventure of the Cardboard Boxは12000字を超えていません:  12%|█▏        | 12/100 [00:03<00:22,  3.85it/s][A[A

The Adventure of the Cardboard Boxは12000字を超えていません:  13%|█▎        | 13/100 [00:03<00:20,  4.15it/s][A[A

Alice in Wonderlandは12000字を超え

McGuffey's First Eclectic Reader, Revised Editionは12000字を超えていません:  95%|█████████▌| 95/100 [00:25<00:01,  3.45it/s][A[A

McGuffey's First Eclectic Reader, Revised Editionは12000字を超えていません:  97%|█████████▋| 97/100 [00:25<00:00,  4.03it/s][A[A

McGuffey's First Eclectic Reader, Revised Editionは12000字を超えていません:  98%|█████████▊| 98/100 [00:26<00:00,  3.63it/s][A[A

McGuffey's First Eclectic Reader, Revised Editionは12000字を超えていません:  99%|█████████▉| 99/100 [00:26<00:00,  3.71it/s][A[A

McGuffey's First Eclectic Reader, Revised Editionは12000字を超えていません: 100%|██████████| 100/100 [00:26<00:00,  3.75it/s][A[A


In [28]:
def get_min_window(book):
    """Locate the window with the lowest happiness score.

    :param book: object exposing ``happinesses()`` (sequence of scores) and
        ``windows()`` (sequence of window texts, same indexing).
    :return: ``(index, window)`` for the first occurrence of the minimum.
    :raises ValueError: if ``happinesses()`` is empty.
    """
    scores = book.happinesses()
    # min() keeps the first of several equal candidates, which matches
    # looking the minimum value up with list.index().
    lowest = min(range(len(scores)), key=scores.__getitem__)
    return lowest, book.windows()[lowest]


def get_max_window(book):
    """Locate the window with the highest happiness score.

    :param book: object exposing ``happinesses()`` (sequence of scores) and
        ``windows()`` (sequence of window texts, same indexing).
    :return: ``(index, window)`` for the first occurrence of the maximum.
    :raises ValueError: if ``happinesses()`` is empty.
    """
    scores = book.happinesses()
    # max() keeps the first of several equal candidates, which matches
    # looking the maximum value up with list.index().
    highest, _ = max(enumerate(scores), key=lambda pair: pair[1])
    return highest, book.windows()[highest]


def tf(book, index):
    """Return the raw term counts of one window, cached on disk.

    The counts are stored as "<word> <count>" lines in
    ``TMP + TF_IDF + TF + '<book_id>/<index>.txt'``. When that file exists
    it is parsed and returned; otherwise the counts are computed from the
    window text and written there.

    :param book: object exposing ``book_id()`` and ``windows()``.
    :param index: position of the window inside ``book.windows()``.
    :return: mapping of word to number of occurrences in the window
        (a ``defaultdict`` when freshly computed, a plain ``dict`` when
        loaded from the cache).
    """
    FOLDER = TMP + TF_IDF + TF + '{}/'.format(book.book_id())
    if not os.path.exists(FOLDER):
        os.makedirs(FOLDER)
    PATH = FOLDER + '{}.txt'.format(index)

    if os.path.exists(PATH):
        # Cache hit: the window itself does not need to be tokenised.
        word_freq = {}
        with open(PATH, 'r') as cache:
            for line in cache:
                word, freq = line.split()
                word_freq[word] = int(freq)
        return word_freq

    # Single pass over the tokens instead of one list.count() scan per
    # distinct word.
    word_freq = defaultdict(int)
    for word in book.windows()[index].split():
        word_freq[word] += 1
    with open(PATH, 'w') as cache:
        for word, freq in word_freq.items():
            cache.write('{} {}\n'.format(word, freq))
    return word_freq


def idf(book, index):
    """Return, for each word of one window, how many windows contain it.

    Despite the name, the stored value is the document frequency (a count
    of windows of the same book that contain the word), not an inverse.
    Results are cached as "<word> <count>" lines in
    ``TMP + TF_IDF + IDF + '<book_id>/<index>.txt'``.

    :param book: object exposing ``book_id()`` and ``windows()``.
    :param index: position of the window inside ``book.windows()``.
    :return: mapping of word to number of windows containing it
        (a ``defaultdict`` when freshly computed, a plain ``dict`` when
        loaded from the cache).
    """
    FOLDER = TMP + TF_IDF + IDF + '{}/'.format(book.book_id())
    if not os.path.exists(FOLDER):
        os.makedirs(FOLDER)
    PATH = FOLDER + '{}.txt'.format(index)

    if os.path.exists(PATH):
        word_freq = {}
        with open(PATH, 'r') as cache:
            for line in cache:
                word, freq = line.split()
                word_freq[word] = int(freq)
        return word_freq

    windows = book.windows()
    # Tokenise every window once; the original re-split all windows for
    # every single word of the target window.
    vocabularies = [set(text.split()) for text in windows]
    word_freq = defaultdict(int)
    for word in set(windows[index].split()):
        word_freq[word] = sum(1 for vocab in vocabularies if word in vocab)
    with open(PATH, 'w') as cache:
        for word, freq in word_freq.items():
            cache.write('{} {}\n'.format(word, freq))
    return word_freq


def vectorize(book, index):
    """Return the TF-IDF row of one window as a one-row DataFrame.

    The TF-IDF matrix is fitted over all windows of ``book`` (one document
    per window) and cached as ``TMP + TF_IDF + '<book_id>.csv'``.

    :param book: object exposing ``book_id()`` and ``windows()``.
    :param index: position of the window inside ``book.windows()``.
    :return: ``DataFrame`` slice ``[index:index+1]`` whose columns are the
        vocabulary terms.
    """
    FOLDER = TMP + TF_IDF
    VECTOR_CSV = FOLDER + '{}.csv'.format(book.book_id())
    if os.path.exists(VECTOR_CSV):
        # The cache is written with the row index as its first, unnamed
        # column. Reading it back as the index keeps the cached result
        # identical to a fresh one instead of adding an 'Unnamed: 0' column.
        vectors = pd.read_csv(VECTOR_CSV, index_col=0)
    else:
        corpus = book.windows()
        vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
        X = vectorizer.fit_transform(corpus)
        # get_feature_names() was removed from recent scikit-learn releases;
        # use its replacement when available and fall back otherwise.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()
        vectors = pd.DataFrame(X.toarray(), columns=feature_names)
        # Do not rely on tf()/idf() having created the folder beforehand.
        if not os.path.exists(FOLDER):
            os.makedirs(FOLDER)
        vectors.to_csv(VECTOR_CSV)
    return vectors[index:index+1]


def tf_idf(book, index):
    """Return the TF-IDF row for one window of ``book``.

    ``tf`` and ``idf`` are still invoked, but only for their side effect of
    creating the cache folders and files; their return values are not used.
    The value handed back is the result of ``vectorize``.

    :param book: object exposing ``book_id()`` and ``windows()``.
    :param index: position of the window inside ``book.windows()``.
    :return: the one-row ``DataFrame`` produced by ``vectorize``.
    """
    tf(book, index)
    idf(book, index)
    return vectorize(book, index)


def remove_general_words(window, vector, exclude_rate=None):
    """Drop the words found at either end of a ranked vocabulary.

    ``vector`` is a ranked sequence of ``(word, value)`` pairs. The first
    and the last ``int(len(vector) * exclude_rate)`` entries are treated as
    excluded; every other token of ``window`` (including tokens that do not
    appear in ``vector`` at all) is kept, in order and with repetitions.

    :param window: whitespace-separated text of one window.
    :param vector: sliceable sequence of ``(word, value)`` pairs.
    :param exclude_rate: fraction cut from each end; defaults to the
        module-level ``EXCLUDE_RATE``.
    :return: list of the remaining tokens.
    """
    if exclude_rate is None:
        exclude_rate = EXCLUDE_RATE
    cut = int(len(vector) * exclude_rate)

    excluded = set()
    if cut > 0:
        # Guarded because vector[-0:] is the WHOLE list, which used to remove
        # every word whenever the vocabulary was too small for the rate.
        excluded.update(word for word, _ in vector[:cut])
        excluded.update(word for word, _ in vector[len(vector) - cut:])
    return [word for word in window.split() if word not in excluded]


In [29]:
def get_experimental(book):
    """Return the filtered words of the least happy window of ``book``.

    Steps: pick the window with the minimum happiness score, take its
    TF-IDF row, keep the non-zero terms ranked by descending weight, and
    let ``remove_general_words`` drop the terms at both ends of that ranking.

    :param book: object accepted by ``get_min_window`` and ``tf_idf``.
    :return: list of the remaining tokens of the selected window.
    """
    index, window = get_min_window(book)
    # tf_idf returns a one-row DataFrame; take that row once instead of
    # calling float() on a one-element Series twice per column.
    row = tf_idf(book, index).iloc[0]
    weights = []
    for term, value in row.items():
        # 'Unnamed: 0' is the index column of a cached CSV, not a term.
        if term == 'Unnamed: 0':
            continue
        value = float(value)
        if value != 0.0:
            weights.append((term, value))
    # Stable sort: terms with equal weight keep their column order.
    weights.sort(key=lambda item: item[1], reverse=True)
    return remove_general_words(window, weights)


In [None]:
# One word list per book (from get_experimental), cached as one line per book
# with every word followed by a single space.
WORDS_IN_WINDOW_PATH = TMP + WORDS_IN_WINDOW
if os.path.exists(WORDS_IN_WINDOW_PATH):
    # Cache hit: each line becomes the list of its whitespace-separated words.
    with open(WORDS_IN_WINDOW_PATH, 'r') as handle:
        texts = [line.split() for line in tqdm(handle.readlines())]
else:
    texts = [get_experimental(book) for book in tqdm(books)]
    with open(WORDS_IN_WINDOW_PATH, 'w') as handle:
        for words in texts:
            handle.write(''.join(word + ' ' for word in words) + '\n')








  0%|          | 0/93 [00:00<?, ?it/s][A[A[A[A[A[A

In [None]:
# Turn each book's word list back into one space-separated string.
# NOTE(review): not idempotent -- running this cell a second time would join
# the individual characters of the already-joined strings.
texts = [' '.join(text) for text in texts]
print(len(texts))


## Self-Organizing MapおよびWard法による階層的クラスタリング


In [None]:
# TF-IDF over the per-book strings (one document per book).
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b')
# fit_transform accepts any iterable of strings, so the list is passed as is.
# This also removes the dependency on `np`, which this file never imports
# explicitly (it could only arrive through one of the star imports).
vectors = vectorizer.fit_transform(texts)
pprint(type(vectors.toarray()))


In [None]:
# Densify the TF-IDF matrix and project it with a PCA that keeps the default
# number of components; shapes are printed before and after for inspection.
X = vectors.toarray()
print(X.shape)
pca = PCA()
X = pca.fit_transform(X)
print(type(X))
print(X.shape)


In [None]:
# Book titles, in the order of `books`; used as leaf labels of the dendrogram.
labels = [book.title() for book in books]


In [None]:
# Train a 32 x 32 self-organizing map on the PCA-transformed matrix X.
n_rows = 32
n_cols = 32
# NOTE(review): 500 is the second positional argument -- presumably the number
# of training iterations; confirm against the PyMVPA SimpleSOMMapper signature.
som = SimpleSOMMapper((n_rows, n_cols), 500, learning_rate=0.05)
som.train(X)


In [None]:
# Draw every slice of the transposed SOM codebook (som.K.T).
# NOTE(review): all imshow() calls target the same axes and show() is called
# once after the loop, so only the last slice is likely to remain visible --
# confirm whether one figure per slice was intended.
for vector in som.K.T:
    plt.imshow(vector)
plt.show()


In [None]:
# Hierarchical clustering (Ward linkage, Euclidean metric) on the raw TF-IDF
# vectors -- note this uses `vectors`, not the PCA-transformed X.
df = pd.DataFrame()  # not used in the visible cells
Z = linkage(vectors.toarray(), method='ward', metric='euclidean')
# Cut-off at 60% of the largest value in column 2 of the linkage matrix
# (the merge distances in SciPy's linkage output).
threshold = 0.6*max(Z[:, 2])
dendrogram(Z, labels=labels, orientation='left', color_threshold=threshold)
plt.title('WARD METHOD')
plt.show()


In [None]:
# Inspect the first ten rows of the linkage matrix.
pprint(Z[:10])


In [None]:
# Flat cluster label per book, obtained by cutting the tree at `threshold`
# (the same value used to colour the dendrogram).
c = fcluster(Z, threshold, criterion='distance')
pprint(c)


In [None]:
# Group (book, text) pairs by cluster label and persist the assignment:
#   clusters.txt -> "<book_id> <cluster>" per line
#   texts.txt    -> "<book_id> <text>" per line (same line order)
cluster_dir = TMP + CLUSTERS
if not os.path.exists(cluster_dir):
    # Same guard as tf()/idf(): opening for writing fails if the folder is
    # missing, and nothing else creates it.
    os.makedirs(cluster_dir)

clusters = {}
with open(cluster_dir + 'clusters.txt', 'w') as f, \
        open(cluster_dir + 'texts.txt', 'w') as o:
    for book, text, cls in zip(books, texts, c):
        clusters.setdefault(cls, []).append((book, text))
        f.write('{} {}\n'.format(book.book_id(), cls))
        o.write('{} {}\n'.format(book.book_id(), text))


## クラスター内でのトピック分析

In [None]:
# Start from an empty grouping; it is rebuilt from the files on disk below.
clusters = {}
# Lookup from book_id() to the Book object.
books_dict = {book.book_id(): book for book in books}


In [None]:
# Rebuild `clusters` from clusters.txt / texts.txt, which hold the same books
# in the same line order.
# The handles get their own names so that the `c` label array produced by
# fcluster is not overwritten by a (closed) file object.
with open(TMP + CLUSTERS + 'clusters.txt', 'r') as cluster_file, \
        open(TMP + CLUSTERS + 'texts.txt', 'r') as text_file:
    for line1, line2 in zip(cluster_file, text_file):
        book_id, cls = line1.split()
        # A book whose text is empty was written as "<book_id> " and yields a
        # single field; treat that as empty text instead of failing to unpack.
        fields = line2.split(None, 1)
        text = fields[1].strip() if len(fields) > 1 else ''
        # NOTE(review): assumes Book.book_id() returns an int; if it returns
        # the string read from the source list this lookup raises KeyError.
        book_id = int(book_id)
        cls = int(cls)
        clusters.setdefault(cls, []).append((books_dict[book_id], text))
# Print up to three books of every cluster as a sanity check.
for cls in clusters:
    print('CLASS: {}'.format(cls))
    for book, text in clusters[cls][:3]:
        print('  {}'.format(book))


In [None]:
# Rebind `texts` to one list of text strings per cluster (iteration order of
# `clusters`), then preview the first three words of each cluster's first text.
texts = [[t for book, t in clusters[i]] for i in clusters]
for text in texts:
    print(' '.join(text[0].split()[:3]))


## 共起ネットワークを構築する


In [None]:
def generate_network(df):
    """Create an empty pyvis ``Network`` and print ``df``.

    ``df`` is only printed for now; no nodes or edges are added to the
    network.

    :param df: object to print (a DataFrame in the visible call site).
    :return: the newly created ``Network``.
    """
    graph = Network(
        height="1000px",
        width="95%",
        bgcolor="#FFFFFF",
        font_color="black",
        notebook=True,
    )
    print(df)
    return graph


In [None]:
# Tiny hand-written frame (two rows, one column per word) to exercise
# generate_network.
test_df = pd.DataFrame([[1.2, 2.3, 3.1, 0.0], [1.2, 2.3, 3.1, 1.4]], columns=['hello', 'i', 'happy', 'this'])
net = generate_network(test_df)
