In [203]:
import nltk
from nltk.corpus import wordnet as wn
import gensim
from gensim import corpora


In [204]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yutorse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## データセットの準備

In [205]:
import os

path = get_ipython().run_line_magic('pwd', '')
path = os.path.join(path, "BBC News Summary/Summaries/")
directory_list = ['business', 'entertainment', 'politics', 'sport', 'tech']

documents = []
true_clusters = []
for i, directory in enumerate(directory_list):
    for filename in os.listdir(path + directory):
        with open(path + directory + "/" + filename, "r") as f:
            documents.append(f.read())
            true_clusters.append(i)

## 文書データの前処理

In [206]:
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")
stop_words.append("say")

def preprocess_word(word):
    word = word.lower()
    if word in ["",",","."]:
        return None
    if word in stop_words:
        return None
    lemma = wn.morphy(word)
    if lemma is None:
        return None
    elif lemma in stop_words:
        return None
    else:
        return lemma

def preprocess_document(document):
    words = nltk.word_tokenize(document)
    words = [preprocess_word(word) for word in words]
    words = [word for word in words if word is not None]
    return words

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yutorse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## トピック数5のときの分析

In [207]:
dictionary = corpora.Dictionary(preprocess_documents(documents))
corpus_ = [dictionary.doc2bow(document) for document in preprocess_documents(documents)]


In [208]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus_, num_topics=5, id2word=dictionary, alpha=0.3, eta=0.1,)

In [209]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.023*"mobile" + 0.013*"people" + 0.011*"phone" + 0.009*"mr" + 0.008*"gadget" + 0.008*"digital" + 0.007*"also" + 0.007*"services" + 0.007*"technology" + 0.007*"video"')
(1, '0.011*"people" + 0.010*"game" + 0.009*"use" + 0.007*"mr" + 0.007*"phone" + 0.007*"mobile" + 0.006*"make" + 0.006*"new" + 0.006*"technology" + 0.005*"search"')
(2, '0.006*"game" + 0.006*"first" + 0.006*"take" + 0.006*"gadget" + 0.005*"new" + 0.005*"also" + 0.005*"player" + 0.005*"us" + 0.005*"year" + 0.005*"get"')
(3, '0.012*"game" + 0.008*"music" + 0.008*"player" + 0.008*"make" + 0.006*"also" + 0.006*"technology" + 0.005*"best" + 0.005*"new" + 0.005*"mr" + 0.005*"dvd"')
(4, '0.011*"new" + 0.010*"system" + 0.008*"people" + 0.008*"software" + 0.008*"firm" + 0.007*"technology" + 0.007*"network" + 0.006*"also" + 0.006*"virus" + 0.006*"windows"')


In [210]:
document_IDs = [0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]

for id in document_IDs:
    print("document ID "+str(id)+":" ,end="")
    print(ldamodel.get_document_topics(corpus_[id]))

document ID 0:[(0, 0.5816814), (2, 0.3811845), (3, 0.017067546), (4, 0.015245868)]
document ID 200:[(2, 0.97987205)]
document ID 400:[(0, 0.88460475), (1, 0.103553586)]
document ID 600:[(1, 0.011776151), (2, 0.43793872), (3, 0.27307513), (4, 0.2743524)]
document ID 800:[(1, 0.9792152)]
document ID 1000:[(2, 0.97162217)]
document ID 1200:[(0, 0.15410425), (2, 0.19194071), (3, 0.64935446)]
document ID 1400:[(0, 0.012557187), (1, 0.11891402), (2, 0.33972657), (3, 0.51621884), (4, 0.012583399)]
document ID 1600:[(0, 0.5487747), (2, 0.33514914), (3, 0.10501678)]
document ID 1800:[(2, 0.971434)]


## トピック数5のときの可視化

In [211]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

lda_display = pyLDAvis.gensim_models.prepare(ldamodel, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(


## トピック数10のときの分析

In [212]:
dictionary = corpora.Dictionary(preprocess_documents(documents))
corpus_ = [dictionary.doc2bow(document) for document in preprocess_documents(documents)]

ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus_, num_topics=10, id2word=dictionary, alpha=0.3, eta=0.1,)

topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

document_IDs = [0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]

for id in document_IDs:
    print("document ID "+str(id)+":" ,end="")
    print(ldamodel.get_document_topics(corpus_[id]))

(0, '0.009*"us" + 0.006*"film" + 0.006*"game" + 0.005*"one" + 0.005*"make" + 0.005*"project" + 0.004*"speed" + 0.004*"take" + 0.004*"top" + 0.004*"first"')
(1, '0.016*"mobile" + 0.016*"game" + 0.014*"people" + 0.012*"technology" + 0.011*"phone" + 0.010*"music" + 0.008*"new" + 0.007*"use" + 0.007*"video" + 0.007*"also"')
(2, '0.015*"e-mail" + 0.007*"spam" + 0.007*"new" + 0.007*"virus" + 0.007*"text" + 0.006*"us" + 0.006*"people" + 0.006*"junk" + 0.005*"computer" + 0.004*"musician"')
(3, '0.012*"site" + 0.008*"people" + 0.007*"laser" + 0.007*"message" + 0.007*"virus" + 0.007*"software" + 0.006*"make" + 0.006*"use" + 0.006*"light" + 0.005*"silicon"')
(4, '0.011*"firm" + 0.011*"search" + 0.010*"system" + 0.010*"software" + 0.009*"security" + 0.007*"file" + 0.007*"windows" + 0.006*"people" + 0.005*"make" + 0.005*"new"')
(5, '0.010*"site" + 0.008*"hip-hop" + 0.008*"best" + 0.008*"mr" + 0.007*"people" + 0.006*"get" + 0.005*"one" + 0.004*"uk" + 0.004*"make" + 0.004*"last"')
(6, '0.014*"mr" + 0

## トピック数10のときの可視化

In [213]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(
