# nltkの文書群にgensimのLDAを適用してみる

## 準備

必要ライブラリ(全部pipで入ります)
 * nltk
 * gensim
 * pyLDAvis

In [2]:
# Install the libraries this notebook uses.
!pip install nltk
!pip install gensim
!pip install pyLDAvis
# Pin pandas to 1.2.
# NOTE(review): the captured pip log reports that google-colab requires
# pandas~=1.1.0, so on Colab this pin produces a dependency-conflict message.
!pip install --upgrade pandas==1.2

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/03/a5/15a0da6b0150b8b68610cc78af80364a80a9a4c8b6dd5ee549b8989d4b60/pyLDAvis-3.3.1.tar.gz (1.7MB)
[K     |████████████████████████████████| 1.7MB 11.6MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting numpy>=1.20.0
[?25l  Downloading https://files.pythonhosted.org/packages/a5/42/560d269f604d3e186a57c21a363e77e199358d054884e61b73e405dd217c/numpy-1.20.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.3MB)
[K     |████████████████████████████████| 15.3MB 243kB/s 
Collecting pandas>=1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/51/51/48f3fc47c4e2144da2806dfb6629c4dd1fa3d5a143f9652b141e979a8ca9/pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 9

Collecting pandas==1.2
[?25l  Downloading https://files.pythonhosted.org/packages/ff/bd/fb376f9fbad92b9a6efdbb30ff32c80f3cba1368689309cbb5566364af5c/pandas-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 9.9MB 16.1MB/s 
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 1.2.0 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 1.2.4
    Uninstalling pandas-1.2.4:
      Successfully uninstalled pandas-1.2.4
Successfully installed pandas-1.2.0


In [3]:
# If you have never used nltk: after `pip install`, download the datasets
# below (for example from an interactive session).
# stopwords -> nltk.corpus.stopwords, wordnet -> wn.morphy, brown -> the corpus
# loaded later in this notebook. reuters and punkt are not referenced by the
# code visible in this notebook.
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("reuters")
nltk.download("punkt")
nltk.download("brown")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## データロード・前処理

In [4]:
#dataset読み込み
from nltk.corpus import brown as corpus

### 今回はこういう文書(をBOW化したもの)を用います

In [5]:
#必要に応じて以下のコードを実行してください．
# !unzip /root/nltk_data/corpora/brown.zip -d /root/nltk_data/corpora

In [6]:
# Show the first 300 tokens of the first file, 25 tokens per output line.
first_fileid = corpus.fileids()[0]
for position, token in enumerate(corpus.words(first_fileid)[:300], start=1):
    print(token, end=" ")
    # Close the line after every 25th token.
    if position % 25 == 0:
        print(" ")

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .  
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise  
and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged  
by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan  
Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in  
the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's  
registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `

In [7]:
# Total number of documents (file ids) in the corpus.
len(corpus.fileids())

500

In [8]:

# Train on only the first k documents. The slice stops at the end of the
# file-id list, so a k larger than the corpus selects every document.
k = 1000
selected_fileids = corpus.fileids()[:k]
docs = list(map(corpus.words, selected_fileids))

# To train on every document instead:
# docs=[corpus.words(fileid) for fileid in corpus.fileids()]

# print(docs[:5])
# print("num of docs:", len(docs))

## 前処理 

In [24]:
# Build the stopword list.

# Hand-picked noise tokens. The idea: run LDA once, spot symbols and words that
# look like noise, and add them here.
# (Exercise: try removing symbols and digits with a regular expression instead.)
noise_symbols = [
    "``", "/", ",.", ".,", ";", "--", ":", ")", "(", '"', '&', "'", '),',
    ',"', '-', '.,', '.,"', '.-', "?", ">", "<", "\'\'",
]
noise_numbers = [
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
    "86", "1986", "1987", "000",
]
noise_words = [
    "said", "say", "u", "v", "mln", "ct", "net", "dlrs", "tonne", "pct",
    "shr", "nil", "company", "lt", "share", "year", "billion", "price",
]

# The NLTK English stopword list comes last, after the hand-picked entries.
en_stop = (
    noise_symbols
    + noise_numbers
    + noise_words
    + nltk.corpus.stopwords.words('english')
)

In [25]:
#前処理関数の作成

from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

def preprocess_word(word, stopwordset):
    """Normalise one token, or return None when the token should be dropped.

    The token is lower-cased first. "," and "." and anything contained in
    ``stopwordset`` are dropped. Otherwise the base form given by
    ``wn.morphy`` is returned (example: cooked => cook), falling back to the
    lower-cased token when ``wn.morphy`` returns None. A base form that is
    itself in ``stopwordset`` is dropped as well.
    """
    lowered = word.lower()

    # Drop bare commas / full stops and stopwords before lemmatising.
    if lowered in (",", ".") or lowered in stopwordset:
        return None

    base_form = wn.morphy(lowered)
    if base_form is None:
        return lowered

    # The lemmatised form may itself be a stopword.
    return None if base_form in stopwordset else base_form
    

def preprocess_document(document):
    """Preprocess every token of one document and drop the removed ones.

    Args:
        document: iterable of token strings.

    Returns:
        A list of the values ``preprocess_word`` returned for the tokens,
        in their original order, without the None results.
    """
    # en_stop is a list. Build a set once per document so that each membership
    # test inside preprocess_word is a hash lookup rather than a scan over the
    # whole stopword list.
    stopword_set = set(en_stop)
    processed = (preprocess_word(token, stopword_set) for token in document)
    return [token for token in processed if token is not None]

def preprocess_documents(documents):
    """Apply ``preprocess_document`` to each document; return the results as a list."""
    return list(map(preprocess_document, documents))

In [26]:
# before: the first 25 raw tokens of the first document
print(docs[0][:25])

# after: the first 25 tokens of the same document once preprocessed.
# Only docs[0] is preprocessed here; running preprocess_documents over every
# document just to display one of them would be wasted work.
print(preprocess_document(docs[0])[:25])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['fulton', 'county', 'grand', 'jury', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produce', 'evidence', 'irregularity', 'take', 'place', 'jury', 'term-end', 'presentment', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', 'deserve']


## LDA準備

In [27]:
import gensim
from gensim import corpora

In [28]:
# Convert the documents into the data structures gensim's LDA reads.

# Preprocess once and reuse the result for both the dictionary and the corpus,
# so the full preprocessing pass is not run twice.
processed_docs = preprocess_documents(docs)

# Build the dictionary.
dictionary = corpora.Dictionary(processed_docs)
# Build the corpus: one bag-of-words per document.
corpus_ = [dictionary.doc2bow(doc) for doc in processed_docs]

In [29]:
# Dictionary: gensim's dictionary class.
# Its token2id attribute holds the mapping from each word to its dictionary ID.

print(dictionary.token2id)



In [30]:
# For each document, corpus_ holds a list of (word ID, occurrence count) pairs.

print(corpus_[0][:10]) # note: ordered by ascending dictionary ID, not by position in the text

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


In [31]:
# before: the first sentence of the first file, lower-cased
first_sentence = corpus.sents(corpus.fileids()[0])[0]
lowered_sentence = [w.lower() for w in first_sentence]
print(lowered_sentence)

# after: the same tokens as (word ID, count) pairs
print(dictionary.doc2bow(lowered_sentence))

# corpus_ is this conversion applied to every document (there the tokens are
# the preprocessed ones; this demo only lower-cases them).

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[(68, 1), (149, 1), (196, 1), (214, 1), (247, 1), (250, 1), (273, 1), (312, 1), (327, 1), (434, 1), (454, 1), (487, 1)]


## LDA学習

In [32]:
# Train the LDA model. random_state is fixed so that repeated runs start from
# the same random initialisation instead of producing different topics each time.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus_,
    num_topics=20,
    id2word=dictionary,
    alpha=0.1,       # optional: LDA hyperparameter alpha
    eta=0.1,         # optional: LDA hyperparameter beta (gensim's name for it is eta)
    random_state=0,  # fixed seed for reproducibility
    # minimum_probability=0.0  # optional: lower bound on the topic/word probabilities kept in the result
)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

## パラメータの確認

In [33]:
# Each entry is (topic ID, the words of that topic with their probabilities),
# limited to the top num_words words.

topics = ldamodel.print_topics(num_words=10)
for topic_id, top_words in topics:
    print((topic_id, top_words))

(0, '0.006*"would" + 0.005*"one" + 0.004*"make" + 0.004*"af" + 0.003*"first" + 0.003*"may" + 0.003*"take" + 0.003*"could" + 0.003*"go" + 0.002*"new"')
(1, '0.005*"one" + 0.003*"would" + 0.003*"make" + 0.003*"could" + 0.003*"go" + 0.003*"know" + 0.003*"time" + 0.003*"!" + 0.002*"state" + 0.002*"come"')
(2, '0.006*"one" + 0.003*"!" + 0.003*"make" + 0.003*"time" + 0.003*"would" + 0.003*"get" + 0.002*"come" + 0.002*"even" + 0.002*"two" + 0.002*"go"')
(3, '0.004*"would" + 0.003*"one" + 0.003*"new" + 0.003*"take" + 0.003*"make" + 0.002*"state" + 0.002*"two" + 0.002*"go" + 0.002*"!" + 0.002*"could"')
(4, '0.006*"one" + 0.004*"make" + 0.004*"!" + 0.003*"would" + 0.003*"time" + 0.003*"af" + 0.003*"could" + 0.003*"come" + 0.002*"new" + 0.002*"know"')
(5, '0.005*"!" + 0.004*"one" + 0.003*"state" + 0.003*"would" + 0.003*"go" + 0.003*"make" + 0.003*"two" + 0.003*"take" + 0.003*"get" + 0.003*"time"')
(6, '0.008*"one" + 0.004*"would" + 0.003*"make" + 0.003*"!" + 0.003*"could" + 0.003*"new" + 0.003*"c

In [34]:
# For each document: [(topic ID, probability of that topic in the document)],
# restricted to the entries whose probability exceeds minimum_probability.

for doc_index, bow in enumerate(corpus_[:10]):
    label = "document ID {}:".format(doc_index)
    print(label, end="")
    print(ldamodel.get_document_topics(bow))

document ID 0:[(7, 0.841395), (8, 0.15664777)]
document ID 1:[(7, 0.9242084), (11, 0.07260969)]
document ID 2:[(7, 0.93031645), (13, 0.066695206)]
document ID 3:[(1, 0.055201612), (7, 0.20148088), (8, 0.61506677), (15, 0.11421084)]
document ID 4:[(7, 0.47674188), (8, 0.50137556), (18, 0.02004698)]
document ID 5:[(3, 0.17853875), (7, 0.5311883), (8, 0.2261384), (19, 0.056863558)]
document ID 6:[(7, 0.010422296), (8, 0.034991596), (11, 0.13448709), (15, 0.6692389), (18, 0.14957444)]
document ID 7:[(8, 0.29464525), (10, 0.057736173), (15, 0.64272654)]
document ID 8:[(7, 0.8043988), (8, 0.19359688)]
document ID 9:[(0, 0.36076716), (7, 0.013517619), (8, 0.6242353)]


In [35]:
# Categories of each document, in the same order as corpus.fileids().
categories = list(map(corpus.categories, corpus.fileids()))

In [36]:
# Index of the document to inspect.
n = 0

# Topic distribution of the n-th document.
topic_distribution = ldamodel.get_document_topics(corpus_[n])
print(topic_distribution)

# Category of the n-th document.
print(categories[n])

# Raw text of the n-th document (its tokens joined with single spaces).
raw_text = " ".join(docs[n])
print(raw_text)

[(7, 0.82661647), (8, 0.17081526)]
['news']


## 可視化

In [37]:
# The gensim helpers are imported from pyLDAvis.gensim_models here; the
# alternative import path is left commented out.
#import pyLDAvis.gensim
import pyLDAvis.gensim_models
# Turn on pyLDAvis' notebook mode before displaying anything.
pyLDAvis.enable_notebook()

In [38]:
# This takes quite a while when every document was used for training (20 min or more).
# Note: gensim numbers the K topics 0..K-1, whereas pyLDAvis numbers them 1..K.

lda_display = pyLDAvis.gensim_models.prepare(ldamodel, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ImportError: ignored

In [None]:
# Save the visualisation prepared above as an HTML file.
# NOTE(review): the original comment says this saves to Google Drive, but
# 'vis.html' is a relative path and no Drive mount is visible in this
# notebook — confirm where the working directory points.

pyLDAvis.save_html(lda_display,'vis.html')

In [None]:
# List the working directory (presumably to check that vis.html was written).
!ls