# 自然言語処理: LDA

参考：
* [Gensim](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Corpora_and_Vector_Spaces.ipynb)
* [Gensim preprocessing & training LDA](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/lda_training_tips.ipynb)
* [NLTK]()
* [Sam Roweis' website](http://www.cs.nyu.edu/~roweis/data.html) : Download texts of NIPS papers

## Libraries

In [1]:
import os
from nltk.tokenize import regexp_tokenize
from gensim import corpora
from smart_open import smart_open
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

## Example data

In [2]:
# Folder containing all NIPS papers.
data_dir = './nipstxt/'

# Folders containin individual NIPS papers.
yrs = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dirs = ['nips' + yr for yr in yrs]

# Read all texts into a list.
docs = []
for yr_dir in dirs:
    files = os.listdir(data_dir + yr_dir)
    for filen in files:
        # Note: ignoring characters that cause encoding errors.
        with smart_open(data_dir + yr_dir + '/' + filen, 'rb') as fid:
            txt = str(fid.read()).replace("\\n", "")
        docs.append(txt)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Create dictionary & corpus

In [3]:
# tokenize the documents
tokens = [regexp_tokenize(d.lower(), r"\w+") for d in docs] # using NLTK library

# Remove numbers, but not words that contain numbers.
tokens = [[t for t in token if not t.isnumeric()] for token in tokens]

# Remove words that are only one character.
tokens = [[t for t in token if len(t) > 1] for token in tokens]

tokens[0][:10]

['connectivity',
 'versus',
 'entropy',
 'yaser',
 'abu',
 'mostafa',
 'california',
 'institute',
 'of',
 'technology']

In [4]:
# Create dictionary with the tokens
dictionary = corpora.Dictionary(tokens)
# dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))  # store the dictionary, for future reference
print(dictionary)

Dictionary(83764 unique tokens: ['0a', '2h', '2h2', '2he', '2n']...)


In [5]:
# filtering of tokens
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
print(dictionary)

Dictionary(7412 unique tokens: ['2n', '_c', 'a2', 'ability', 'abu']...)


In [6]:
# how to use
print(dictionary.token2id["2n"]) # token=2nのid
print(dictionary.dfs[0]) # id=0 (token="2n")が現れた回数

0
97


In [7]:
# コーパス
corpus = [dictionary.doc2bow(t) for t in tokens]
corpus[0][:10]

[(0, 4),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 4),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

In [8]:
# 新しい文書を辞書で
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
new_vec

[(169, 1), (1929, 1)]

## Save&Load corpus

In [9]:
# Save corpus as efficient file formats 
corpora.MmCorpus.serialize(os.path.join("./", 'corpus.mm'), corpus) # MatrixMarket (mtx) format
# corpora.SvmLightCorpus.serialize(os.path.join("./", 'corpus.svmlight'), corpus) # SVM light

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [10]:
# Load corpus
corpus = corpora.MmCorpus(os.path.join("./", 'corpus.mm'))
corpus

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


<gensim.corpora.mmcorpus.MmCorpus at 0x1a35f86bd8>

In [11]:
corpus = list(corpus)
corpus[0][:10]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(0, 4.0),
 (1, 1.0),
 (2, 1.0),
 (3, 2.0),
 (4, 4.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0),
 (8, 1.0),
 (9, 1.0)]

## gensim: LDA

In [12]:
from gensim.models import LdaModel

* n_topics: トピック数
* chunksize: 学習の1ステップで何個のDocumentを使うか。学習の結果に強く影響する（[ref](https://www.di.ens.fr/~fbach/mdhnips2010.pdf)）
* passes: エポック数。各documentが何回学習に用いられるか。
* iterations
* eval_every: 
* alpha, eta="auto": 最適なハイパーパラメータを自動で探索する
* coherence: トピックの解釈性の高さを表す指標([ref](https://www.slideshare.net/hoxo_m/coherence-57598192))。いくつかの手法が実装されており選択できる([ref](https://qiita.com/tatsuya-miyamoto/items/7d49959c74f3c1e0cf63))。u_mass以外は学習用データとは異なるテキストが必要。RWDにはu_mass以外選択の余地がなさそう。

In [15]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
alpha='auto'
eta='auto'

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus[:100], id2word=id2word, \
                       chunksize=chunksize, alpha=alpha, eta=eta, \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every) 

CPU times: user 6.84 s, sys: 34.6 ms, total: 6.87 s
Wall time: 3.49 s


In [16]:
coherence="u_mass"
top_topics = model.top_topics(corpus[:100], topn=20, coherence=coherence)
top_topics[0]

([(0.011323489, 'memory'),
  (0.0060359915, 'matrix'),
  (0.005187462, 'net'),
  (0.0051225144, 'vectors'),
  (0.0048700874, 'capacity'),
  (0.004830101, 'associative'),
  (0.0046276394, 'units'),
  (0.0045003598, 'stored'),
  (0.004232521, 'patterns'),
  (0.00415348, 'fig'),
  (0.0040757237, 'hopfield'),
  (0.003869393, 'layer'),
  (0.0034759135, 'equation'),
  (0.0034571607, 'sequence'),
  (0.0034305546, 'recall'),
  (0.0031486442, 'image'),
  (0.0031168624, 'control'),
  (0.0030738767, 'xf8'),
  (0.0029989926, 'optical'),
  (0.0027731087, 'signal')],
 -0.8729128881423159)

In [17]:
# Average topic coherence
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

Average topic coherence: -1.3912.


In [18]:
pyLDAvis.gensim.prepare(model, corpus[:100], dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
