In [1]:
import gzip
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary

import numpy as np
import pyLDAvis
from pyLDAvis import gensim

In [7]:
def preprocessing(sentence):
    """Tokenize ``sentence`` and drop stop words.

    The text is split into tokens by gensim's ``simple_preprocess``; every
    token that is a member of ``STOPWORDS`` is discarded.

    Returns:
        list: the remaining tokens, in their original order.
    """
    kept = []
    for token in simple_preprocess(sentence):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def read_sentences(filename):
    """Lazily yield one preprocessed token list per line of a gzip file.

    The archive is opened in binary mode (``'rb'``), so each raw line is
    handed to ``preprocessing`` as bytes. Being a generator, nothing is read
    until the caller starts iterating.

    Args:
        filename: path of the gzip-compressed text file to read.

    Yields:
        The result of ``preprocessing`` for each line, in file order.
    """
    with gzip.open(filename, 'rb') as archive:
        yield from (preprocessing(raw_line) for raw_line in archive)

In [9]:
%%time
sentences = list(read_sentences('reviews_data.txt.gz'))

CPU times: user 57.2 s, sys: 1.25 s, total: 58.4 s
Wall time: 58.4 s


In [10]:
# Sanity check: number of preprocessed token lists loaded (one per line of the input file).
len(sentences)

255404

## Model

In [11]:
%%time
model = Word2Vec(sentences, size=100, window=5, min_count=2)

CPU times: user 6min 16s, sys: 1.38 s, total: 6min 17s
Wall time: 3min 14s


## Reprezentacja wektorowa

In [12]:
# Words the trained model ranks as most similar to 'good', with their similarity scores.
model.wv.most_similar('good')

[('decent', 0.8360605239868164),
 ('great', 0.8224554657936096),
 ('excellent', 0.80601966381073),
 ('fair', 0.6572003364562988),
 ('ok', 0.6471773386001587),
 ('reasonable', 0.6194989085197449),
 ('average', 0.6183684468269348),
 ('nice', 0.6167936325073242),
 ('fantastic', 0.6078464984893799),
 ('terrific', 0.6004981398582458)]

In [13]:
# Same query for 'bad', to compare against the neighbourhood of 'good'.
model.wv.most_similar('bad')

[('terrible', 0.7258570194244385),
 ('horrible', 0.6899006366729736),
 ('awful', 0.6618738174438477),
 ('poor', 0.6270333528518677),
 ('negative', 0.6100442409515381),
 ('disliked', 0.5744665265083313),
 ('okay', 0.5672655701637268),
 ('ok', 0.5644590258598328),
 ('strange', 0.5598120093345642),
 ('unpleasant', 0.5580019950866699)]

In [14]:
# Vector arithmetic query: words close to 'bad' + 'terrible' once 'good' is subtracted.
model.wv.most_similar(
    positive=['bad', 'terrible'],
    negative=['good'],
)

[('horrible', 0.8581506013870239),
 ('awful', 0.8117585182189941),
 ('dreadful', 0.711002767086029),
 ('horrendous', 0.7018737196922302),
 ('shocking', 0.6971772313117981),
 ('horrific', 0.6857098340988159),
 ('appalling', 0.6665670871734619),
 ('horrid', 0.6525853872299194),
 ('nasty', 0.6495195627212524),
 ('disgusting', 0.6493038535118103)]

## Topic modeling - LDA

In [15]:
# Shuffle the corpus through a permutation of *indices* rather than passing the
# list of token lists to np.random.permutation directly: that call first has to
# coerce its argument into an array, and the documents have different lengths
# (ragged input, which recent NumPy releases reject instead of building an
# object array). The fixed seed makes the shuffled order - and therefore the
# sample drawn from it - repeatable across kernel restarts.
sentences_light = [
    sentences[i]
    for i in np.random.RandomState(42).permutation(len(sentences))
]

In [16]:
# Keep only the first 1000 shuffled documents as a small random sample for the LDA experiments.
sentences_light = sentences_light[:1000]

In [17]:
%%time
# Build the gensim Dictionary (vocabulary) from the sampled documents only;
# it is used below by doc2bow and as the id2word mapping of the LDA model.
dictionary = Dictionary(sentences_light)

CPU times: user 114 ms, sys: 16 µs, total: 114 ms
Wall time: 113 ms


In [18]:
%%time
bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

CPU times: user 81.2 ms, sys: 8 µs, total: 81.2 ms
Wall time: 79.3 ms


In [21]:
%%time
lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=100, passes=20, workers=8)

CPU times: user 1min 14s, sys: 2.93 s, total: 1min 17s
Wall time: 1min 15s


In [22]:
# Print every topic (-1 asks for all of them). Each item is a pair of the topic
# index and one formatted string of weighted terms for that single topic
# (so `topics` holds the description of one topic, despite the plural name).
for idx, topics in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topics))

Topic: 0 
Words: 0.033*"hotel" + 0.022*"good" + 0.017*"room" + 0.016*"great" + 0.015*"location" + 0.011*"stay" + 0.008*"breakfast" + 0.008*"away" + 0.008*"walk" + 0.008*"price"
Topic: 1 
Words: 0.026*"hotel" + 0.022*"staff" + 0.015*"room" + 0.013*"stay" + 0.013*"night" + 0.012*"location" + 0.012*"wonderful" + 0.011*"great" + 0.009*"chicago" + 0.007*"helpful"
Topic: 2 
Words: 0.026*"hotel" + 0.022*"room" + 0.010*"staff" + 0.009*"stay" + 0.009*"area" + 0.009*"excellent" + 0.009*"good" + 0.008*"stayed" + 0.008*"great" + 0.006*"breakfast"
Topic: 3 
Words: 0.039*"room" + 0.026*"hotel" + 0.009*"day" + 0.009*"service" + 0.009*"stayed" + 0.008*"staff" + 0.008*"quot" + 0.007*"stay" + 0.007*"small" + 0.007*"time"
Topic: 4 
Words: 0.039*"room" + 0.021*"quot" + 0.020*"bed" + 0.016*"hotel" + 0.012*"sheets" + 0.011*"like" + 0.011*"disgusting" + 0.009*"stains" + 0.008*"rooms" + 0.008*"place"
Topic: 5 
Words: 0.027*"stay" + 0.023*"aug" + 0.020*"hotel" + 0.020*"sep" + 0.018*"room" + 0.013*"time" + 0.01

## Wizualizacja tematów

In [23]:
# Prepare the pyLDAvis data from the fitted model, its corpus and its
# dictionary, then render the interactive view inline.
lda_vis = pyLDAvis.gensim.prepare(
    lda_model,
    bow_corpus,
    dictionary,
)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
