In [3]:
%%time
# Read reviews from file
import unicodecsv

with open('datasets/zalando-reviews-FB-release.csv') as csvfile:
    all_reviews = unicodecsv.DictReader(csvfile, delimiter=',', quotechar='"')
    reviews = []
    for review in all_reviews:
        if review['LanguageCode'] == 'en':
            reviews.append(review['Title'] + " " + review['Text'])

CPU times: user 15.6 s, sys: 370 ms, total: 16 s
Wall time: 17.1 s


In [4]:
len(reviews)

15933

In [5]:
%%time
# Delete too small or too big comments
reviews = [item for item in reviews if len(item) > 4 and len(item) < 1000]

# Delete comments with links (probably spam)
reviews = [item for item in reviews if item.find("href=") < 0]

# Delete special characters (this probably could be done better)
import re
reviews = [re.sub('[\r\n\.,!?()":;\-+=\/]', ' ', item) for item in reviews]

CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 59.5 ms


In [6]:
%%time
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem.snowball import EnglishStemmer
e_stemmer = EnglishStemmer()

from stop_words import get_stop_words

en_stop = get_stop_words('en')

final_reviews = []
for review in reviews:
    # Tokenize by words
    raw = review.lower()
    review = tokenizer.tokenize(raw)
    
    # Stem tokens
    review = ([e_stemmer.stem(i) for i in review])
    
    # Delete german stop words (maybe it should be more specific for fashion)
    review = ([i for i in review if i not in en_stop])
    
    final_reviews.append(review)

CPU times: user 3.81 s, sys: 30 ms, total: 3.84 s
Wall time: 3.95 s


In [7]:
%%time
from gensim import corpora, models
import gensim

dictionary = corpora.Dictionary(final_reviews)
corpus = [dictionary.doc2bow(review) for review in final_reviews]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20)

CPU times: user 5min 34s, sys: 300 ms, total: 5min 34s
Wall time: 5min 34s


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [8]:
topics = ldamodel.print_topics(num_topics=20, num_words=5)

for t in topics:
    print(t)

(0, u'0.401*"boot" + 0.044*"comfort" + 0.027*"look" + 0.023*"love" + 0.020*"brown"')
(1, u'0.083*"jean" + 0.081*"great" + 0.045*"look" + 0.039*"smart" + 0.037*"love"')
(2, u'0.126*"just" + 0.051*"want" + 0.034*"right" + 0.029*"expens" + 0.027*"one"')
(3, u'0.238*"veri" + 0.134*"nice" + 0.091*"happi" + 0.056*"pleas" + 0.037*"comfi"')
(4, u'0.144*"top" + 0.126*"dress" + 0.031*"ok" + 0.030*"love" + 0.029*"long"')
(5, u'0.034*"warm" + 0.033*"coat" + 0.021*"wear" + 0.016*"winter" + 0.015*"s"')
(6, u'0.268*"great" + 0.093*"jacket" + 0.060*"valu" + 0.055*"qualiti" + 0.050*"fit"')
(7, u'0.113*"bag" + 0.051*"jumper" + 0.033*"poor" + 0.032*"qualiti" + 0.032*"describ"')
(8, u'0.242*"love" + 0.092*"colour" + 0.074*"beauti" + 0.042*"perfect" + 0.036*"gorgeous"')
(9, u'0.118*"comfort" + 0.072*"veri" + 0.068*"perfect" + 0.045*"shoe" + 0.042*"fit"')
(10, u'0.149*"shoe" + 0.048*"pair" + 0.023*"feet" + 0.018*"bought" + 0.017*"narrow"')
(11, u'0.099*"excel" + 0.078*"trainer" + 0.071*"great" + 0.068*"serv

In [9]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [10]:
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
