In [6]:
%%time
# Read reviews from file
import unicodecsv

with open('datasets/zalando-reviews-FB-release.csv') as csvfile:
    all_reviews = unicodecsv.DictReader(csvfile, delimiter=',', quotechar='"')
    reviews = []
    for review in all_reviews:
        if review['LanguageCode'] == 'de':
            reviews.append(review['Title'] + " " + review['Text'])

CPU times: user 17.9 s, sys: 810 ms, total: 18.8 s
Wall time: 24.7 s


In [7]:
len(reviews)

1035254

In [None]:
%%time
# Delete too small or too big comments
reviews = [item for item in reviews if len(item) > 4 and len(item) < 1000]

# Delete comments with links (probably spam)
reviews = [item for item in reviews if item.find("href=") < 0]

# Delete special characters (this probably could be done better)
import re
reviews = [re.sub('[\r\n\.,!?()":;\-+=\/]', ' ', item) for item in reviews]

In [5]:
%%time
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem.snowball import GermanStemmer
g_stemmer = GermanStemmer()

from stop_words import get_stop_words

de_stop = get_stop_words('de')

final_reviews = []
for review in reviews:
    # Tokenize by words
    raw = review.lower()
    review = tokenizer.tokenize(raw)
    
    # Stem tokens
    review = ([g_stemmer.stem(i) for i in review])
    
    # Delete german stop words (maybe it should be more specific for fashion)
    review = ([i for i in review if i not in de_stop])
    
    final_reviews.append(review)

CPU times: user 3min 17s, sys: 1.2 s, total: 3min 18s
Wall time: 3min 23s


In [4]:
%%time
from gensim import corpora, models
import gensim

dictionary = corpora.Dictionary(final_reviews)
corpus = [dictionary.doc2bow(review) for review in final_reviews]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20)

210489

In [19]:
topics = ldamodel.print_topics(num_topics=20, num_words=5)

for t in topics:
    print(t)

[(0, '0.088*"gross" + 0.043*"klein" + 0.042*"leid" + 0.032*"schon" + 0.032*"fallt"'), (1, '0.043*"jack" + 0.040*"wied" + 0.040*"gut" + 0.037*"all" + 0.036*"sup"'), (2, '0.069*"schuh" + 0.032*"fur" + 0.023*"schon" + 0.017*"fuss" + 0.015*"gut"'), (3, '0.072*"gut" + 0.070*"sup" + 0.053*"toll" + 0.049*"schon" + 0.029*"passt"'), (4, '0.041*"schon" + 0.029*"kleid" + 0.026*"farb" + 0.023*"fur" + 0.020*"schnitt"')]
CPU times: user 1h 8min 29s, sys: 1.86 s, total: 1h 8min 31s
Wall time: 1h 35min 40s


In [None]:
topics = ldamodel.print_topics(num_topics=20, num_words=5)

for t in topics:
    print(t)