In [14]:
%%time
# Read the German-language reviews from the CSV export.
# Each kept review becomes one string: "<Title> <Text>".
import csv

# NOTE(review): no explicit `encoding=` is passed, so the file is decoded with
# the platform default -- confirm the export's encoding and pin it here.
with open('datasets/reviews.txt', newline='') as csvfile:
    all_reviews = csv.DictReader(csvfile, delimiter=',', quotechar='"')
    # DictReader fills fields that are missing from a short row with None;
    # `or ''` keeps such rows from raising TypeError on concatenation.
    reviews = [
        (review['Title'] or '') + " " + (review['Text'] or '')
        for review in all_reviews
        if review['LanguageCode'] == 'de'
    ]

In [15]:
%%time
# Delete too small or too big comments
import re

# Punctuation and line breaks that are replaced by a single space.
# Raw string: the backslashes are regex escapes, not (invalid) string escapes,
# which would otherwise trigger a SyntaxWarning on recent Python versions.
_SPECIAL_CHARS = re.compile(r'[\r\n\.,!?()":;\-+=\/]')

# Drop reviews that are too short or too long (keep 5..999 characters)
reviews = [item for item in reviews if 4 < len(item) < 1000]

# Drop reviews containing links (probably spam)
reviews = [item for item in reviews if "href=" not in item]

# Replace special characters with spaces (this probably could be done better)
reviews = [_SPECIAL_CHARS.sub(' ', item) for item in reviews]

In [16]:
%%time
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import GermanStemmer
from stop_words import get_stop_words

tokenizer = RegexpTokenizer(r'\w+')
g_stemmer = GermanStemmer()

# `de_stop` keeps its original list form; the set is used for fast lookups.
de_stop = get_stop_words('de')
de_stop_set = frozenset(de_stop)

# Turn every review string into a list of stemmed, non-stop-word tokens.
final_reviews = []
for review in reviews:
    # Tokenize by words (lower-cased so tokens match the stop-word list)
    tokens = tokenizer.tokenize(review.lower())

    # Delete German stop words (maybe it should be more specific for fashion).
    # This must happen BEFORE stemming: the stop list contains surface forms,
    # while the stemmer rewrites tokens (suffix stripping; the Snowball German
    # algorithm also drops umlauts), so a stemmed stop word may no longer
    # match its list entry and would leak into the corpus.
    content_tokens = [token for token in tokens if token not in de_stop_set]

    # Stem the remaining tokens
    final_reviews.append([g_stemmer.stem(token) for token in content_tokens])

In [None]:
%%time
from gensim import corpora, models
import gensim

# Tunable model settings
NUM_TOPICS = 5         # number of latent topics to extract
NUM_PASSES = 20        # full passes over the corpus during training
WORDS_PER_TOPIC = 5    # words shown per topic in the summary
RANDOM_SEED = 42       # fixed seed so repeated runs give the same topics

# Map each token to an integer id, then encode every review as a bag of words
dictionary = corpora.Dictionary(final_reviews)
corpus = [dictionary.doc2bow(review) for review in final_reviews]

# LDA training is stochastic; `random_state` makes the result reproducible
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)
print(ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=WORDS_PER_TOPIC))