In [1]:
import pandas as pd
import gensim

In [3]:
df = pd.read_csv("data/dota2.csv", sep=";")

# Exploration of reviews

In [5]:
df.describe()

Unnamed: 0,review,date_posted
count,9720,9720
unique,8240,8472
top,Nice game,"Nov 13, 2014, 10:08AM"
freq,31,28


In [10]:
df["review"].str.len().describe()

count    9720.000000
mean      210.752263
std       511.143711
min         2.000000
25%        30.000000
50%        62.000000
75%       163.000000
max      7946.000000
Name: review, dtype: float64

In [12]:
long_reviews = df[df["review"].str.len() > 62]["review"]

# Basic gensim topic modeling

## Create a 2d array with the words in each document

In [13]:
documents_clean = long_reviews.str.replace("\.|\?|-|!|'|,", "")
documents_clean = documents_clean[documents_clean.str.len() > 50]
stoplist = list(gensim.parsing.preprocessing.STOPWORDS) + \
            ["hes", "think", "weve", "thats", "said", "want", "look", "youve", "youre"]
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents_clean]

## Remove words that only appear once

In [14]:
from collections import defaultdict
frequency =  defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

## Generate a dictionary (word to id mapping)

In [15]:
dictionary = gensim.corpora.Dictionary(texts)
print(dictionary)

Dictionary(8664 unique tokens: ['game', 'taught', 'diversity', 'cultures', 'small']...)


In [16]:
dictionary.filter_n_most_frequent(20)

## Generate the bag of words corpus

In [17]:
corpus = [dictionary.doc2bow(text) for text in texts]

## LDA Model

In [18]:
lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=10)

In [19]:
top_words = [[word for word, _ in lda.show_topic(topicno, topn=50)] for topicno in range(lda.num_topics)]

In [20]:
for topics in top_words:
    print(" ".join(topics[:10]))

support carry enemy russians 1 3 gold lane 4 10
recommend definitely u low problems self mood :d valkyrieflight strategy
awesome language hate weight lol learning racism extreme subconsciously russian
heroes items new steam try hero im know different community
new learning moba curve amazing free recommend gameplay strategy player
learn hours russian recommend language nice try day ive know
cut u + amazing im know :) dota2 makes league
love better valve need new free lol hours heroes community
heroes russian items hours moba youll language free learn experience
league hours legends moba taught community recommend new russian free


In [160]:
doc = 20
print(lda.get_document_topics(corpus[doc]))
[print(dictionary[id]) for id, _ in corpus[doc]];

[(0, 0.014288354240350678), (1, 0.014289257614619936), (2, 0.014287534104115342), (3, 0.014286256067873048), (4, 0.014286761888813731), (5, 0.014290424627816264), (6, 0.8714051778973243), (7, 0.014287957221450008), (8, 0.01429167503625736), (9, 0.014286601301379379)]
debate
saying
things
let
absolutely
case
