In [1]:
# Import gensim LDA package
import gensim
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel
# Import nltk for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
# Import numpy and regular expressions for text manipulations
import re
import numpy as np
# Import math for logarithmic calculation
import math
import encodings
import pandas as pd

In [18]:
# Use the porter stemming method for stemming words
stemmer=PorterStemmer()

# Import the stop word list built into the NLTK
stop = nltk.corpus.stopwords.words('english')
# Special word additions
letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
special_char = ['~~','``',"''", '--',"'ll","n't","'re","'s","'d","term", 'it']
# Add to stopwords list
stop = np.append(stop, letters)
stop = np.append(stop, special_char)
# Print the stopword list
#print stop

# Removes stop words from the given file and returns a list of words without white spaces, numeric values, and symbols
def removestop(text, stop):
    nostop =  [i for i in text.lower().split() if i not in stop]
    
    cleanstop = []
    for word in nostop:
        new = re.sub("[^A-Za-z]", "", word)
        if new != '':
            cleanstop.append(new)
    return cleanstop
    
# Stems words based on the stemming method chosen above. Returns a list of stemmed words.
def stemwords(text):
    stemresult = []
    for plural in text:
        stemresult.append(stemmer.stem(plural))
    return stemresult

[u'i' u'me' u'my' u'myself' u'we' u'our' u'ours' u'ourselves' u'you'
 u'your' u'yours' u'yourself' u'yourselves' u'he' u'him' u'his' u'himself'
 u'she' u'her' u'hers' u'herself' u'it' u'its' u'itself' u'they' u'them'
 u'their' u'theirs' u'themselves' u'what' u'which' u'who' u'whom' u'this'
 u'that' u'these' u'those' u'am' u'is' u'are' u'was' u'were' u'be' u'been'
 u'being' u'have' u'has' u'had' u'having' u'do' u'does' u'did' u'doing'
 u'a' u'an' u'the' u'and' u'but' u'if' u'or' u'because' u'as' u'until'
 u'while' u'of' u'at' u'by' u'for' u'with' u'about' u'against' u'between'
 u'into' u'through' u'during' u'before' u'after' u'above' u'below' u'to'
 u'from' u'up' u'down' u'in' u'out' u'on' u'off' u'over' u'under' u'again'
 u'further' u'then' u'once' u'here' u'there' u'when' u'where' u'why' u'how'
 u'all' u'any' u'both' u'each' u'few' u'more' u'most' u'other' u'some'
 u'such' u'no' u'nor' u'not' u'only' u'own' u'same' u'so' u'than' u'too'
 u'very' u's' u't' u'can' u'will' u'just' u'don' 

In [19]:
# Read the main results file
test = pd.read_csv("results.csv", encoding='utf-8')
# Filter on review name
total = test[test['reviewer'] == "Brad Shoemaker"]
# Reset index based on that writer's number of reviews
total.index = range(len(total))

In [20]:
# Create a dictionary to store the multiple lists 
text = dict()
# Begin loop for all reviews
for i in range(0,len(total)):
    # Import and read text file
    example = total.review[i]
    # Strip out stopwords and white spaces
    x = removestop(example, stop)
    # Stem words
    y = stemwords(x)
    # Copy the list of stemmed words into the dictionary
    text[i] = y

# Create an array to put all the lists 
patent_array = []

# Begin loop for the dictionary
for i in range(0,len(total)):
    # Convert lists into arrays
    doctext = np.asarray(text[i]).reshape(-1, 1)
    # Place the arrays into one array
    patent_array.append(doctext)   

In [21]:
# Review count
len(patent_array)

130

In [22]:
# Create dictionary of terms
dictionary = corpora.Dictionary(patent_array)
# Store the dictionary, for future reference
dictionary.save('videogames.dict')
# Create the corpus of literature
corpus = [dictionary.doc2bow(document) for document in patent_array]

In [23]:
# Setup LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10)

In [24]:
# Print topics
n = 1
for i in lda.show_topics(num_topics=10, num_words=10):
    print "Topic " + str(n)
    for x in i.split(' + '):
        print x
    n += 1
    print

Topic 1
0.012*g   a   m   e                                                   
0.005*g   e   t                                                       
0.004*i   t                                                           
0.004*l   i   k   e                                                   
0.003*m   a   k   e                                                   
0.003*o   n   e                                                       
0.003*c   o   n   t   r   o   l                                       
0.003*p   l   a   y                                                   
0.003*t   i   m   e                                                   
0.003*t   h   e   r   e                                               

Topic 2
0.017*g   a   m   e                                                   
0.008*i   t                                                           
0.006*o   n   e                                                       
0.005*g   e   t                                             