# LDA Modelling - 3rd Run

This model uses fewer types of words, restricting word types to either:

1. n-grams and nouns only, or
2. n-grams, nouns, and verbs (newly added during deployment)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
from nltk import WordNetLemmatizer
import spacy
import re
import bbcode
import json

import gensim
from gensim.test.utils import datapath
from gensim import corpora, models, similarities
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS

import pyLDAvis
import pyLDAvis.gensim as p_gensim

import os
import pathlib
%matplotlib inline

unable to import 'smart_open.gcs', disabling that module


In [27]:
final_df = pd.read_csv('./dataframes/final_df.csv',index_col=0)

In [28]:
final_df = final_df[['timestamp_created','review','clean_reviews','2gram_reviews','3gram_reviews']]
final_df.head()

Unnamed: 0,timestamp_created,review,clean_reviews,2gram_reviews,3gram_reviews
0,1586657138,"Online review only...Simply put, it sucks ass....","['online', 'review', 'simply', 'sucks', 'ass',...","['online', 'review', 'simply', 'suck', 'ass', ...","['online', 'review', 'simply', 'suck', 'ass', ..."
1,1586656937,It's a wonderful multiplayer and singleplayer ...,"['wonderful', 'multiplayer', 'singleplayer', '...","['wonderful', 'multiplayer', 'singleplayer', '...","['wonderful', 'multiplayer', 'singleplayer', '..."
2,1586656864,People claim a lot in reviews that they cant r...,"['people', 'claim', 'reviews', 'run', 'works',...","['people', 'claim', 'review', 'run', 'works_fi...","['people', 'claim', 'review', 'run', 'works_fi..."
3,1586655683,"Best fucking story I have ever played, online'...","['best', 'fucking', 'story', 'played', 'online...","['best', 'fucking', 'story', 'play', 'online',...","['best', 'fucking', 'story', 'play', 'online',..."
4,1586655188,well made game lots of fun,"['lots', 'fun']",['lots_fun'],['lots_fun']


In [29]:
final_df['clean_reviews'] = final_df['clean_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) #n-grams underscores must be preserved for readability
final_df['2gram_reviews'] = final_df['2gram_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) 
final_df['3gram_reviews'] = final_df['3gram_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) 
#Reading in the DF from a CSV turned the list of words in each cell into string, so we have to remove the punctuation and split them again to get lists of terms

In [30]:
final_df.head()

Unnamed: 0,timestamp_created,review,clean_reviews,2gram_reviews,3gram_reviews
0,1586657138,"Online review only...Simply put, it sucks ass....","[online, review, simply, sucks, ass, glitches,...","[online, review, simply, suck, ass, glitche, m...","[online, review, simply, suck, ass, glitche, m..."
1,1586656937,It's a wonderful multiplayer and singleplayer ...,"[wonderful, multiplayer, singleplayer, fix, aw...","[wonderful, multiplayer, singleplayer, fix, aw...","[wonderful, multiplayer, singleplayer, fix, aw..."
2,1586656864,People claim a lot in reviews that they cant r...,"[people, claim, reviews, run, works, fine, tim...","[people, claim, review, run, works_fine, time,...","[people, claim, review, run, works_fine, time,..."
3,1586655683,"Best fucking story I have ever played, online'...","[best, fucking, story, played, online, cool, w...","[best, fucking, story, play, online, cool, wor...","[best, fucking, story, play, online, cool, wor..."
4,1586655188,well made game lots of fun,"[lots, fun]",[lots_fun],[lots_fun]


In [6]:
final_df.dtypes

timestamp_created     int64
review               object
clean_reviews        object
2gram_reviews        object
3gram_reviews        object
dtype: object

In [31]:
# Text Cleaning Redux

nlp = spacy.load("en_core_web_sm")
parser = bbcode.Parser()

#expand contractions
with open('./en_contractions/contra_dict.txt') as contra_dict:
    cList = json.load(contra_dict)

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())

#convert numbers to words

num_dict = {'0':'zero',
            '1':'one',
            '2':'two',
            '3':'three',
            #'i':'one',      skipped. Using Roman numeral 'i' will conflict with the pronoun "I", which is not a number
            'ii':'two',
            'iii':'three'   
            }

def num2word(d):
    
    if (len(d) == 1 and d in '0123')  or (d in ['ii','iii']):
        word = num_dict[d]
    
    elif (len(str(d))==1 and str(d) in '0123'):
        word = num_dict(str(d))
    
    else:
        word = d
    
    return word

#define stopwords

en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i'])
en_stopwords = [w for w in en_stopwords if w not in ['one','two','three']]    #retain these for making n-grams, then remove afterwards

def remove_stopwords(doc):
    words = [num2word(w) for w in doc if w != '' and w not in en_stopwords]
    return words


#combine cleaning functions into one function
def parse_clean(text):
    parsed_text = parser.strip(text) #remove BBcode notations from text
    
    text = expandContractions(parsed_text) #expand contractions; return all text in lower case
    
    text = re.split(r'\W+',text) #separate words from punctuation (e.g. remove "'s" from "Cao Cao's")
    
    text = [num2word(w) for w in text] #convert single digits to words before word len check, or they will be lost
    
    #All word lengths should be >1 character and <= length of the longest word in the English language. It's common for people spam incoherent letters on the Internet.
    text = [word for word in text if word not in en_stopwords and len(word)>1 and len(word) <= len('pneumonoultramicroscopicsilicovolcanoconiosis')] 
    
    clean_text = [num2word(w) for w in text] #just in case any lone numbers appeared after cleaning
    
    return clean_text


def stop_clean(texts):
    texts = [parse_clean(doc) for doc in texts]
    texts = [remove_stopwords(doc) for doc in texts] #just in case, remove stopwords one more time
    
    return texts

#at this point we will make n-grams, then lemmatise using spacy since it can go by permitted postags

def spacy_lemma(bow,allowed_postags=['NOUN']): #can add any from https://spacy.io/api/annotation#pos-tagging but will be sticking to NOUN, VERB, ADJ
    
    lemma_doc = nlp(" ".join(bow)) 

    lemma_text = [token.text if '_' in token.text else token.lemma_ if token.pos_ in allowed_postags else '' for token in lemma_doc]
    
    return lemma_text

In [32]:
model2_df = final_df[['timestamp_created','review']]
model2_df.head()

Unnamed: 0,timestamp_created,review
0,1586657138,"Online review only...Simply put, it sucks ass...."
1,1586656937,It's a wonderful multiplayer and singleplayer ...
2,1586656864,People claim a lot in reviews that they cant r...
3,1586655683,"Best fucking story I have ever played, online'..."
4,1586655188,well made game lots of fun


In [33]:
model2_df['clean_reviews'] = stop_clean(model2_df['review'])
model2_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,timestamp_created,review,clean_reviews
0,1586657138,"Online review only...Simply put, it sucks ass....","[online, review, simply, sucks, ass, glitches,..."
1,1586656937,It's a wonderful multiplayer and singleplayer ...,"[wonderful, multiplayer, singleplayer, fix, aw..."
2,1586656864,People claim a lot in reviews that they cant r...,"[people, claim, reviews, run, works, fine, tim..."
3,1586655683,"Best fucking story I have ever played, online'...","[best, fucking, story, played, online, cool, w..."
4,1586655188,well made game lots of fun,"[lots, fun]"


In [34]:
#Now that the cleaner text is ready (only or mostly nouns), we create n-grams again

# Credit to https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/ for the n-grams code

# Build the bigram and trigram models
bigram = gensim.models.Phrases(list(model2_df['clean_reviews']), min_count=5, threshold=10) # feed a list of lists of words e.g. [['word1','word2'],['word3','word4'] to get bigrams]
trigram = gensim.models.Phrases(bigram[list(model2_df['clean_reviews'])], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [35]:
model2_df['3gram_reviews'] = make_trigrams(model2_df['clean_reviews'])
model2_df['3grams_nouns'] = model2_df['3gram_reviews'].map(lambda x: spacy_lemma(x))
model2_df['3grams_nouns_verbs'] = model2_df['3gram_reviews'].map(lambda x: spacy_lemma(x,allowed_postags=['NOUN','VERB']))
model2_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,timestamp_created,review,clean_reviews,3gram_reviews,3grams_nouns,3grams_nouns_verbs
0,1586657138,"Online review only...Simply put, it sucks ass....","[online, review, simply, sucks, ass, glitches,...","[online, review, simply, sucks, ass, glitches,...","[, review, , , ass, , minute, camp, , , bug, ,...","[, review, , suck, ass, glitche, minute, camp,..."
1,1586656937,It's a wonderful multiplayer and singleplayer ...,"[wonderful, multiplayer, singleplayer, fix, aw...","[wonderful, multiplayer, singleplayer, fix, aw...","[, , , , , connection_issues, , , , , friend]","[, , , , , connection_issues, recommend, , rec..."
2,1586656864,People claim a lot in reviews that they cant r...,"[people, claim, reviews, run, works, fine, tim...","[people, claim, reviews, run, works_fine, time...","[people, , review, , works_fine, time, , pc, ,...","[people, claim, review, run, works_fine, time,..."
3,1586655683,"Best fucking story I have ever played, online'...","[best, fucking, story, played, online, cool, w...","[best, fucking, story, played, online, cool, w...","[, , story, , , , , , ]","[, , story, play, , , , , ]"
4,1586655188,well made game lots of fun,"[lots, fun]",[lots_fun],[lots_fun],[lots_fun]


In [36]:
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three']) #after making n-grams, removing numbers should reduce noise

model2_df['3grams_nouns'] = model2_df['3grams_nouns'].map(lambda x: remove_stopwords(x)) #removes blanks as well
model2_df['3grams_nouns_verbs'] = model2_df['3grams_nouns_verbs'].map(lambda x: remove_stopwords(x)) 

In [37]:
model2_df.head()

Unnamed: 0,timestamp_created,review,clean_reviews,3gram_reviews,3grams_nouns,3grams_nouns_verbs
0,1586657138,"Online review only...Simply put, it sucks ass....","[online, review, simply, sucks, ass, glitches,...","[online, review, simply, sucks, ass, glitches,...","[review, ass, minute, camp, bug, money, infini...","[review, suck, ass, glitche, minute, camp, bug..."
1,1586656937,It's a wonderful multiplayer and singleplayer ...,"[wonderful, multiplayer, singleplayer, fix, aw...","[wonderful, multiplayer, singleplayer, fix, aw...","[connection_issues, friend]","[connection_issues, recommend, recommend, friend]"
2,1586656864,People claim a lot in reviews that they cant r...,"[people, claim, reviews, run, works, fine, tim...","[people, claim, reviews, run, works_fine, time...","[people, review, works_fine, time, pc, spec, t...","[people, claim, review, run, works_fine, time,..."
3,1586655683,"Best fucking story I have ever played, online'...","[best, fucking, story, played, online, cool, w...","[best, fucking, story, played, online, cool, w...",[story],"[story, play]"
4,1586655188,well made game lots of fun,"[lots, fun]",[lots_fun],[lots_fun],[lots_fun]


# LDA Model - 3grams - Nouns Only

In [38]:
#build dictionary and corpus from 3gram dataset, NOUNS only with filter_extremes()

documents = list(model2_df['3grams_nouns'])
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below = 5,no_above=0.5) #trying with default settings
corpus = [dictionary.doc2bow(word) for word in documents]

In [39]:
# LDA model parameters -- we will go with just 5 topics to keep the model more generalised
num_topics = 4
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [41]:
#NOTE: LDAMultiCore; set workers = n-1 (where n is your number of cores)

%time ldamodel1 = LdaMulticore(corpus, num_topics=num_topics, id2word = dictionary, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3)

# Check resulting topics.
topic_list = ldamodel1.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 1min 3s
pc issue time crash rockstar play problem hour graphic story setting steam fix bug performance
story character time _ graphic world thing mission open_world people red_dead_redemption_two experience horse gameplay player
time horse problem rockstar mission thing control player play pc people character experience weapon reason
bug single_player mission server hacker hour horse camp money fun time thing player multiplayer friend
mission way hour time world story graphic thing fun love movie story_line wild_west bit end


In [42]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel1.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda1 = CoherenceModel(model=ldamodel1, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score: ', coherence_lda1)


Perplexity:  -6.634830098836103

Coherence Score:  0.6153871904089758


In [43]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel1, corpus, dictionary)
vis

In [26]:
LDA_fin = vis
pyLDAvis.save_html(LDA_fin, './viz/lda_fin_all.html')

In [20]:
#save the model

newpath = './models/nouns_only/model1' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel1.save('./models/nouns_only/model1/model1.model')

#Usable model with good topics! Very similar to topics found in first run and arguably better!

### Preliminary Review

This model looks much more usable than previous ones.

Coherence score is higher as well. I'm starting to see some of the coherent topics I previously identified as well.

We can further refine the nouns-only model by removing more stopwords.

Changes to the approach:

1. Cleaning
    - Removed everything other than Nouns and n-grams
    - Used filter_extremes() with default settings (no_below = 5, no_above = 0.5)
   
2. Modelling
    - Decided to use 5 topics only, because I am only looking for 5 general topics (for potential use in a later model that can generalise across the series and potentially the whole strategy genre... or at least the Total War series)
    
    
Changes to Results:
1. 1.5x the Coherence score of previous models (~0.53 coherence! Previous models had ~ 0.38 coherence even with ~10 topics!)
2. 5 topics that seem very coherent

# Fine-tuning

Attempting to refine the results by removing more terms that may be generating noise

In [44]:
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three','thing','bit','total_war','time','10_10','love','fun','play','hour']) #after making bigrams, removing numbers should reduce noise
model2_df['3grams_nouns_v2'] = model2_df['3grams_nouns'].map(lambda x: remove_stopwords(x))

In [45]:
#build dictionary and corpus from 3gram dataset -- this time with filter_extremes

documents3 = list(model2_df['3grams_nouns_v2'])
dictionary3 = gensim.corpora.Dictionary(documents3)
dictionary3.filter_extremes(no_below = 5,no_above=0.5)
corpus3 = [dictionary3.doc2bow(word) for word in documents3]

In [46]:
# LDA model parameters.
num_topics = 4
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [47]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

%time ldamodel3 = LdaMulticore(corpus3, num_topics=num_topics, id2word = dictionary3, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = 14180)

# Check resulting topics.
topic_list = ldamodel3.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 50.3 s
story graphic rockstar character experience gameplay bug world open_world single_player story_mode red_dead_redemption_two mission pc year
pc crash issue problem setting steam rockstar performance fix run optimization error launch graphic rockstar_launcher
horse way people mission character story gun weapon world animal control man player action animation
mission player server bug rockstar money hacker friend camp problem single_player people session gold online_mode


In [48]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel3.log_perplexity(corpus3))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda3 = CoherenceModel(model=ldamodel3, texts=documents3, dictionary=dictionary3, coherence='c_v')
coherence_lda3 = coherence_model_lda3.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.634936558205696

Coherence Score:  0.6424500778278636


In [49]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel3, corpus3, dictionary3)
vis

In [50]:
#save visualisation of LDA model
LDA_fin = vis
pyLDAvis.save_html(LDA_fin, './viz/lda_fin_4.html')

In [51]:
#saving as model3 because I will call model2 from a saved file
newpath = './models/nouns_only/model3' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel3.save('./models/nouns_only/model3/model3.model')

# Randomness in LDA

Due to the randomness in LDA, topics identified are likely to differ each time it is run.

For consistent results, best to include random_state when training the model. (Will do this for future iterations)

The next cell will load a pre-trained model that was used for the remainder of the project.

It has the topics that are general and coherent, and which best fit the purpose of this project.

### This Model

Topics are reasonably coherent, but more suitable for another purpose.

1. Topic 1 - Game Features/Strategic Gameplay (Multiple topics)
2. Topic 2 - Series Authenticity
3. Topic 3 - Tactical Gameplay
4. Topic 4 - Characters
5. Topic 5 - Downloadable Content and Patches

In [28]:
#set up the saved model

lda_saved = LdaMulticore.load('./models/nouns_only/saved_model/saved.model')

documents_saved = list(model2_df['3grams_nouns_v2'])

dictionary_saved = gensim.corpora.Dictionary.load('./models/nouns_only/saved_model/saved.model.id2word')

#dictionary_saved.filter_extremes(no_below = 5,no_above=0.5) #default settings were used when training this model

corpus_saved = [dictionary_saved.doc2bow(doc) for doc in documents_saved]

In [29]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(lda_saved, corpus_saved, dictionary_saved)
vis

# Topics Identified

The nouns-only iterations have clear topic separation between all topics, and the saved one from a previous iteration has coherent topics.

1. **Content & Authenticity**</br>
    - (Topic 2) Game content and its authenticity to the Total War series
    - Contains comparisons to other titles in the Total War series (e.g. shogun_two --> Shogun 2, one of the most well-received installments in the series)
    - Important because target market is full of hardcore history/ROTK buffs 
    - Total War has also carved a nice niche for itself in the game industry - the only closest competitor is Ultimate General, created by a Total War modder </br> </br>

2. **Strategic Gameplay**</br>
    - (Topic 4) Gameplay on the strategic scale, e.g. movements on the campaign map, city management, diplomacy, negotiations
    - Inclusive of new features e.g. revamped Diplomacy, Romance/Records mode selection, Faction playstyles </br> </br>

3. **Tactical Gameplay**</br>
    - (Topic 1) Gameplay of individual battles, managing generals' equipment and skills, their retinues, individual units, managing individual cities, etc </br> </br>

4. **Characters**</br>
    - (Topic 3) Characters in the game and their behaviours.
    - Game is based on both Records and Romance of the Three Kingdoms, historical records (Chen Shou, ~300 AD) and a novel (Luo Guanzhong, 14th Century AD) respectively. 
    - Strong overlap with Topic 1.
    - Draws comparisons to Koei Tecmo's Dynasty Warriors series based on the same source material because of Romance Mode, where generals are lone units capable of incredible feats in battle. </br> </br>

5. **UI/UX, Performance, Stability**</br>
    - (Topic 5) Bugs, crashes, and fixes are terms relevant to this topic and they often come with games that work on massive scales like Total War.
    - General performance of the game (each player uses different PC specifications)
    - Bugs are always a concern and are virtually unavoidable for games, especially close to release.
    - Examples include bugs affecting random crashes in the middle of the game and [crashes involving Liu Bei's annexation ability when the game was first released](https://steamcommunity.com/app/779340/discussions/0/1642038749328500806/).

The corrections made from previous versions of the model seem to have had a tremendously positive effect on getting coherent topics out of this model.

In [52]:
# Prepare Model 3
# set up Model 3

lda_saved = LdaMulticore.load('./models/nouns_only/model3/model3.model')

documents_saved = list(model2_df['3grams_nouns_v2'])

dictionary_saved = gensim.corpora.Dictionary.load('./models/nouns_only/model3/model3.model.id2word')

#dictionary_saved.filter_extremes(no_below = 5,no_above=0.5) #default settings were used when training this model

corpus_saved = [dictionary_saved.doc2bow(doc) for doc in documents_saved]

In [54]:
#save visualisation of LDA model
LDA_fin = pyLDAvis.gensim.prepare(lda_saved, corpus_saved, dictionary_saved)
pyLDAvis.save_html(LDA_fin, './viz/lda_fin_4.html')

In [34]:
# Trying out nouns and verbs
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three','thing','bit','total_war','time','10_10','love','fun','play','hour']) 
model2_df['3grams_nouns_verbs_v2'] = model2_df['3grams_nouns_verbs'].map(lambda x: remove_stopwords(x))

In [35]:
#build dictionary and corpus from 3gram dataset -- this time with filter_extremes

documents4 = list(model2_df['3grams_nouns_verbs_v2'])
dictionary4 = gensim.corpora.Dictionary(documents4)
dictionary4.filter_extremes(no_below = 5,no_above=0.5)
corpus4 = [dictionary4.doc2bow(word) for word in documents4]

In [36]:
# LDA model parameters.
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [37]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

seed = np.random.randint(0,999999)
print("Seed:", seed,"\n")
%time ldamodel4 = LdaMulticore(corpus4, num_topics=num_topics, id2word = dictionary4, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = seed)

# Check resulting topics.
topic_list = ldamodel4.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Seed: 97892 

Wall time: 2min 26s
story graphic pc run feel character gameplay experience world look rockstar buy open_world setting need
mission story player single_player experience bug rockstar want server character online_mode hacker friend story_mode people
crash fix issue work problem run bug rockstar pc buy try start launch update steam
mission try horse simulator start rockstar action mean cutscene want man fail level story hat
horse way want people kill mission animal start save ride know shoot hunt gun die


In [38]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel4.log_perplexity(corpus4))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda4 = CoherenceModel(model=ldamodel4, texts=documents4, dictionary=dictionary4, coherence='c_v')
coherence_lda4 = coherence_model_lda4.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.821809257990424

Coherence Score:  0.6191002779108389


In [39]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel4, corpus4, dictionary4)
vis

In [40]:
# LDA model parameters.
num_topics = 7
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [41]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

seed = np.random.randint(0,999999)
print("Seed:", seed,"\n")
%time ldamodel4 = LdaMulticore(corpus4, num_topics=num_topics, id2word = dictionary4, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = seed)

# Check resulting topics.
topic_list = ldamodel4.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Seed: 588439 

Wall time: 3min 16s
story graphic crash buy pc gameplay feel experience recommend open_world review single_player bug mission year
fix rockstar work issue crash bug problem try pc start update server buy error steam
cowboy want rockstar suck launcher buy need steam fuck social_club wild_west pay rockstar_launcher expect simulator
run setting performance crash look issue pc graphic problem _fps optimization need set optimize stutter
story character world graphic red_dead_redemption_two experience arthur_morgan masterpiece rockstar story_mode end feel red_dead_redemption life want
mission horse player people want way feel kill animal try shoot hunt camp character story
way know man life come enjoy want feel rdr witcher_three day buy kill hunt release


In [42]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel4.log_perplexity(corpus4))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda4 = CoherenceModel(model=ldamodel4, texts=documents4, dictionary=dictionary4, coherence='c_v')
coherence_lda4 = coherence_model_lda4.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.8479766522796535

Coherence Score:  0.6191002779108389


In [43]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel4, corpus4, dictionary4)
vis

In [44]:
#saving this model that uses nouns and verbs, although the results are not as good as the saved model
newpath = './models/nouns_verbs/model4' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel4.save('./models/nouns_verbs/model4/model4.model')

In [55]:
model2_df.to_csv('./dataframes/model2_df.csv')

### References

https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/