#### Referência: machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/
### LDA in Python – How to grid search best topic models?

In [2]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# dataframe mais eficiente e com multithread
import polars
import pyarrow

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
#from pprint import pprint

# Plotting tools
#import pyLDAvis
#import pyLDAvis.sklearn
#import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
%%time
#https://stackoverflow.com/questions/40115043/no-space-left-on-device-error-while-fitting-sklearn-model
import os
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

CPU times: user 18 µs, sys: 4 µs, total: 22 µs
Wall time: 27.9 µs


In [4]:
df = polars.read_csv('../Dados/steam_reviews_lemmatization.csv').lazy()
#df = df.filter(polars.col("title").is_in(["Terraria","Stardew Valley"]))
df = df.select(polars.col("lemmatized"))
df = df.rename({"lemmatized": "review"})

In [19]:
#df

In [5]:
import ast
#df.collect()
df = df.select(
        polars.col("review").apply(lambda x: " ".join(ast.literal_eval(x)))
)
#df.collect().head(5)

In [21]:
#df

In [22]:
%%time
''' 
def sent_to(x):
    preprocess = gensim.utils.simple_preprocess(str(x),deacc=True)
    return polars.Series(preprocess)


dataframe_words = df.select(polars.col("review").apply(lambda x: sent_to(x)))
'''
print()


CPU times: user 1.15 ms, sys: 386 µs, total: 1.54 ms
Wall time: 832 µs


In [23]:
#type(dataframe_words)

In [24]:
%%time
'''
DEBUG = 0
def lemmatization(text):
    """https://spacy.io/api/annotation"""
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
    doc = nlp(text.str.concat(" ")[0])
    tmp = " ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags])
    return tmp
    
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = dataframe_words.select(
        polars.col("review").apply(lambda x:lemmatization(x))
)#.collect()
'''
print()


CPU times: user 91 µs, sys: 31 µs, total: 122 µs
Wall time: 128 µs


In [6]:
df = df.collect().to_pandas()
df

Unnamed: 0,review
0,played german reich declare war belgium break ...
1,yes
2,good game although bit overpriced opinion woul...
3,review write one probably serious one write st...
4,disclaimer survivor main play game fun competi...
...,...
399080,flesh rot decay.steel immortal.for omnissiah.t...
399081,domini dominae believe deal jewel rare elusive...
399082,first like x com style game likely like game l...
399083,disclaimer admech player tabletop review go co...


In [7]:
vectorizer = CountVectorizer(analyzer='word',       
                             #min_df=0.1,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             max_df=1.0,
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(df["review"])

In [8]:
no_top_words = 10
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(" ".join([feature_names[i]
                          for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [11]:
#Daqui não passa, out of memory (temp memory, hd)

# Build LDA Model
lda_model = LatentDirichletAllocation(              # Number of topics
                                    learning_method='online', 
                                    #max_ter=500,
                                    random_state=0,
                                    n_jobs = -1,# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
names = vectorizer.get_feature_names_out()

display_topics(lda_model,names,10)

#print()  # Model attributes

PicklingError: Could not pickle the task to send it to the workers.

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df =25, max_features=5000, use_idf=True)
tfidf = tfidf_vectorizer.fit_transform(df['review'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

Unnamed: 0,00,000,01,02,03,04,05,06,08,10,...,youtuber,youtubers,yr,yup,zerg,zergs,zero,zombie,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
399081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
399082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
399083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [10]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=10, random_state=0, init='nndsvd').fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

game get play like time buy make bad even player
good game pretty really job graphic friend need far yes
receive product free cool gg like skyrim gud ok fortnite
best ever game one play survival world battle royale far
nice game graphic really shot funny gameplay cool friend verry
great game friend recommend amazing graphic overall survival community need
10 would 11 get kill naked ign car rock simulator
fun friend play really game lot much pretty super friends
awesome game recommend amazing cool ing graphic addictive must survival
love game hate much amazing amaze absolutely really relationship cool


In [None]:
nmf_topic_values = nmf.transform(tfidf)
df['nmf_topics'] = nmf_topic_values.argmax(axis=1)
lda_topic_values = lda_model.transform(data_vectorized)
df['lda_topics'] = lda_topic_values.argmax(axis=1)

'''
lda_remap = {0: 'Good Design Processes', 1: 'Great Work Environment', 
             2: 'Flexible Work Hours', 3: 'Skill Building', 
             4: 'Difficult but Enjoyable Work', 5: 'Great Company/Job', 
             6: 'Care about Employees', 7: 'Great Contractor Pay', 
             8: 'Customer Service', 9: 'Unknown1'}
df['lda_topics'] = df['lda_topics'].map(lda_remap)
nmf_remap = {0: 'Fun Work Culture', 1: 'Design Process', 2: 'Enjoyable Job', 
             3: 'Difficult but Enjoyable Work', 4: 'Great Experience', 
             5: 'Perks', 6: 'Learning Opportunities', 7: 'Great Company/Job', 
             8: 'Contractor Employee Experience', 9: 'Management'}
df['nmf_topics'] = df['nmf_topics'].map(nmf_remap)
'''
df