In [1]:
import os
import re

import pandas as pd

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Load the processed reviews, reading only the two columns used below.
# Alternatives tried during experimentation: the "lemmatized" column of the
# same parquet file (instead of "review"), or the raw '../Dados/steam_reviews.csv'.
df = pd.read_parquet(
    "../Dados/tratados.parquet",
    columns=["review", "title"],
)

In [3]:
# Inspect the freshly loaded frame (bare last-line expression -> rich display).
df

Unnamed: 0,review,title
0,Out of all the reviews I wrote This one is pro...,Dead by Daylight
1,Disclaimer I survivor main. I play games for f...,Dead by Daylight
2,Out of all the reviews I wrote This one is pro...,Dead by Daylight
3,I have never been told to kill myself more tha...,Dead by Daylight
4,Any longtime Dead by Daylight player knows tha...,Dead by Daylight
...,...,...
398586,Funny,Human: Fall Flat
398587,great game although my its annoying when your ...,Human: Fall Flat
398588,Amazingly fun ),Human: Fall Flat
398589,I would rate this game average.The game physic...,Human: Fall Flat


In [4]:
# Restrict the frame to the 'review' and 'title' columns (all rows kept).
df = df.loc[:, ['review', 'title']]

In [5]:
# Re-display the frame after the column selection to confirm its contents.
df

Unnamed: 0,review,title
0,Out of all the reviews I wrote This one is pro...,Dead by Daylight
1,Disclaimer I survivor main. I play games for f...,Dead by Daylight
2,Out of all the reviews I wrote This one is pro...,Dead by Daylight
3,I have never been told to kill myself more tha...,Dead by Daylight
4,Any longtime Dead by Daylight player knows tha...,Dead by Daylight
...,...,...
398586,Funny,Human: Fall Flat
398587,great game although my its annoying when your ...,Human: Fall Flat
398588,Amazingly fun ),Human: Fall Flat
398589,I would rate this game average.The game physic...,Human: Fall Flat


In [6]:
# Build the BERTopic pipeline from explicitly configured sub-models so every
# stage (embedding -> reduction -> clustering -> tokenizing -> weighting ->
# representation) is visible and tunable in one place.

# UMAP is stochastic: without a fixed seed the reduced embeddings, and hence the
# discovered topics, change on every run. Pinning the seed makes the notebook
# reproducible under Restart & Run All (UMAP may run with less parallelism when
# a seed is set).
RANDOM_STATE = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_STATE,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a 'bertopic.representation' model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
)

In [7]:
# Keep only rows whose review has at least one character. Missing reviews get a
# NaN length from `.str.len()`, which compares False here, so they are dropped too.
mask = df['review'].str.len() >= 1
# Take an explicit copy: assigning a column on the result of a boolean `.loc`
# filter otherwise raises SettingWithCopyWarning whenever rows were removed.
df = df.loc[mask].copy()
# Normalise the column to plain Python strings for the embedding model.
df['review'] = df['review'].astype('str')

In [8]:
# Single-game subset: every review whose title is exactly 'Dead by Daylight',
# plus the same reviews as a plain list of documents.
is_dbd = df['title'].eq('Dead by Daylight')
df_dbd = df[is_dbd]
docs = df_dbd['review'].tolist()

In [9]:
# Discard games with 10 or fewer reviews so each per-game model has some data.
counts = df['title'].value_counts()
titulos_mais_de_10_reviews = counts.loc[counts.gt(10)].index.tolist()
df = df.loc[df['title'].isin(titulos_mais_de_10_reviews)]

In [10]:
# Fit one BERTopic model per game and save it as
# ../Dados/bertopics/modelo_<sanitized title>.
# NOTE: the same `topic_model` instance is refit on every iteration, so after
# the loop it (and `topics` / `probs`) only reflect the last game processed;
# the per-game results live in the saved files.
output_dir = "../Dados/bertopics"

# Create the target folder up front so a missing directory cannot make the
# save fail after an expensive fit.
os.makedirs(output_dir, exist_ok=True)

# Iterating the groupby yields every title (sorted) together with its rows in a
# single pass, instead of re-filtering the whole frame once per game.
# observed=True skips categories with no rows if `title` is categorical.
for titulo, jogo_atual in df.groupby("title", observed=True):
    print(titulo)

    # Derive the file-name suffix from the title.
    jogo = re.sub(r'[^\w\s]', '', titulo)      # strip punctuation
    jogo = re.sub(r'[^\x00-\x7F]+', '', jogo)  # strip non-ASCII characters
    jogo = jogo.lower().strip()
    print(jogo)

    topics, probs = topic_model.fit_transform(jogo_atual['review'].to_list())
    topic_model.save(output_dir + "/modelo_" + jogo)


ARK: Survival Evolved
ark survival evolved
ASTRONEER
astroneer
Dead by Daylight
dead by daylight


  self._set_arrayXarray(i, j, x)


Divinity: Original Sin 2 - Definitive Edition
divinity original sin 2  definitive edition
Don't Starve Together
dont starve together
Euro Truck Simulator 2
euro truck simulator 2
Factorio
factorio
Garry's Mod
garrys mod
Grand Theft Auto V
grand theft auto v


  self._set_arrayXarray(i, j, x)


Human: Fall Flat
human fall flat
Insurgency: Sandstorm
insurgency sandstorm
Left 4 Dead 2
left 4 dead 2
MONSTER HUNTER: WORLD
monster hunter world


  self._set_arrayXarray(i, j, x)


PLAYERUNKNOWN'S BATTLEGROUNDS
playerunknowns battlegrounds


  self._set_arrayXarray(i, j, x)


RESIDENT EVIL 2
resident evil 2
RimWorld
rimworld
Rocket League®
rocket league


  self._set_arrayXarray(i, j, x)


Rust
rust


  self._set_arrayXarray(i, j, x)


Sid Meier’s Civilization® VI
sid meiers civilization vi
Slay the Spire
slay the spire
Stardew Valley
stardew valley
Subnautica
subnautica
Terraria
terraria
The Elder Scrolls V: Skyrim Special Edition
the elder scrolls v skyrim special edition
