In [1]:
import pandas as pd
# Read the Steam review dump into a DataFrame
df = pd.read_csv(filepath_or_buffer="steam_reviews.csv")

# Preview the first five rows to check the columns that were loaded
print(df.head(n=5))

   app_id        app_name                                        review_text  \
0      10  Counter-Strike                                    Ruined my life.   
1      10  Counter-Strike  This will be more of a ''my experience with th...   
2      10  Counter-Strike                      This game saved my virginity.   
3      10  Counter-Strike  • Do you like original games? • Do you like ga...   
4      10  Counter-Strike           Easy to learn, hard to master.             

   review_score  review_votes  
0             1             0  
1             1             1  
2             1             0  
3             1             0  
4             1             1  


In [None]:

# Titles to analyse, grouped by genre. The filter below is an exact-match
# lookup, so every name must be spelled exactly as in the `app_name` column.
games = [
    #RPG
    "Dota 2",
    "The Elder Scrolls V: Skyrim",
    "The Witcher 3: Wild Hunt",

    #FPS
    "Call of Duty: Modern Warfare 3",
    "Counter-Strike",
    "DOOM",

    #Sports
    "NBA 2K16",
    "Rocket League",
    "Football Manager 2016"
    ]

# isin() silently ignores names that do not occur in the data, so report any
# requested title that has no reviews instead of quietly analysing fewer games.
missing_games = sorted(set(games) - set(df['app_name'].dropna().unique()))
if missing_games:
    print(f"Warning: no reviews found for: {missing_games}")

# .copy() gives df_filtered its own data, so columns can be added to it later
# without pandas raising SettingWithCopyWarning.
df_filtered = df[df['app_name'].isin(games)].copy()

print(df_filtered)

In [None]:
!pip install nltk gensim pyLDAvis

In [None]:
# lda model to get the frequent words per topic
import re

import nltk
import pandas as pd
import gensim
import pyLDAvis
try:
    # pyLDAvis >= 3.3 ships its gensim helpers as `pyLDAvis.gensim_models`
    import pyLDAvis.gensim_models
except ImportError:
    # Older pyLDAvis releases still use the original module name
    import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# Resources needed by nltk.corpus.stopwords and nltk.word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Custom stopword list based on frequency graph.
# Starts from NLTK's English stop words and adds review-specific filler words;
# each extra word is listed once (the earlier list repeated "im" and "lol").
stop_list = nltk.corpus.stopwords.words('english')
stop_list += [
    "game", "play", "like", "good", "best", "one", "great", "really", "get", "time",
    "ever", "played", "playing", "would", "even", "much", "hours", "love", "still",
    "buy", "amazing", "in", "u", "it", "xd", "lol", "ive", "im", "could", "also", "many",
    "cant", "ing"
]

# Function to preprocess and tokenize reviews
def preprocess_reviews(df, text_column='review_text', stop_words=None):
    """Tokenize review text and strip noise tokens.

    Rows whose ``text_column`` is missing are dropped. Each remaining review is
    tokenized with ``nltk.word_tokenize``, lower-cased, reduced to tokens made
    up only of the letters a-z, and filtered against the stop words.

    Args:
        df: DataFrame holding the reviews.
        text_column: Name of the column with the raw review text.
        stop_words: Optional iterable of words to remove. When omitted, the
            notebook-level ``stop_list`` is used.

    Returns:
        A list with one list of cleaned tokens per non-missing review, in the
        row order of ``df``. A review may yield an empty list.
    """
    # A set gives constant-time membership tests; scanning the stop-word list
    # for every single token is needlessly slow on a large corpus.
    stop_set = set(stop_list if stop_words is None else stop_words)
    is_lowercase_word = re.compile('[a-z]+').fullmatch

    # Drop missing reviews, then coerce whatever is left to text
    texts = df[text_column].dropna().astype(str)

    # Single pass per review: tokenize -> lower-case -> keep alphabetic
    # non-stop-words. This avoids holding several full copies of the corpus.
    return [
        [
            token
            for token in (raw.lower() for raw in nltk.word_tokenize(text))
            if is_lowercase_word(token) and token not in stop_set
        ]
        for text in texts
    ]

# Convert DataFrame review column into processed documents
docs = preprocess_reviews(df_filtered, text_column='review_text')

# Create Dictionary (token <-> integer id mapping over all documents)
dictionary = corpora.Dictionary(docs)

# Convert documents into Bag of Words representation
corpus_bow = [dictionary.doc2bow(doc) for doc in docs]

# Train LDA Model
num_topics = 10  # Adjust as needed
lda_model = LdaModel(corpus=corpus_bow, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)

# Print topics. num_topics is passed explicitly so that every topic is listed
# even if num_topics is raised above print_topics' default of 20.
for idx, topic in lda_model.print_topics(num_topics=num_topics):
    print(f"Topic {idx}: {topic}")

# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score_LDA = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score_LDA}")

# Visualize LDA Topics
# pyLDAvis 3.3 renamed its gensim helper module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; use whichever of the two has been imported.
gensimvis = pyLDAvis.gensim_models if hasattr(pyLDAvis, "gensim_models") else pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus_bow, dictionary)
vis

In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Reviews without any text cannot be vectorised, so keep only rows that have one
df_filtered = df_filtered[df_filtered['review_text'].notna()]

# TF-IDF matrix of the raw review text (scikit-learn's English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=df_filtered['review_text'])

# Factorise the TF-IDF matrix with NMF using 10 components:
# W is the transformed data returned by the fit, H the fitted components
nmf_model = NMF(n_components=10, random_state=42)
W, H = nmf_model.fit_transform(X), nmf_model.components_

# Extract top words for each topic from NMF
def get_top_words_for_nmf(model, n_words=10, feature_names=None):
    """Return the highest-weighted words of every topic in a fitted model.

    Args:
        model: Fitted decomposition exposing ``components_``, iterated as one
            array of per-term weights per topic.
        n_words: Number of words to keep per topic.
        feature_names: Vocabulary indexed like the entries of each topic row.
            When omitted, it is read from the notebook-level ``vectorizer``;
            pass it explicitly to avoid depending on whichever vectorizer
            happens to be bound when the function is called.

    Returns:
        One list per topic, with its words ordered from highest to lowest weight.
    """
    if feature_names is None:
        feature_names = vectorizer.get_feature_names_out()

    top_words = []
    for topic in model.components_:
        # argsort is ascending, so reverse it and keep the first n_words indices
        top_idx = topic.argsort()[::-1][:n_words]
        top_words.append([feature_names[i] for i in top_idx])
    return top_words

# Get top words for each NMF topic
top_words_nmf = get_top_words_for_nmf(nmf_model)

# Compute Coherence Score for NMF
# Coherence is estimated from `docs` / `dictionary`, which were built with NLTK
# tokenisation and the custom stop list, whereas the NMF vocabulary comes from
# TfidfVectorizer. Words missing from the dictionary cannot contribute to the
# score, so they are removed and reported here rather than left for
# CoherenceModel to deal with.
topics_for_coherence_nmf = [
    [word for word in words if word in dictionary.token2id]
    for words in top_words_nmf
]
n_unscored_nmf = sum(map(len, top_words_nmf)) - sum(map(len, topics_for_coherence_nmf))
if n_unscored_nmf:
    print(f"NMF: {n_unscored_nmf} top words are not in the gensim dictionary and are excluded from the coherence score")

# Coherence compares the words of a topic with each other, so a topic needs at
# least two scorable words to take part.
topics_for_coherence_nmf = [words for words in topics_for_coherence_nmf if len(words) > 1]

coherence_model_nmf = CoherenceModel(topics=topics_for_coherence_nmf, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score_nmf = coherence_model_nmf.get_coherence()
print(f"Coherence Score for NMF: {coherence_score_nmf}")

# Print topics for NMF (reuses the top-10 word lists computed above instead of
# rebuilding the feature-name array for every single word)
for topic_idx, top_words in enumerate(top_words_nmf):
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# TF-IDF matrix of the raw review text (scikit-learn's English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=df_filtered['review_text'])

# LSA: truncated SVD of the TF-IDF matrix, keeping 10 components
lsa_model = TruncatedSVD(random_state=42, n_components=10)
lsa_topics = lsa_model.fit_transform(X)

# Extract top words for each topic from LSA
def get_top_words_for_lsa(model, n_words=10, feature_names=None):
    """Return the highest-weighted words of every topic in a fitted LSA model.

    Words are ranked by their signed weight, so strongly negative loadings are
    ranked last rather than by magnitude.

    Args:
        model: Fitted decomposition exposing ``components_``, iterated as one
            array of per-term weights per topic.
        n_words: Number of words to keep per topic.
        feature_names: Vocabulary indexed like the entries of each topic row.
            When omitted, it is read from the notebook-level ``vectorizer``;
            pass it explicitly to avoid depending on whichever vectorizer
            happens to be bound when the function is called.

    Returns:
        One list per topic, with its words ordered from highest to lowest weight.
    """
    if feature_names is None:
        feature_names = vectorizer.get_feature_names_out()

    top_words = []
    for topic in model.components_:
        # argsort is ascending, so reverse it and keep the first n_words indices
        top_idx = topic.argsort()[::-1][:n_words]
        top_words.append([feature_names[i] for i in top_idx])
    return top_words

# Get top words for each LSA topic
top_words_lsa = get_top_words_for_lsa(lsa_model)

# Compute Coherence Score for LSA
# Coherence is estimated from `docs` / `dictionary`, which were built with NLTK
# tokenisation and the custom stop list, whereas the LSA vocabulary comes from
# TfidfVectorizer. Words missing from the dictionary cannot contribute to the
# score, so they are removed and reported here rather than left for
# CoherenceModel to deal with.
topics_for_coherence_lsa = [
    [word for word in words if word in dictionary.token2id]
    for words in top_words_lsa
]
n_unscored_lsa = sum(map(len, top_words_lsa)) - sum(map(len, topics_for_coherence_lsa))
if n_unscored_lsa:
    print(f"LSA: {n_unscored_lsa} top words are not in the gensim dictionary and are excluded from the coherence score")

# Coherence compares the words of a topic with each other, so a topic needs at
# least two scorable words to take part.
topics_for_coherence_lsa = [words for words in topics_for_coherence_lsa if len(words) > 1]

coherence_model_lsa = CoherenceModel(topics=topics_for_coherence_lsa, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score_lsa = coherence_model_lsa.get_coherence()
print(f"Coherence Score for LSA: {coherence_score_lsa}")

# Print topics for LSA (reuses the top-10 word lists computed above instead of
# rebuilding the feature-name array for every single word)
for topic_idx, top_words in enumerate(top_words_lsa):
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

In [None]:
# Install necessary libraries if not already installed
# !pip install transformers
# !pip install sentence-transformers
# !pip install scikit-learn

import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer

# Fetch the pretrained 'bert-base-uncased' model and the tokenizer that goes with it
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# A Sentence Transformer model is an alternative that gives better document embeddings:
# model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to convert text to embeddings using BERT
def get_bert_embeddings(texts, model, tokenizer):
    """Embed every text as the mean of its last-layer token vectors.

    Each text is tokenized on its own (truncated to at most 512 tokens), run
    through ``model`` without gradient tracking, and its ``last_hidden_state``
    is averaged over the token dimension.

    Args:
        texts: Iterable of texts to embed.
        model: Callable model whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer returning keyword inputs for ``model``.

    Returns:
        A NumPy array built from the per-text vectors, in input order.
    """
    def embed_one(text):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only, so skip autograd bookkeeping
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            # Mean pooling over the token dimension
            pooled = hidden_states.mean(dim=1)
            return pooled.squeeze().cpu().numpy()

    return np.array([embed_one(text) for text in texts])

# Use the reviews from your DataFrame.
# The non-missing reviews are kept as a Series so that the cluster labels can
# later be matched back to the exact rows they were computed from.
reviews = df_filtered['review_text'].dropna().astype(str)
texts = reviews.tolist()

# Generate embeddings for all the reviews using BERT
embeddings = get_bert_embeddings(texts, model, tokenizer)

# Optional: Reduce dimensionality for easier visualization (e.g., PCA for 2D visualization)
# Seeded like the other models in this notebook so the plot is reproducible.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# Apply KMeans clustering to the BERT embeddings
num_topics = 10  # Adjust the number of topics as necessary
kmeans = KMeans(n_clusters=num_topics, random_state=42)
kmeans.fit(embeddings)

# Get the topic/cluster assignments for each document
topic_assignments = kmeans.labels_

# Optionally, calculate the silhouette score for the clustering quality
silhouette_avg = silhouette_score(embeddings, topic_assignments)
print(f'Silhouette Score: {silhouette_avg}')

# Visualize the clusters (Optional: PCA reduction to 2D for visualization)
plt.figure(figsize=(8, 6))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topic_assignments, cmap='viridis')
plt.colorbar()
plt.title("BERT-based Clusters (PCA reduced to 2D)")
plt.show()

# BERT embeddings do not come with explicit topic words, so the clusters are
# only visualised here. Top words per cluster could be derived separately,
# e.g. from the most frequent terms of the reviews in each cluster.

# To print out the topic assignments for each document:
# The labels are aligned on the review index, so a row without review text gets
# a missing topic instead of causing a length mismatch, and assign() returns a
# new frame rather than writing into a slice of the original data.
df_filtered = df_filtered.assign(topic=pd.Series(topic_assignments, index=reviews.index))
print(df_filtered[['review_text', 'topic']].head())

# Optional: If you want to use SentenceTransformers to generate better embeddings
# model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = model.encode(texts)


In [None]:
from sklearn.decomposition import TruncatedSVD

# TF-IDF representation of the reviews (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=df_filtered['review_text'])

# LSA: truncated SVD of the TF-IDF matrix, keeping 10 components
lsa_model = TruncatedSVD(random_state=42, n_components=10)
lsa_topic_matrix = lsa_model.fit_transform(X)

# List the ten highest-weighted terms of every component
terms = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lsa_model.components_):
    print(f"Topic {topic_idx}:")
    print([terms[i] for i in topic.argsort()[::-1][:10]])


In [None]:
# visualise coherence score based on number of topics
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel

# Function to compute coherence score for different topic numbers
def compute_coherence_values(dictionary, corpus, texts, start=2, limit=15, step=1,
                             passes=10, random_state=42, coherence='c_v'):
    """Train one LDA model per candidate topic count and score its coherence.

    Args:
        dictionary: gensim dictionary passed to both the LDA model and the
            coherence model.
        corpus: Corpus the LDA models are trained on.
        texts: Tokenized documents passed to the coherence model.
        start: First number of topics to try.
        limit: Upper bound on the number of topics (exclusive).
        step: Increment between successive topic counts.
        passes: Training passes for every LDA model.
        random_state: Seed used for every LDA model.
        coherence: Name of the coherence measure to compute.

    Returns:
        A ``(topic_range, coherence_values)`` tuple: the ``range`` of topic
        counts that were tried and the list of matching coherence scores.
    """
    topic_range = range(start, limit, step)
    coherence_values = []

    for num_topics in topic_range:
        # Same seed and number of passes for every candidate, so the scores
        # differ only because of the number of topics.
        lda_model = gensim.models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            random_state=random_state,
        )
        scorer = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence=coherence)
        coherence_values.append(scorer.get_coherence())

    return topic_range, coherence_values

# Score LDA models with 2 to 14 topics
topic_range, coherence_values = compute_coherence_values(
    dictionary, corpus_bow, docs, start=2, limit=15, step=1
)

# Line chart of the coherence score for each number of topics
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(topic_range, coherence_values, marker='o', linestyle='-')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score")
ax.set_title("Coherence Score vs. Number of Topics")
ax.grid()
plt.show()