#### LIBRARIES USED

In [6]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer


import gensim
from gensim.models import Word2Vec

import re

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
import spacy

import random

#### 1. DATA SELECTION

In [2]:
original_df = pd.read_csv('dataset.csv')

In [3]:
chosen_games = [
    #RPG
    "Dota 2",
    "The Elder Scrolls V: Skyrim",
    "The Witcher 3: Wild Hunt",

    #FPS
    "Call of Duty: Modern Warfare 3",
    "Counter-Strike",
    "DOOM",

    #Sports
    "NBA 2K16",
    "Rocket League",
    "Football Manager 2016"
    ]

df_filtered_games = original_df[original_df['app_name'].isin(chosen_games)]

In [4]:
df_filtered_games.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


#### 2. DATA PRE-PROCESSING

In [5]:
df_filtered_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 204389 entries, 0 to 6257360
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   app_id        204389 non-null  int64 
 1   app_name      204389 non-null  object
 2   review_text   204127 non-null  object
 3   review_score  204389 non-null  int64 
 4   review_votes  204389 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 9.4+ MB


In [7]:
df_filtered_games = df_filtered_games.dropna(subset=['review_text'])

df_filtered_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 204127 entries, 0 to 6257360
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   app_id        204127 non-null  int64 
 1   app_name      204127 non-null  object
 2   review_text   204127 non-null  object
 3   review_score  204127 non-null  int64 
 4   review_votes  204127 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 9.3+ MB


In [None]:
# Download stopwords and punkt if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    
    tokens = word_tokenize(text)  # Tokenize text
    
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize each word

    return ' '.join(tokens)

# Apply the function to the review_text column
df_filtered_games['cleaned_review_text'] = df_filtered_games['review_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

# 13 minutes to run


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevanteo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kevanteo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Checking preprocessed dataset

In [21]:
df_filtered_games[['review_text', 'cleaned_review_text']].head()

Unnamed: 0,review_text,cleaned_review_text
0,Ruined my life.,ruined life
1,This will be more of a ''my experience with th...,experience game type review saying thing like ...
2,This game saved my virginity.,game saved virginity
3,• Do you like original games? • Do you like ga...,like original game like game dont lag like gam...
4,"Easy to learn, hard to master.",easy learn hard master


In [71]:
df_filtered_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 204127 entries, 0 to 6257360
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   app_id               204127 non-null  int64 
 1   app_name             204127 non-null  object
 2   review_text          204127 non-null  object
 3   review_score         204127 non-null  int64 
 4   review_votes         204127 non-null  int64 
 5   cleaned_review_text  204127 non-null  object
dtypes: int64(3), object(3)
memory usage: 10.9+ MB


In [None]:
# Check which rows have missing values (NaN) in the 'cleaned_review_text' column
missing_rows = df_filtered_games[df_filtered_games['cleaned_review_text'].isnull()]

# Display the rows with missing values
print(missing_rows)

In [76]:
df_filtered_games['cleaned_review_text'].replace('',np.nan,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered_games['cleaned_review_text'].replace('',np.nan,inplace=True)


In [77]:
df_filtered_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 204127 entries, 0 to 6257360
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   app_id               204127 non-null  int64 
 1   app_name             204127 non-null  object
 2   review_text          204127 non-null  object
 3   review_score         204127 non-null  int64 
 4   review_votes         204127 non-null  int64 
 5   cleaned_review_text  200242 non-null  object
dtypes: int64(3), object(3)
memory usage: 10.9+ MB


In [78]:
df_filtered_games = df_filtered_games.dropna(subset=['cleaned_review_text'])

In [80]:
df_filtered_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200242 entries, 0 to 6257360
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   app_id               200242 non-null  int64 
 1   app_name             200242 non-null  object
 2   review_text          200242 non-null  object
 3   review_score         200242 non-null  int64 
 4   review_votes         200242 non-null  int64 
 5   cleaned_review_text  200242 non-null  object
dtypes: int64(3), object(3)
memory usage: 10.7+ MB


In [81]:
df_filtered_games.to_csv('pre_processed_reviews.csv',index=False)

##### 3. SENTIMENT ANALYSIS

In [82]:
df = pd.read_csv('pre_processed_reviews.csv')

df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,cleaned_review_text
0,10,Counter-Strike,Ruined my life.,1,0,ruined life
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,experience game type review saying thing like ...
2,10,Counter-Strike,This game saved my virginity.,1,0,game saved virginity
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,like original game like game dont lag like gam...
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,easy learn hard master


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200242 entries, 0 to 200241
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   app_id               200242 non-null  int64 
 1   app_name             200242 non-null  object
 2   review_text          200242 non-null  object
 3   review_score         200242 non-null  int64 
 4   review_votes         200242 non-null  int64 
 5   cleaned_review_text  200242 non-null  object
dtypes: int64(3), object(3)
memory usage: 9.2+ MB


In [84]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

#Sentiment Analysis
sia = SentimentIntensityAnalyzer()
df['sentiment'] = df['review_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# 1 minute 30 seconds to run

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kevanteo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [85]:
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,cleaned_review_text,sentiment
0,10,Counter-Strike,Ruined my life.,1,0,ruined life,-0.4767
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,experience game type review saying thing like ...,0.9954
2,10,Counter-Strike,This game saved my virginity.,1,0,game saved virginity,0.4215
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,like original game like game dont lag like gam...,0.9098
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,easy learn hard master,0.3612


##### Checking for Contradicting Reviews
- These will require human intervention/expertise

In [86]:
# Define a function to detect contradicting reviews
def detect_contradicting_reviews(row):
    # Check for rating-text mismatch
    if row['review_score'] == 1 and row['sentiment'] < 0:
        return 1
    else:
        return 0

# Apply the function to the DataFrame
df['contradicting'] = df.apply(detect_contradicting_reviews, axis=1)

df[['cleaned_review_text', 'review_score', 'sentiment', 'contradicting']].head()

Unnamed: 0,cleaned_review_text,review_score,sentiment,contradicting
0,ruined life,1,-0.4767,1
1,experience game type review saying thing like ...,1,0.9954,0
2,game saved virginity,1,0.4215,0
3,like original game like game dont lag like gam...,1,0.9098,0
4,easy learn hard master,1,0.3612,0


In [87]:
# Step 1: Create a new DataFrame with contradicting reviews
contradicting_review = df[df['contradicting'] == 1]

# Step 2: Drop the contradicting reviews from the original DataFrame
df = df[df['contradicting'] == 0]

In [88]:
contradicting_review.to_csv('contradicting_reviews.csv',index=False)

In [89]:
print(df['contradicting'].value_counts())

print(df['review_score'].value_counts())

contradicting
0    178459
Name: count, dtype: int64
review_score
 1    157578
-1     20881
Name: count, dtype: int64


In [90]:
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,cleaned_review_text,sentiment,contradicting
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,experience game type review saying thing like ...,0.9954,0
2,10,Counter-Strike,This game saved my virginity.,1,0,game saved virginity,0.4215,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,like original game like game dont lag like gam...,0.9098,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,easy learn hard master,0.3612,0
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,r8 revolver 1010 play,0.0516,0


In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178459 entries, 1 to 200241
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   app_id               178459 non-null  int64  
 1   app_name             178459 non-null  object 
 2   review_text          178459 non-null  object 
 3   review_score         178459 non-null  int64  
 4   review_votes         178459 non-null  int64  
 5   cleaned_review_text  178459 non-null  object 
 6   sentiment            178459 non-null  float64
 7   contradicting        178459 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 12.3+ MB


#### TF-IDF Vectorising

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import scipy.sparse

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned_review_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_review_text'].dropna())

# Convert the TF-IDF matrix to a DataFrame (using sparse matrix format)
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df.head()

Unnamed: 0,00,000,0000,00000,000000,0000000,00000000,0000000000,00000000000,0000000000000,...,zynga,zyzz,zz,zzz,zzzz,zzzzquiet,zzzzz,zzzzzz,zzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
nonzero_count = tfidf_matrix.nnz
total_elements = tfidf_matrix.shape[0] * tfidf_matrix.shape[1]
print("Non-zero entries:", nonzero_count)
print("Total elements:", total_elements)

Non-zero entries: 3262885
Total elements: 15296613185


In [94]:
# Print the nonzero tokens for the first review
nonzero_tokens = tfidf_df.iloc[0][tfidf_df.iloc[0] != 0]
print(nonzero_tokens)

2002        0.089796
2008        0.085228
account     0.063544
advanced    0.069229
ago         0.058626
              ...   
way         0.117726
week        0.057115
wish        0.053154
wouldnt     0.117854
year         0.03931
Name: 0, Length: 136, dtype: Sparse[float64, 0]


In [95]:
nonzero_per_row = tfidf_matrix.getnnz(axis=1)
print("Nonzero counts for each row:", nonzero_per_row)

Nonzero counts for each row: [136   3  13 ...   3 123  23]


In [None]:
# df = df.drop(['app_id','review_votes','app_name','sentiment','contradicting','review_text'],axis=1)

In [None]:
# IMPORTANT:
# The tfidf_matrix should have been computed on df['cleaned_review_text'].dropna()
# so its rows correspond to df_clean.
# If that’s not the case, re-run the vectorization on df_clean, e.g.:
# tfidf_matrix = tfidf_vectorizer.fit_transform(df_clean['cleaned_review_text'])

# Define features and target
X = tfidf_matrix          # TF-IDF features (sparse matrix)
y = df['review_score']  # Target variable: 1 (positive/recommend) or -1 (negative/not recommend)

# --- Train-Test Split ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Oversampling using SMOTE ---
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# --- Model Training ---
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression classifier
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train_resampled, y_train_resampled)

# --- Model Evaluation ---
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
y_pred = lr.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Check the shape of tfidf_matrix
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

# Check the number of rows in df_clean
print("Number of rows in df_clean:", df_clean.shape[0])

# Verify if they match
if tfidf_matrix.shape[0] == df_clean.shape[0]:
    print("tfidf_matrix was computed on df_clean['cleaned_review_text']")
else:
    print("tfidf_matrix was NOT computed on df_clean['cleaned_review_text']")

In [None]:
from sklearn.svm import SVC

# Initialize and train the SVM classifier
svm = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_svm = svm.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

In [22]:
import joblib

# Export the SVM model to a file
joblib.dump(svm, 'svm_model.pkl')

['svm_model.pkl']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting classifier
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_gb = gb.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))

In [15]:
import joblib

# Export the Random Forest model to a file
joblib.dump(rf, 'random_forest_model.pkl')

# Export the Gradient Boosting model to a file
joblib.dump(gb, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

In [7]:
df_clean.head()

Unnamed: 0,app_id,app_name,review_score,tokens,sentiment,cleaned_review_text,sarcasm
1,10,Counter-Strike,1,"['experience', 'game', 'type', 'review', 'sayi...",0.9961,experience game type review saying things like...,0
2,10,Counter-Strike,1,"['game', 'saved', 'virginity']",0.4215,game saved virginity,0
3,10,Counter-Strike,1,"['like', 'original', 'games', 'like', 'games',...",0.8817,like original games like games dont lag like g...,0
4,10,Counter-Strike,1,"['easy', 'learn', 'hard', 'master']",0.3612,easy learn hard master,0
5,10,Counter-Strike,1,"['r', 'revolver', 'play']",0.0516,r revolver play,0


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["cleaned_review_text"], padding="max_length", truncation=True, max_length=128)

df_clean['review_score'] = df_clean['review_score'].apply(lambda x: 1 if x == 1 else 0)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is available

True


In [None]:
# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df_clean)
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["tokens"])
dataset = dataset.rename_column("review_score", "labels")
dataset.set_format("torch")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True if torch.cuda.is_available() else False
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train model
trainer.train()

eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Map: 100%|██████████| 176357/176357 [01:09<00:00, 2543.02 examples/s]
  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.368,0.369587
2,0.3064,0.32551
3,0.2419,0.266819


Evaluation results: {'eval_loss': 0.2668187618255615, 'eval_runtime': 58.7291, 'eval_samples_per_second': 600.589, 'eval_steps_per_second': 75.074, 'epoch': 3.0}


ValueError: too many values to unpack (expected 2)

: 

In [2]:
# Save model and tokenizer
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")
print("Model saved successfully.")

NameError: name 'model' is not defined

In [1]:
import torch
import accelerate
import transformers
print(torch.__version__)
print(accelerate.__version__)
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


2.4.0+cpu
0.34.2
4.49.0


In [None]:
# import string
# from nltk.corpus import stopwords, wordnet
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# import nltk

# # Download required resources
# nltk.download('stopwords')
# nltk.download('punkt')

# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))  # Load stopwords once

# # Define a function to preprocess text
# def preprocess_text(text):
#     text = text.lower()  # Convert to lowercase
#     text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
#     tokens = word_tokenize(text)  # Tokenize text
#     tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
#     # Get POS tags and lemmatize accordingly
#     pos_tags = pos_tag(tokens)
#     tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

#     return ' '.join(tokens)

# # Apply the function to the review_text column
# df['cleaned_review_text'] = df['review_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

# print(df[['review_text', 'cleaned_review_text']].head())
