#### LIBRARIES USED

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models import Word2Vec

import re

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
import spacy

import random

  from pandas.core.computation.check import NUMEXPR_INSTALLED


#### 1. DATA SELECTION

In [2]:
# Load the full reviews dataset from 'dataset.csv' (path is relative to the working directory)
original_df = pd.read_csv('dataset.csv')

In [3]:
# Titles kept for the analysis, three per genre bucket
games_by_genre = {
    "RPG": ["Dota 2", "The Elder Scrolls V: Skyrim", "The Witcher 3: Wild Hunt"],
    "FPS": ["Call of Duty: Modern Warfare 3", "Counter-Strike", "DOOM"],
    "Sports": ["NBA 2K16", "Rocket League", "Football Manager 2016"],
}

# Flatten the buckets into one list, preserving the order above
chosen_games = [title for titles in games_by_genre.values() for title in titles]

# Keep only the reviews whose app_name is one of the chosen titles
is_chosen_game = original_df['app_name'].isin(chosen_games)
df_filtered_games = original_df[is_chosen_game]

In [4]:
df_filtered_games.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [5]:
# Write the filtered reviews to 'filtered_dataset.csv' without the DataFrame index
df_filtered_games.to_csv('filtered_dataset.csv',index=False)

#### 2. DATA PRE-PROCESSING

In [None]:
df_filtered_games.info()

In [None]:
# Drop reviews that have no text. The explicit copy makes the result an
# independent frame, so the column assignment in the pre-processing cell does
# not trigger pandas' SettingWithCopyWarning.
df_filtered_games = df_filtered_games.dropna(subset=['review_text']).copy()

df_filtered_games.info()

In [None]:
# Download every NLTK resource the pre-processing below relies on
# (each call is a no-op when the resource is already installed)
nltk.download('stopwords')   # stopwords.words('english')
nltk.download('punkt')       # word_tokenize models (older NLTK releases)
nltk.download('punkt_tab')   # word_tokenize models looked up by newer NLTK releases
nltk.download('wordnet')     # corpus used by WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
# Load the English stopword list once. The original re-read it for every token,
# which dominated the run time; a frozenset also gives O(1) membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip every character that is not a letter, digit or
    whitespace, tokenize with NLTK, drop English stopwords, lemmatize each
    remaining token.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; an empty string when nothing
        is left after filtering.
    """
    text = text.lower()  # Convert to lowercase

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters

    tokens = word_tokenize(text)  # Tokenize text

    tokens = [word for word in tokens if word not in _ENGLISH_STOPWORDS]  # Remove stopwords

    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize each word

    return ' '.join(tokens)

# Clean every review; entries that are not strings are passed through untouched
def _clean_if_text(value):
    """Return the pre-processed text for strings, or the value itself otherwise."""
    return preprocess_text(value) if isinstance(value, str) else value

df_filtered_games['cleaned_review_text'] = df_filtered_games['review_text'].apply(_clean_if_text)

# 13 minutes to run


##### Checking preprocessed dataset

In [None]:
df_filtered_games[['review_text', 'cleaned_review_text']].head()

In [None]:
df_filtered_games.info()

In [None]:
# Boolean mask of the rows whose 'cleaned_review_text' is NaN
has_no_cleaned_text = df_filtered_games['cleaned_review_text'].isna()
missing_rows = df_filtered_games[has_no_cleaned_text]

# Show those rows
print(missing_rows)

In [None]:
# Mark reviews whose cleaned text is the empty string as missing (NaN) so the
# dropna() further down removes them. The result is assigned back to the column:
# calling replace(..., inplace=True) on the selected column acts on an
# intermediate object and, under pandas Copy-on-Write, leaves the frame unchanged.
df_filtered_games['cleaned_review_text'] = df_filtered_games['cleaned_review_text'].replace('', np.nan)

In [None]:
df_filtered_games.info()

In [None]:
# Remove the reviews whose 'cleaned_review_text' is NaN
df_filtered_games = df_filtered_games.dropna(subset=['cleaned_review_text'])

In [None]:
df_filtered_games.info()

In [None]:
# Write the pre-processed reviews to 'pre_processed_reviews.csv' without the DataFrame index
df_filtered_games.to_csv('pre_processed_reviews.csv',index=False)

#### 3. SENTIMENT ANALYSIS

In [20]:
# Reload the pre-processed reviews from 'pre_processed_reviews.csv' into df
df = pd.read_csv('pre_processed_reviews.csv')

# Preview the first rows
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,cleaned_review_text
0,10,Counter-Strike,Ruined my life.,1,0,ruined life
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,experience game type review saying thing like ...
2,10,Counter-Strike,This game saved my virginity.,1,0,game saved virginity
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,like original game like game dont lag like gam...
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,easy learn hard master


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200242 entries, 0 to 200241
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   app_id               200242 non-null  int64 
 1   app_name             200242 non-null  object
 2   review_text          200242 non-null  object
 3   review_score         200242 non-null  int64 
 4   review_votes         200242 non-null  int64 
 5   cleaned_review_text  200242 non-null  object
dtypes: int64(3), object(3)
memory usage: 9.2+ MB


In [22]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer needs the VADER lexicon available locally
nltk.download('vader_lexicon')

# Sentiment Analysis: score each raw review and keep its 'compound' value
sia = SentimentIntensityAnalyzer()

def _compound_score(review):
    """Return the 'compound' entry of the analyzer's polarity scores for one review."""
    return sia.polarity_scores(review)['compound']

df['sentiment'] = df['review_text'].apply(_compound_score)

# 1 minute 30 seconds to run

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kevanteo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [23]:
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,cleaned_review_text,sentiment
0,10,Counter-Strike,Ruined my life.,1,0,ruined life,-0.4767
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,experience game type review saying thing like ...,0.9954
2,10,Counter-Strike,This game saved my virginity.,1,0,game saved virginity,0.4215
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,like original game like game dont lag like gam...,0.9098
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,easy learn hard master,0.3612


##### Checking for Contradicting Reviews
- These will require human intervention/expertise

In [24]:
# Define a function to detect contradicting reviews
def detect_contradicting_reviews(row):
    """Flag a review whose score and sentiment point in opposite directions.

    Parameters
    ----------
    row : mapping
        Must provide 'review_score' and 'sentiment'.

    Returns
    -------
    int
        1 when review_score is 1 with a negative sentiment, or review_score is
        -1 with a positive sentiment; 0 in every other case (including a
        sentiment of exactly 0).
    """
    score = row['review_score']
    if score == 1:
        mismatch = row['sentiment'] < 0
    elif score == -1:
        mismatch = row['sentiment'] > 0
    else:
        mismatch = False
    return 1 if mismatch else 0

# Flag every row by running the detector row-wise
df['contradicting'] = df.apply(detect_contradicting_reviews, axis='columns')

# Preview the columns involved in the check
preview_columns = ['cleaned_review_text', 'review_score', 'sentiment', 'contradicting']
df[preview_columns].head()

Unnamed: 0,cleaned_review_text,review_score,sentiment,contradicting
0,ruined life,1,-0.4767,1
1,experience game type review saying thing like ...,1,0.9954,0
2,game saved virginity,1,0.4215,0
3,like original game like game dont lag like gam...,1,0.9098,0
4,easy learn hard master,1,0.3612,0


In [25]:
# Step 1: set the rows flagged as contradicting aside in their own DataFrame
is_flagged = df['contradicting'] == 1
contradicting_review = df[is_flagged]

# Step 2: keep only the rows flagged 0 in df
is_consistent = df['contradicting'] == 0
df = df[is_consistent]

In [27]:
# Write the contradicting reviews to 'contradicting_reviews_2.csv' without the DataFrame index
contradicting_review.to_csv('contradicting_reviews_2.csv',index=False)

In [None]:
# Print the value distribution of the flag column and of the target column
for column_name in ('contradicting', 'review_score'):
    print(df[column_name].value_counts())

In [None]:
df.head()

In [None]:
df.info()

#### TF-IDF Vectorising

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import scipy.sparse

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Each row of df must produce exactly one TF-IDF row, because the target is
# later read from df['review_score']. Dropping NaN texts here would silently
# shift the matrix rows against the labels, so fail early with a clear message.
review_docs = df['cleaned_review_text']
n_missing = int(review_docs.isna().sum())
if n_missing:
    raise ValueError(
        f"{n_missing} rows have no cleaned_review_text; drop them from df before "
        "vectorising so the TF-IDF rows stay aligned with df['review_score']."
    )

# Fit and transform the cleaned_review_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(review_docs)

# Convert the TF-IDF matrix to a DataFrame (using sparse matrix format)
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df.head()

In [None]:
# Compare the number of stored non-zero entries with the full matrix size
n_rows, n_cols = tfidf_matrix.shape[0], tfidf_matrix.shape[1]
total_elements = n_rows * n_cols
nonzero_count = tfidf_matrix.nnz
print("Non-zero entries:", nonzero_count)
print("Total elements:", total_elements)

In [None]:
# Show the tokens with a non-zero TF-IDF weight in the first review
first_review_weights = tfidf_df.iloc[0]
nonzero_tokens = first_review_weights[first_review_weights != 0]
print(nonzero_tokens)

In [None]:
# Number of stored non-zero TF-IDF entries in each row of the matrix
nonzero_per_row = tfidf_matrix.getnnz(axis=1)
print("Nonzero counts for each row:", nonzero_per_row)

In [None]:
# df = df.drop(['app_id','review_votes','app_name','sentiment','contradicting','review_text'],axis=1)

In [None]:
# Define features and target
# X: TF-IDF features (sparse matrix)
X = tfidf_matrix
# y: target variable, 1 (positive/recommend) or -1 (negative/not recommend)
y = df['review_score']

In [None]:
# --- Model Training ---
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

# Stratified, shuffled 5-fold split with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression classifier with balanced class weights
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# Evaluate both metrics in one cross-validation pass: each fold is fitted once
# and scored twice, instead of being refitted for every metric.
lr_cv_results = cross_validate(lr, X, y, cv=cv, scoring=['accuracy', 'f1_macro'])
acc_scores = lr_cv_results['test_accuracy']
f1_scores = lr_cv_results['test_f1_macro']

print("Cross-validated Accuracy:", acc_scores)
print("Mean Accuracy:", acc_scores.mean())
print("Cross-validated F1 (macro):", f1_scores)
print("Mean F1 (macro):", f1_scores.mean())

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

# Linear SVM with balanced class weights
svc_model = LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)

# Evaluate both metrics in one cross-validation pass (one fit per fold, reusing
# the cv splitter defined in the Logistic Regression cell).
svc_cv_results = cross_validate(svc_model, X, y, cv=cv, scoring=['accuracy', 'f1_macro'])
acc_scores_svc = svc_cv_results['test_accuracy']
f1_scores_svc = svc_cv_results['test_f1_macro']

print("SVC Accuracy:", acc_scores_svc)
print("SVC Mean Accuracy:", acc_scores_svc.mean())
print("SVC F1 (macro):", f1_scores_svc)
print("SVC Mean F1 (macro):", f1_scores_svc.mean())

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Step 1: Dimensionality reduction to reduce TF-IDF size
# NOTE(review): the SVD is fitted on all rows before cross-validation, so each
# validation fold has influenced the projection - confirm this is acceptable.
svd = TruncatedSVD(n_components=300, random_state=42)
X_reduced = svd.fit_transform(X)

# Step 2: Stratified K-Fold setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Optimized Random Forest
rf_model = RandomForestClassifier(
    n_estimators=30,
    class_weight='balanced',
    max_depth=20,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# Step 4: Cross-validated performance - both metrics from a single pass, so
# every fold is fitted once instead of once per metric.
rf_cv_results = cross_validate(
    rf_model, X_reduced, y, cv=cv, scoring=['accuracy', 'f1_macro'], n_jobs=-1
)
acc_scores_rf = rf_cv_results['test_accuracy']
f1_scores_rf = rf_cv_results['test_f1_macro']

# Step 5: Output
print("Random Forest Accuracy:", acc_scores_rf)
print("Random Forest Mean Accuracy:", acc_scores_rf.mean())
print("Random Forest F1 (macro):", f1_scores_rf)
print("Random Forest Mean F1 (macro):", f1_scores_rf.mean())


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Step 1: Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 2: Fast Gradient Boosting Model
gb_model = HistGradientBoostingClassifier(
    max_iter=100,
    max_depth=6,
    random_state=42,
    class_weight='balanced'
)

# Step 3: Cross-validated evaluation on the SVD-reduced features - both metrics
# from a single pass, so every fold is fitted once instead of once per metric.
gb_cv_results = cross_validate(gb_model, X_reduced, y, cv=cv, scoring=['accuracy', 'f1_macro'])
acc_scores_gb = gb_cv_results['test_accuracy']
f1_scores_gb = gb_cv_results['test_f1_macro']

# Step 4: Output
print("Gradient Boosting Accuracy:", acc_scores_gb)
print("Mean Accuracy:", acc_scores_gb.mean())
print("Gradient Boosting F1 (macro):", f1_scores_gb)
print("Mean F1 (macro):", f1_scores_gb.mean())


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# --- Train-Test Split ---
# Hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# --- Oversampling using SMOTE ---
# Only the training portion is resampled; X_test / y_test stay as split
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit Logistic Regression on the SMOTE-resampled training data
lr = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train_resampled, y_train_resampled)

# --- Model Evaluation ---
# Score the predictions for the held-out test set
y_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)

# Report the metrics
print("Accuracy:", lr_accuracy)
print("\nClassification Report:")
print(lr_report)
print("\nConfusion Matrix:")
print(lr_confusion)



In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the SMOTE-resampled training data
svm = SVC(kernel='linear', class_weight='balanced', random_state=42).fit(X_train_resampled, y_train_resampled)

# Score the predictions for the held-out test set
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)

# Report the metrics
print("Accuracy:", svm_accuracy)
print("\nClassification Report:")
print(svm_report)
print("\nConfusion Matrix:")
print(svm_confusion)

In [None]:
import joblib

# Export the SVM model to a file ('svm_model.pkl' in the working directory)
# NOTE(review): joblib files are pickle-based - only load ones from a trusted source.
joblib.dump(svm, 'svm_model.pkl')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree Random Forest on the SMOTE-resampled training data
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score the predictions for the held-out test set
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

# Report the metrics
print("Accuracy:", rf_accuracy)
print("\nClassification Report:")
print(rf_report)
print("\nConfusion Matrix:")
print(rf_confusion)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a 100-stage Gradient Boosting classifier on the SMOTE-resampled training data
gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train_resampled, y_train_resampled)

# Score the predictions for the held-out test set
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
gb_confusion = confusion_matrix(y_test, y_pred_gb)

# Report the metrics
print("Accuracy:", gb_accuracy)
print("\nClassification Report:")
print(gb_report)
print("\nConfusion Matrix:")
print(gb_confusion)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# 1. Threshold VADER compound score into binary labels
# compound > 0.05 -> positive (1); everything else (negative and neutral) -> 0
df_vader = df.copy()
df_vader['vader_pred'] = (df_vader['sentiment'] > 0.05).astype(int)

# 2. Ground truth on the same 0/1 scale as the prediction.
# review_score is coded 1 / -1, so it is mapped to 1 / 0 here; comparing the raw
# -1 labels with 0/1 predictions would count every negative review as wrong and
# give classification_report three classes for two target names.
y_true = (df_vader['review_score'] == 1).astype(int)
y_pred_vader = df_vader['vader_pred'].astype(int)

# NOTE(review): df no longer contains the rows flagged as contradicting, i.e. the
# reviews where the sign of the VADER score disagreed with review_score, so the
# numbers below are optimistic for VADER and not directly comparable with models
# scored on unfiltered data.

# 3. Metrics
accuracy_vader = accuracy_score(y_true, y_pred_vader)
f1_vader = f1_score(y_true, y_pred_vader, average='macro')
report_vader = classification_report(y_true, y_pred_vader, target_names=['Not Recommend', 'Recommend'])
conf_matrix_vader = confusion_matrix(y_true, y_pred_vader)

print("VADER Accuracy:", accuracy_vader)
print("VADER Macro F1 Score:", f1_vader)
print("\nVADER Classification Report:")
print(report_vader)
print("VADER Confusion Matrix:")
print(conf_matrix_vader)



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT tokenizer and model (two output labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'cleaned_review_text' field, padded/truncated to 128 tokens."""
    return tokenizer(examples["cleaned_review_text"], padding="max_length", truncation=True, max_length=128)

# Build the frame used for fine-tuning. No cell in this notebook defined df_clean,
# so on a fresh kernel the label mapping below raised NameError; derive it from df.
# NOTE(review): assumes the text column and the label are all the fine-tuning
# needs - confirm against the original intent.
df_clean = df[['cleaned_review_text', 'review_score']].reset_index(drop=True)

# Recode the labels to 1 (review_score == 1) and 0 (anything else)
df_clean['review_score'] = (df_clean['review_score'] == 1).astype(int)

In [None]:
import torch
# Report whether PyTorch can use a CUDA device
print(torch.cuda.is_available())  # Should return True if GPU is available

In [None]:
# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df_clean)
dataset = dataset.map(tokenize_function, batched=True)
# Drop the "tokens" column only when it exists: remove_columns raises on a name
# that is not in the dataset, and no cell in this notebook creates that column.
if "tokens" in dataset.column_names:
    dataset = dataset.remove_columns(["tokens"])
dataset = dataset.rename_column("review_score", "labels")
dataset.set_format("torch")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Split dataset (seeded so the split is reproducible). The result is stored under
# its own name: calling it train_test_split would overwrite the scikit-learn
# function of that name imported earlier and break a re-run of the SMOTE cell.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
test_dataset = dataset_splits["test"]

# Define training arguments
# NOTE(review): newer transformers releases rename evaluation_strategy to
# eval_strategy - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=torch.cuda.is_available()  # mixed precision only when a GPU is present
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train model
trainer.train()

# Evaluate on the held-out split and show the metrics
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:
# Save model and tokenizer side by side in ./saved_model
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")
print("Model saved successfully.")

In [None]:
import torch
import accelerate
import transformers

# Print the installed version of each deep-learning dependency, one per line
for library in (torch, accelerate, transformers):
    print(library.__version__)

In [None]:
# import string
# from nltk.corpus import stopwords, wordnet
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# import nltk

# # Download required resources
# nltk.download('stopwords')
# nltk.download('punkt')

# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))  # Load stopwords once

# # Define a function to preprocess text
# def preprocess_text(text):
#     text = text.lower()  # Convert to lowercase
#     text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
#     tokens = word_tokenize(text)  # Tokenize text
#     tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
#     # Get POS tags and lemmatize accordingly
#     pos_tags = pos_tag(tokens)
#     tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

#     return ' '.join(tokens)

# # Apply the function to the review_text column
# df['cleaned_review_text'] = df['review_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

# print(df[['review_text', 'cleaned_review_text']].head())
