In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('preprocessed_reviews.csv')

# Get the number of reviews for each product
product_review_count = df['app_name'].value_counts()


print(product_review_count)


app_name
Dota 2                            73541
Rocket League                     54227
The Witcher 3: Wild Hunt          31853
DOOM                              20682
Counter-Strike                    12353
Call of Duty: Modern Warfare 3     5197
NBA 2K16                           2534
The Elder Scrolls V: Skyrim        2260
Football Manager 2016              1742
Name: count, dtype: int64


In [2]:
""" import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Define a function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply the function to the review_text column
df['cleaned_review_text'] = df['review_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

print(df[['review_text', 'cleaned_review_text']].head()) """

" import string\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\nimport nltk\n\n# Download stopwords if not already downloaded\nnltk.download('stopwords')\nnltk.download('punkt')\n\n# Define a function to preprocess text\ndef preprocess_text(text):\n    # Convert to lowercase\n    text = text.lower()\n    # Remove punctuation\n    text = text.translate(str.maketrans('', '', string.punctuation))\n    # Tokenize text\n    tokens = word_tokenize(text)\n    # Remove stopwords\n    tokens = [word for word in tokens if word not in stopwords.words('english')]\n    return ' '.join(tokens)\n\n# Apply the function to the review_text column\ndf['cleaned_review_text'] = df['review_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)\n\nprint(df[['review_text', 'cleaned_review_text']].head()) "

In [3]:
# Define a function to detect sarcasm
def detect_sarcasm(row):
    # Check for rating-text mismatch
    if row['review_score'] == 1 and row['sentiment'] < 0:
        return 1
    else:
        return 0

# Apply the function to the DataFrame
df['sarcasm'] = df.apply(detect_sarcasm, axis=1)

print(df[['cleaned_review_text', 'review_score', 'sentiment', 'sarcasm']].head())



                                 cleaned_review_text  review_score  sentiment  \
0                                        ruined life             1    -0.4767   
1  experience game type review saying things like...             1     0.9961   
2                               game saved virginity             1     0.4215   
3  like original games like games dont lag like g...             1     0.8817   
4                             easy learn hard master             1     0.3612   

   sarcasm  
0        1  
1        0  
2        0  
3        0  
4        0  


In [4]:
print(df['sarcasm'].value_counts())


sarcasm
0    182283
1     22106
Name: count, dtype: int64


In [5]:
print(df['cleaned_review_text'].head(10))


0                                          ruined life
1    experience game type review saying things like...
2                                 game saved virginity
3    like original games like games dont lag like g...
4                               easy learn hard master
5                                      r revolver play
6                        still better call duty ghosts
7    cant buy skins cases keys stickers gaben cant ...
8    counterstrike ok years unlimited fun friends f...
9    every server spanish french fluently swear lan...
Name: cleaned_review_text, dtype: object


In [6]:
print(df['review_votes'].value_counts())

review_votes
0    173422
1     30967
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import scipy.sparse

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned_review_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_review_text'].dropna())

# Convert the TF-IDF matrix to a DataFrame (using sparse matrix format)
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df.head())

   aa  aaa  aaaa  aaaaa  aaaaaaaaaa  aaaaaaaaaaa  \
0   0    0     0      0           0            0   
1   0    0     0      0           0            0   
2   0    0     0      0           0            0   
3   0    0     0      0           0            0   
4   0    0     0      0           0            0   

   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  \
0                                 0   
1                                 0   
2                                 0   
3                                 0   
4                                 0   

   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  \
0                                                  0                
1                                                  0                
2                                                  0                
3                                                  0                
4                                                  0                

   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

In [8]:
nonzero_count = tfidf_matrix.nnz
total_elements = tfidf_matrix.shape[0] * tfidf_matrix.shape[1]
print("Non-zero entries:", nonzero_count)
print("Total elements:", total_elements)


Non-zero entries: 3714730
Total elements: 18501935652


In [9]:
# Print the nonzero tokens for the first review
nonzero_tokens = tfidf_df.iloc[0][tfidf_df.iloc[0] != 0]
print(nonzero_tokens)


life      0.570045
ruined    0.821614
Name: 0, dtype: Sparse[float64, 0]


In [10]:
nonzero_per_row = tfidf_matrix.getnnz(axis=1)
print("Nonzero counts for each row:", nonzero_per_row)


Nonzero counts for each row: [  2 140   3 ...   3 124  22]


In [11]:
# --- Data Preparation ---
# Filter the DataFrame to include only rows with a cleaned review text.
# This ensures that the TF-IDF features align with your target variable.
df_clean = df.dropna(subset=['cleaned_review_text'])

# IMPORTANT:
# The tfidf_matrix should have been computed on df['cleaned_review_text'].dropna()
# so its rows correspond to df_clean.
# If that’s not the case, re-run the vectorization on df_clean, e.g.:
# tfidf_matrix = tfidf_vectorizer.fit_transform(df_clean['cleaned_review_text'])

# Define features and target
X = tfidf_matrix          # TF-IDF features (sparse matrix)
y = df_clean['review_score']  # Target variable: 1 (positive/recommend) or -1 (negative/not recommend)

# --- Train-Test Split ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Oversampling using SMOTE ---
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [12]:
# --- Model Training ---
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression classifier
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train_resampled, y_train_resampled)

# --- Model Evaluation ---
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
y_pred = lr.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8552353119016427

Classification Report:
              precision    recall  f1-score   support

          -1       0.41      0.84      0.55      4161
           1       0.98      0.86      0.91     35531

    accuracy                           0.86     39692
   macro avg       0.69      0.85      0.73     39692
weighted avg       0.92      0.86      0.88     39692


Confusion Matrix:
[[ 3503   658]
 [ 5088 30443]]


In [18]:
# Check the shape of tfidf_matrix
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

# Check the number of rows in df_clean
print("Number of rows in df_clean:", df_clean.shape[0])

# Verify if they match
if tfidf_matrix.shape[0] == df_clean.shape[0]:
    print("tfidf_matrix was computed on df_clean['cleaned_review_text']")
else:
    print("tfidf_matrix was NOT computed on df_clean['cleaned_review_text']")

Shape of tfidf_matrix: (198459, 93228)
Number of rows in df_clean: 198459
tfidf_matrix was computed on df_clean['cleaned_review_text']


In [21]:
from sklearn.svm import SVC

# Initialize and train the SVM classifier
svm = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_svm = svm.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

Accuracy: 0.853774060264033

Classification Report:
              precision    recall  f1-score   support

          -1       0.40      0.83      0.54      4161
           1       0.98      0.86      0.91     35531

    accuracy                           0.85     39692
   macro avg       0.69      0.84      0.73     39692
weighted avg       0.92      0.85      0.87     39692


Confusion Matrix:
[[ 3438   723]
 [ 5081 30450]]


In [22]:
import joblib

# Export the SVM model to a file
joblib.dump(svm, 'svm_model.pkl')

['svm_model.pkl']

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy: 0.9312707850448453

Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.68      0.67      4161
           1       0.96      0.96      0.96     35531

    accuracy                           0.93     39692
   macro avg       0.82      0.82      0.82     39692
weighted avg       0.93      0.93      0.93     39692


Confusion Matrix:
[[ 2810  1351]
 [ 1377 34154]]


In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting classifier
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_gb = gb.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))

Accuracy: 0.7565504383754913

Classification Report:
              precision    recall  f1-score   support

          -1       0.27      0.77      0.40      4161
           1       0.97      0.75      0.85     35531

    accuracy                           0.76     39692
   macro avg       0.62      0.76      0.62     39692
weighted avg       0.89      0.76      0.80     39692


Confusion Matrix:
[[ 3220   941]
 [ 8722 26809]]


In [15]:
import joblib

# Export the Random Forest model to a file
joblib.dump(rf, 'random_forest_model.pkl')

# Export the Gradient Boosting model to a file
joblib.dump(gb, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']