In [2]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
import warnings
warnings.filterwarnings("ignore")

In [4]:
# load config
with open("config.json") as f:
    config = json.load(f)

# load datasets
steam_path = config["steam_reviews_path"]
games_path = config["all_games_path"]

steam_df = pd.read_csv(steam_path)
games_df = pd.read_csv(games_path)

print("steam reviews sample:\n", steam_df.head())
print("all games sample:\n", games_df.head())

steam reviews sample:
    Unnamed: 0  app_id                  app_name  review_id  language  \
0           0  292030  The Witcher 3: Wild Hunt   85185598  schinese   
1           1  292030  The Witcher 3: Wild Hunt   85185250  schinese   
2           2  292030  The Witcher 3: Wild Hunt   85185111  schinese   
3           3  292030  The Witcher 3: Wild Hunt   85184605   english   
4           4  292030  The Witcher 3: Wild Hunt   85184287  schinese   

                                              review  timestamp_created  \
0                            不玩此生遗憾，RPG游戏里的天花板，太吸引人了         1611381629   
1                                 拔DIAO无情打桩机--杰洛特!!!         1611381030   
2                                              巫师3NB         1611380800   
3  One of the best RPG's of all time, worthy of a...         1611379970   
4                                                 大作         1611379427   

   timestamp_updated  recommended  votes_helpful  ...  steam_purchase  \
0         1611381629

In [None]:
# filter and clean reviews
print("filtering for english reviews and cleaning...")
steam_df = steam_df[
    (steam_df['language'] == 'english') &
    (steam_df['review'].notnull()) &
    (steam_df['review'].str.len() > 20)
]

steam_df['label'] = steam_df['recommended'].astype(int)
print(f"filtered reviews remaining: {len(steam_df)}")

🧼 filtering for english reviews and cleaning...
filtered reviews remaining: 6993649


In [7]:
# group by game
GROUPED_PATH = "grouped_reviews.csv"
if os.path.exists(GROUPED_PATH):
    print("loading previously grouped reviews from file...")
    grouped_reviews = pd.read_csv(GROUPED_PATH)
else:
    print("grouping reviews by game...")
    grouped_reviews = steam_df.groupby('app_name').agg({
        'review': lambda x: ' '.join(x),
        'label': 'mean'
    }).reset_index()

    grouped_reviews['sentiment_label'] = (grouped_reviews['label'] > 0.5).astype(int)

    print(f"games grouped: {len(grouped_reviews)}")
    print("sample grouped review:\n", grouped_reviews['review'].iloc[0][:500])

    grouped_reviews.to_csv(GROUPED_PATH, index=False)
    print("grouped reviews saved")

grouping reviews by game...
games grouped: 315
sample grouped review:
 muito legal e desafiador recomendo!! I was admittedly initially turned off by the art style to this game. After numerous recommendations I finally gave in a purchased it during the recent sale. I'm sad I waited so long because this game is so incredibly fun. I encourage anyone to try it out, and I look forward to 30XX releasing soon! It looks like a flash game, but this game is fun to play and looks great. I can't wait to find someone to play it with. The best Megaman X game I have seen in years
grouped reviews saved


In [9]:
# show sentiment ratio
num_pos = (grouped_reviews['label'] > 0.5).sum()
num_neg = (grouped_reviews['label'] <= 0.5).sum()
print(f"\nratio split of grouped games (total = {len(grouped_reviews)}):")
print(f"positive (label > 0.5): {num_pos}")
print(f"negative (label <= 0.5): {num_neg}")


ratio split of grouped games (total = 315):
positive (label > 0.5): 287
negative (label <= 0.5): 28


In [10]:
# text vectorization
X_PATH = "X_vectorized.pkl"
y_PATH = "y_labels.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

if os.path.exists(X_PATH) and os.path.exists(y_PATH) and os.path.exists(VECTORIZER_PATH):
    print("loading cached vectorized data...")
    X = load(X_PATH)
    y = load(y_PATH)
    vectorizer = load(VECTORIZER_PATH)
else:
    print("vectorizing text using tfidf...")
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    X = vectorizer.fit_transform(grouped_reviews['review'])
    y = grouped_reviews['sentiment_label']
    dump(X, X_PATH)
    dump(y, y_PATH)
    dump(vectorizer, VECTORIZER_PATH)
    print("vectorized data saved")

print(f"tfidf matrix shape: {X.shape}")

vectorizing text using tfidf...
vectorized data saved
tfidf matrix shape: (315, 10000)


In [12]:
# split dataset
print("splitting into train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

splitting into train/test...


In [13]:
# evaluation helper
def evaluate_model(name, model):
    print(f"\ntraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print(f"{name} Accuracy: {accuracy_score(y_test, preds):.2f}")
    print(f"{name} Precision: {precision_score(y_test, preds, zero_division=0):.2f}")
    print(f"{name} Recall: {recall_score(y_test, preds, zero_division=0):.2f}")
    print(f"{name} F1 Score: {f1_score(y_test, preds, zero_division=0):.2f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))

In [14]:
# run models
evaluate_model("Naive Bayes", MultinomialNB())
evaluate_model("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
evaluate_model("SVM (RBF)", SVC(kernel='rbf', class_weight='balanced'))


training Naive Bayes...
Naive Bayes Accuracy: 0.90
Naive Bayes Precision: 0.90
Naive Bayes Recall: 1.00
Naive Bayes F1 Score: 0.95
Confusion Matrix:
 [[ 0  6]
 [ 0 57]]

training Random Forest...
Random Forest Accuracy: 0.92
Random Forest Precision: 0.92
Random Forest Recall: 1.00
Random Forest F1 Score: 0.96
Confusion Matrix:
 [[ 1  5]
 [ 0 57]]

training SVM (RBF)...
SVM (RBF) Accuracy: 0.94
SVM (RBF) Precision: 0.96
SVM (RBF) Recall: 0.96
SVM (RBF) F1 Score: 0.96
Confusion Matrix:
 [[ 4  2]
 [ 2 55]]


In [15]:
# most common words
print("\n📊 Top 20 most common words in all reviews:")
feature_names = vectorizer.get_feature_names_out()
sum_words = X.sum(axis=0)
word_freq = [(word, sum_words[0, idx]) for idx, word in enumerate(feature_names)]
top_words = sorted(word_freq, key=lambda x: x[1], reverse=True)[:20]

for word, freq in top_words:
    print(f"{word}: {int(freq)}")


📊 Top 20 most common words in all reviews:
game: 214
like: 48
just: 39
good: 36
fun: 36
play: 35
really: 29
great: 28
games: 27
time: 26
10: 25
story: 21
don: 17
played: 16
best: 16
love: 15
hours: 15
playing: 15
ve: 14
make: 14
