In [None]:
# 1. Import dataset
# Read the review dataset from a parquet file into the DataFrame `df`.
import pandas as pd
# Relative path: resolved against the notebook's working directory.
file_path = "../datasets/amazon_user_reviews_regular_ML_simplified.parquet"
df = pd.read_parquet(file_path)

# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Based on the result of feature selection:
## keep one from word_count, char_count, total_punctuation, (*verified_purchase)
## keep one from season and month

# Chosen: word_count, verified_purchase, month.
# Column names used as model inputs, listed one per line for easy diffing.
features = [
    'exclamation_count',
    'question_count',
    'word_count',
    'all_caps_words',
    'uppercase_ratio',
    'avg_word_length',
    'helpful_vote',
    'int_verified_purchase',
    'hour',
    'month',
]

In [None]:
# Random Forest deployment: data preprocessing
# Pipeline: undersample -> train/test split -> standardize -> PCA.
# The scaler and PCA are fitted on the training split only, so no statistics
# from the test split leak into the model inputs.
import numpy as np
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

y = df["rating"]
X = df[features]

# 1. data preprocessing
# Downsampling: 'majority' resamples only the most frequent class.
# random_state is fixed so the resample (and every metric computed from it)
# is reproducible across kernel restarts.
print(f"Original Class Distribution: {Counter(y)}")
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled_down, y_resampled_down = rus.fit_resample(X, y)
print(f"After Downsampling: {Counter(y_resampled_down)}")

# Data split: done BEFORE fitting any transformer to avoid train/test leakage.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_resampled_down, y_resampled_down, test_size=0.3, random_state=42)

# Standardization: learn mean/variance from the training split only.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA: the number of components is chosen by MLE on the training split only;
# the test split is projected with the already-fitted components.
pca = PCA(n_components="mle")
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full resampled set pushed through the fitted transformers. Kept so that any
# later cell referring to X_scaled / X_scaled_pca continues to work.
X_scaled = scaler.transform(X_resampled_down)
X_scaled_pca = pca.transform(X_scaled)

In [None]:
# 2. model deployment
# Train a Random Forest on the preprocessed training split.
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# 3. predict
y_pred = rf_classifier.predict(X_test)

# 4. evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# 'weighted' averages the per-class scores weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("=" * 50)
print("MODEL EVALUATION METRICS")
print("=" * 50)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print("\n" + "=" * 50)
print("CLASSIFICATION REPORT")
print("=" * 50)
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Sorted labels seen in y_test or y_pred: the same set and order that
# confusion_matrix uses by default. Passing them explicitly lets the heatmap
# ticks show the real class labels instead of positional indices 0..k-1.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix - Random Forest Classification')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()