In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import sys

In [2]:
# Path to the input CSV, resolved relative to the notebook's working directory.
file_path = "KBZ_Pay_Sentiment_Cate_Ver3.csv"

# 1. LOAD DATA
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
# NOTE: on_bad_lines='skip' silently drops malformed rows, so the loaded
# frame may contain fewer rows than the file.
try:
    df = pd.read_csv(file_path, encoding='utf-8-sig', on_bad_lines='skip', engine='python')
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise instead of calling sys.exit(): a bare sys.exit() reports exit
    # status 0 (success) when this notebook is run as a script, and it
    # discards the original traceback.
    raise

In [11]:
# Keep only rows where every column listed below is populated.
required_cols = ['Sentiment_Category', 'Date_Interval', 'Translated_Review']
df = df.dropna(subset=required_cols)

In [14]:
# 2. FEATURE ENGINEERING: Create the Review_Length feature
# Review_Length is the whitespace-separated word count of Translated_Review.
# Per the original design note, it brings text content into the model
# without causing target leakage.
if 'Translated_Review' not in df.columns:
    # Raise rather than print + sys.exit(): a bare sys.exit() reports exit
    # status 0 (success) when this notebook is run as a script.
    raise ValueError("'Translated_Review' column not found. Cannot proceed with analysis.")

reviews = df['Translated_Review']
# Vectorized word count; astype(str) also covers non-string cells.
word_counts = reviews.astype(str).str.split().str.len()
# astype(str) can turn a missing value into the literal string "nan", which
# would count as one word. Mask on the original column so missing reviews
# get a length of 0.
df['Review_Length'] = word_counts.where(reviews.notna(), 0).astype(int)

In [16]:
# 3. SELECT FEATURES AND TARGET
# Predictors (X). Per the original design notes, Device_Phone is left out as
# redundant and Sentiment_Score_Translated is left out as leaky.
feature_cols = [
    'Date_Interval',
    'Emoji_Count',
    'Lang_Burmese',
    'Lang_Mixed',
    'Lang_English',
    'Device_Tablet',
    'Review_Length',
]
# Target (y): the Rating column (1-5 stars), treated as a continuous value.
y = df['Rating']
X = df[feature_cols]

In [17]:
# 4. SPLIT DATA
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [21]:
# 5. TRAIN THE RANDOM FOREST REGRESSOR MODEL
# A regressor is used because the target (Rating) is treated as continuous.
# 100 trees; the seed is fixed so repeated runs build the same forest.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [22]:
# 6. EVALUATE PREDICTIVE ACCURACY
# Predict on the held-out rows, then report R-squared and mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

banner = "=" * 80
report_lines = [
    "\n" + banner,
    "RANDOM FOREST REGRESSOR - PREDICTIVE PERFORMANCE",
    banner,
    f"R-squared (R2 Score): {r2:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
]
print("\n".join(report_lines))


RANDOM FOREST REGRESSOR - PREDICTIVE PERFORMANCE
R-squared (R2 Score): 0.1962
Mean Squared Error (MSE): 1.3718


In [23]:
# 7. EXTRACT AND DISPLAY FEATURE IMPORTANCE (Key Thesis Output)
# Pair each predictor column with the fitted forest's importance score and
# rank the pairs from most to least important.
# NOTE(review): feature_importances_ is impurity-based; scikit-learn's docs
# caution that such scores can be inflated for high-cardinality features.
# Consider cross-checking the ranking with permutation importance.
importances = model.feature_importances_
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

In [24]:
# Print a banner-framed title followed by the ranked importance table,
# with the index hidden and scores shown to four decimal places.
rule = "=" * 80
header_lines = (
    "\n" + rule,
    "RANDOM FOREST REGRESSOR - FINAL FEATURE IMPORTANCE SCORES",
    "(Predicting 1-5 Star Rating)",
    rule,
)
for header_line in header_lines:
    print(header_line)

importance_text = feature_importance_df.to_string(index=False, float_format="{:.4f}".format)
print(importance_text)


RANDOM FOREST REGRESSOR - FINAL FEATURE IMPORTANCE SCORES
(Predicting 1-5 Star Rating)
      Feature  Importance
Review_Length      0.6580
Date_Interval      0.1659
Device_Tablet      0.0635
 Lang_English      0.0389
   Lang_Mixed      0.0353
 Lang_Burmese      0.0281
  Emoji_Count      0.0104
