In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Read the feature table from a parquet file (path is relative to the
# working directory the notebook is run from).
df = pd.read_parquet("../../datasets/amazon_user_reviews_features_sentiment.parquet")

# Names of the columns used as classifier inputs, in the order they
# appear as columns of X.
feature_cols = [
    "exclamation_count",
    "question_count",
    "word_count",
    "char_count",
    "all_caps_words",
    "uppercase_ratio",
    "total_punctuation",
    "avg_word_length",
    "helpful_vote",
    "int_verified_purchase",
    "hour",
    "month",
    "season",
]

# Input matrix (one column per entry of feature_cols) and the target
# taken from the "sentiment" column.
X = df.loc[:, feature_cols].values
y = df.loc[:, "sentiment"].values

# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training rows only, then the same fitted
# transform is applied to both partitions, so no statistics from the
# test rows enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian naive Bayes classifier with default settings, trained on the
# standardized training features.
model = GaussianNB().fit(X_train_scaled, y_train)

# Predicted labels for the held-out (standardized) test rows.
y_pred = model.predict(X_test_scaled)

# Evaluate the predictions on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are all computed with average='weighted'
# (per-class scores averaged with weights given by class support).
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# confusion_matrix takes the true labels first, then the predictions.
cm = confusion_matrix(y_test, y_pred)

# Report the scalar scores rounded to four decimals, then the raw matrix.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

print("Confusion matrix:\n", cm)

Accuracy: 0.3698
Precision: 0.4317
Recall: 0.3698
F1-Score: 0.2810
Confusion matrix:
 [[1115   84   44]
 [ 996  138   75]
 [1047   73  108]]
