# 1.Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# =======================
# 1. Chuẩn bị dữ liệu (giống như trước)
# =======================
file_path = "VNPTEST01.xlsx"

customer_df = pd.read_excel(file_path, sheet_name="customer")
order_df = pd.read_excel(file_path, sheet_name="sale_order")
order_details_df = pd.read_excel(file_path, sheet_name="order_details")
product_df = pd.read_excel(file_path, sheet_name="product")

# Join order_details + product để lấy price
order_details_df = order_details_df.merge(product_df, on="product_id", how="left")

# Revenue = quantity * price
order_details_df["revenue"] = order_details_df["quantity"] * order_details_df["price"]

# Join với sale_order để lấy customer_id
order_details_df = order_details_df.merge(order_df, on="order_id", how="left")

# Aggregate revenue theo customer_id
customer_revenue = order_details_df.groupby("customer_id")["revenue"].sum().reset_index()

# Merge với bảng customer
customer_full = customer_df.merge(customer_revenue, on="customer_id", how="left")
customer_full["revenue"] = customer_full["revenue"].fillna(0)

# Encode gender nếu có
if "gender" in customer_full.columns:
    customer_full["gender"] = customer_full["gender"].map({"Male": 1, "Female": 0})



In [3]:
# =======================
# 2. Linear Regression
# =======================
X = customer_full.drop(columns=["customer_id", "name", "revenue"], errors="ignore")
y = customer_full["revenue"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred_lin)
mse = mean_squared_error(y_test, y_pred_lin)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lin)

print("\n📌 Linear Regression Metrics:")
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R²   : {r2:.2f}")

# =======================



ValueError: could not convert string to float: 'North George'

In [None]:
# 3. Logistic Regression
# =======================
# Tạo nhãn phân loại High/Low revenue
threshold = customer_full["revenue"].median()
customer_full["revenue_class"] = (customer_full["revenue"] > threshold).astype(int)

X_cls = customer_full.drop(columns=["customer_id", "name", "revenue", "revenue_class"], errors="ignore")
y_cls = customer_full["revenue_class"]

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_cls, y_train_cls)
y_pred_cls = log_model.predict(X_test_cls)
y_pred_prob = log_model.predict_proba(X_test_cls)[:,1]



In [None]:
# Metrics
acc = accuracy_score(y_test_cls, y_pred_cls)
prec = precision_score(y_test_cls, y_pred_cls)
rec = recall_score(y_test_cls, y_pred_cls)
f1 = f1_score(y_test_cls, y_pred_cls)
roc_auc = roc_auc_score(y_test_cls, y_pred_prob)

print("\n📌 Logistic Regression Metrics:")
print(f"Accuracy  : {acc:.2f}")
print(f"Precision : {prec:.2f}")
print(f"Recall    : {rec:.2f}")
print(f"F1-score  : {f1:.2f}")
print(f"ROC-AUC   : {roc_auc:.2f}")