In [1]:
import os
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Sales-revenue baseline: linear regression on customer/order attributes.
# Flow: define schema -> load CSV -> hold-out split -> preprocess + fit ->
# report RMSE / R² on the hold-out set -> pickle the fitted pipeline.
# ---------------------------------------------------------------------------

# Feature schema. ID, date and free-text columns are intentionally excluded.
num_cols = ["price", "quantity", "age", "tenure_months"]
cat_cols = ["gender", "region", "segment", "product_name", "category", "sentiment"]

# Raw data, with the regression target (total_value = sales revenue)
# separated from the feature matrix.
df = pd.read_csv("customer_sales_raw.csv")
y = df["total_value"]
X = df[num_cols + cat_cols].copy()  # copy so X is independent of df

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numeric columns: fill gaps with the column median, then standardise.
numeric_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown="ignore" keeps prediction from failing on levels
# that were not present when the encoder was fitted.
categorical_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# Preprocessing and estimator live in a single pipeline so the exact same
# transformations are applied at fit time and at predict time.
linreg = LinearRegression()
pipe = Pipeline(steps=[("prep", preprocess), ("model", linreg)])
pipe.fit(X_train, y_train)

# Hold-out metrics: RMSE (same units as total_value) and R².
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")

# Persist the whole fitted pipeline (preprocessing + model) as one pickle,
# creating the destination directory on first run.
out_path = "models/sales_linear_reg.pkl"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "wb") as f:
    pickle.dump(pipe, f)

print(f"✅ Linear Regression model saved to {out_path}")


RMSE: 29217.93
R²: 0.848
✅ Linear Regression model saved to models/sales_linear_reg.pkl
