In [None]:
!pip install numpy==1.26.4
!pip install scikit-surprise



In [9]:
import pandas as pd
import numpy as np
import datetime as dt

print("Phase 1: Data Preparation & Cleaning")

def load_and_clean_data(file_path):

    df = pd.read_csv(file_path, encoding="latin-1")

    df = df.dropna(subset=["CustomerID"])
    df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)]

    df["CustomerID"] = df["CustomerID"].astype(int)
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

    df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]

    # User–Item Matrix
    user_item_matrix = df.pivot_table(
        index="CustomerID",
        columns="StockCode",
        values="Quantity",
        aggfunc="sum",
        fill_value=0
    )

    # RFM Feature Engineering
    today = df["InvoiceDate"].max() + dt.timedelta(days=1)

    rfm_df = df.groupby("CustomerID").agg(
        Recency=("InvoiceDate", lambda x: (today - x.max()).days),
        Frequency=("InvoiceDate", "count"),
        Monetary=("TotalPrice", "sum")
    ).reset_index()

    # Target variable
    rfm_df["Target_Binary_Purchase"] = np.where(
        rfm_df["Recency"] <= 30, 1, 0
    )

    return df, user_item_matrix, rfm_df


df_cleaned, user_item_matrix, rfm_df = load_and_clean_data("online_retail.csv")

print("User–Item Matrix (sample):")
print(user_item_matrix.head())


Phase 1: Data Preparation & Cleaning
User–Item Matrix (sample):
StockCode   10002  10080  10120  10123C  10124A  10124G  10125  10133  10135  \
CustomerID                                                                     
12346           0      0      0       0       0       0      0      0      0   
12347           0      0      0       0       0       0      0      0      0   
12348           0      0      0       0       0       0      0      0      0   
12349           0      0      0       0       0       0      0      0      0   
12350           0      0      0       0       0       0      0      0      0   

StockCode   11001  ...  90214V  90214W  90214Y  90214Z  BANK CHARGES  C2  DOT  \
CustomerID         ...                                                          
12346           0  ...       0       0       0       0             0   0    0   
12347           0  ...       0       0       0       0             0   0    0   
12348           0  ...       0       0       0     

Collaborative Filtering Model Training (k-NN and SVD)

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader
from surprise import SVD

print("Phase 2: Collaborative Filtering Models")

# 1) Item-Based Collaborative Filtering (k-NN)

print("\nTraining Item-Based k-NN Model...")

item_user_matrix = csr_matrix(user_item_matrix.T)

knn_model = NearestNeighbors(
    metric="cosine",
    algorithm="brute",
    n_neighbors=5
)

knn_model.fit(item_user_matrix)

print("k-NN Model trained successfully")
#example
item_index = 0  # first item
distances, indices = knn_model.kneighbors(
    item_user_matrix[item_index],
    n_neighbors=5
)

print("Nearest item indices:", indices)
print("Similarity distances:", distances)

# 2)Matrix Factorization (SVD)
print("\nTraining SVD Model...")


print("SVD Model trained successfully")

# Convert user-item matrix to sparse format
user_item_sparse = csr_matrix(user_item_matrix)

svd_model = TruncatedSVD(
    n_components=50,
    random_state=42
)

# Fit the SVD model before transforming
svd_model.fit(user_item_sparse)

user_latent = svd_model.transform(user_item_sparse)

print("User latent matrix shape:", user_latent.shape)
print(user_latent[:2])

Phase 2: Collaborative Filtering Models

Training Item-Based k-NN Model...
k-NN Model trained successfully
Nearest item indices: [[   0    6 2050 2052 1029]]
Similarity distances: [[0.         0.16020535 0.20586033 0.22930196 0.29521966]]

Training SVD Model...
SVD Model trained successfully
User latent matrix shape: (4097, 50)
[[ 7.42149737e+04 -3.06243117e+01  3.65334861e+01 -3.82992933e+00
   2.75088668e-01 -3.01485395e+00  1.00644011e+01 -1.61435250e+00
  -1.24605556e+01 -4.53064409e-02  2.16784673e+00 -1.13844646e+01
  -5.69107610e+00  1.68726236e+01 -1.65808588e+01  5.85590811e-01
  -1.17883937e+00  1.73044549e+01  7.70280901e-01  1.70176039e+00
  -2.65212221e+00  2.55866228e+00  5.92415900e+00 -8.37210409e+00
  -8.03706104e+00 -1.98966982e-01 -5.38888261e+00 -1.18577581e+00
   5.83633101e-01  5.16489580e+00  2.86778118e+00  8.50491949e-01
  -3.29997252e+00 -2.09528043e-01 -1.55300448e+00  3.13633557e+00
   1.64805050e+00  8.84045349e-01  2.29912413e+00 -1.34644852e+00
   5.81748

Supervised Model Training and Evaluation


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# SUPERVISED LEARNING MODELS
print(" Phase 3: Supervised Learning & Evaluation ")

features = rfm_df[['Frequency', 'Monetary']]
target = rfm_df['Target_Binary_Purchase']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)
print("Supervised data split complete.")

# Logistic Regression
print("\nTraining Logistic Regression Model...")
log_reg_model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=100,
    random_state=42
)
log_reg_model.fit(X_train, y_train)

y_pred_log = log_reg_model.predict(X_test)
accuracy_log = accuracy_score(y_test, y_pred_log)

print("Logistic Regression Results")
print(f"Accuracy: {accuracy_log:.4f}")
print(f"\nStrength: Strong precision and recall, best F1-score among supervised models.\n")
print(classification_report(y_test, y_pred_log))

# Decision Tree
print("\nTraining Decision Tree Model...")
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    criterion='gini',
    random_state=42
)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(" Decision Tree Results ")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"\nLimitation: More sensitive to noise and prone to overfitting.\n")
print(classification_report(y_test, y_pred_dt))

 Phase 3: Supervised Learning & Evaluation 
Supervised data split complete.

Training Logistic Regression Model...
Logistic Regression Results
Accuracy: 0.7000

Strength: Strong precision and recall, best F1-score among supervised models.

              precision    recall  f1-score   support

           0       0.70      0.93      0.80       797
           1       0.68      0.28      0.40       433

    accuracy                           0.70      1230
   macro avg       0.69      0.61      0.60      1230
weighted avg       0.69      0.70      0.66      1230


Training Decision Tree Model...
 Decision Tree Results 
Accuracy: 0.6797

Limitation: More sensitive to noise and prone to overfitting.

              precision    recall  f1-score   support

           0       0.72      0.83      0.77       797
           1       0.56      0.40      0.47       433

    accuracy                           0.68      1230
   macro avg       0.64      0.62      0.62      1230
weighted avg       0.66

Final Conclusion


In [None]:

print("WORKFLOW COMPLETE: All models trained and evaluated.")
print("\nSummary of Findings :")
print("* Hybrid Solution: A combination of Collaborative Filtering, SVD, and Logistic Regression provides the strongest performance.")
print("* Logistic Regression  gave the most reliable and balanced performance among supervised models.")
print("* SVD (Matrix Factorization) worked effectively by learning hidden relationships between users and products.")
print("---")
print("Next Steps: Implement a Hybrid System  using these models for production.")

WORKFLOW COMPLETE: All models trained and evaluated.

Summary of Findings :
* Hybrid Solution: A combination of Collaborative Filtering, SVD, and Logistic Regression provides the strongest performance.
* Logistic Regression  gave the most reliable and balanced performance among supervised models.
* SVD (Matrix Factorization) worked effectively by learning hidden relationships between users and products.
---
Next Steps: Implement a Hybrid System  using these models for production.
