In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

In [2]:
# Load the raw transaction workbook into a DataFrame (path is relative to
# the notebook's working directory).
df = pd.read_excel(io="datasets/Online Retail.xlsx")

In [3]:
# --- Clean the raw transactions -------------------------------------------
# Invoice numbers that cannot be parsed as numbers become NaN and are then
# removed, together with every other row containing a missing value.
df = df.assign(InvoiceNo=pd.to_numeric(df['InvoiceNo'], errors='coerce')).dropna()

# Keep only non-negative invoice numbers and quantities. The explicit
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice (avoids pandas' SettingWithCopyWarning).
df = df[(df['InvoiceNo'] >= 0) & (df['Quantity'] >= 0)].copy()

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype(str)  # ensure consistency

# --- Derived columns --------------------------------------------------------
df['line_total'] = df['Quantity'] * df['UnitPrice']       # value of one invoice line
df['day'] = df['InvoiceDate'].dt.day_name()               # weekday name
df['month'] = df['InvoiceDate'].dt.month
df['year'] = df['InvoiceDate'].dt.year
df['time_24h'] = df['InvoiceDate'].dt.strftime('%H:%M')   # 24-hour clock, "HH:MM"

In [4]:
# Per-customer aggregates: total spend, number of distinct invoices, and
# the resulting average value per invoice.
total_spend = (
    df.groupby('CustomerID')['line_total']
    .sum()
    .reset_index(name='total_spend')
)
purchase_freq = (
    df.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .reset_index(name='purchase_frequency')
)

customer_df = pd.merge(total_spend, purchase_freq, on='CustomerID')
customer_df['avg_order_value'] = customer_df['total_spend'].div(
    customer_df['purchase_frequency']
)

In [5]:
# Work with the first 10,000 cleaned rows from here on. The per-customer
# aggregates computed earlier were built from the full cleaned frame.
# NOTE(review): presumably this cap keeps clustering fast — confirm.
df = df.iloc[:10000]

In [6]:
# Columns fed to the clustering model.
# NOTE(review): InvoiceNo and CustomerID look like identifiers rather than
# behavioural measurements — confirm they are meant to be model features.
feature_cols = ["InvoiceNo", "Quantity", "UnitPrice", "CustomerID", "line_total", "month"]

# CustomerID was cast to str during cleaning; cast every feature column to
# float explicitly rather than relying on scikit-learn to coerce the
# string column implicitly.
features = df[feature_cols].dropna().astype(float)

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(features, test_size=0.33, random_state=0)

# normalize() with default arguments rescales each ROW to unit L2 norm; it
# does not standardise columns. Anything that scores new data with the
# fitted model must apply the same row-wise normalisation first.
# NOTE(review): with row-wise scaling the largest-magnitude columns
# dominate each vector — confirm this is the intended preprocessing.
X_train_norm = normalize(X_train)
X_test_norm = normalize(X_test)

In [7]:
# Sweep candidate cluster counts, keeping every fitted model alongside its
# silhouette score on the normalised training data.
fits, scores = [], []

for k in range(2, 8):  # k = 2 .. 7 inclusive
    model = KMeans(n_clusters=k, n_init=1, random_state=1)
    model.fit(X_train_norm)
    fits.append(model)

    score = silhouette_score(X_train_norm, model.labels_)
    scores.append(score)
    print(f"k={k}, Silhouette Score={score:.3f}")

k=2, Silhouette Score=0.614
k=3, Silhouette Score=0.643
k=4, Silhouette Score=0.637
k=5, Silhouette Score=0.645
k=6, Silhouette Score=0.632
k=7, Silhouette Score=0.592


In [8]:
# Fit the final model with k=5, which had the highest silhouette score in
# the sweep above. The fixed random_state makes this refit reproduce the
# sweep's k=5 model.
#
# The result is deliberately NOT appended to `fits` / `scores`: those lists
# hold exactly one entry per k from the sweep, and appending here would add
# a duplicate k=5 entry (and another one on every re-run of this cell).
k = 5
model = KMeans(n_clusters=k, random_state=1, n_init=1).fit(X_train_norm)
score = silhouette_score(X_train_norm, model.labels_)
print(f"k={k}, Silhouette Score={score:.3f}")

k=5, Silhouette Score=0.645


In [None]:
# Export the fitted KMeans model to ONNX.
# The graph takes one float tensor named 'float_input' with a dynamic batch
# dimension and one column per feature. Only the KMeans step is exported:
# the model was fitted on row-normalised data, so callers must apply the
# same normalisation before running inference.
onnx_path = Path("models/model.onnx")
onnx_path.parent.mkdir(parents=True, exist_ok=True)  # don't fail on a fresh checkout

initial_type = [('float_input', FloatTensorType([None, features.shape[1]]))]
onx = convert_sklearn(model, initial_types=initial_type)
onnx_path.write_bytes(onx.SerializeToString())
print("✅ ONNX model saved as model.onnx")

✅ ONNX model saved as model.onnx
