In [1]:
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [9]:

# Load the raw transaction workbook into a DataFrame. The path is relative, so
# the file is looked up in the kernel's current working directory.
df = pd.read_excel(io="Online Retail.xlsx")


In [3]:

# Number of leading transaction rows kept for the cells that follow.
SAMPLE_ROWS = 10000

# --- Clean the raw transactions ---------------------------------------------
# The whole clean-up is a single chain in which every step returns a new
# DataFrame. Assigning columns onto a boolean-filtered slice (as a sequence of
# `df = df[mask]` / `df[col] = ...` statements does) is the pattern pandas
# flags with SettingWithCopyWarning; `assign` avoids it entirely.
df = (
    df
    # Invoice numbers that cannot be parsed as numbers become NaN here and are
    # removed by the dropna() that follows.
    .assign(InvoiceNo=lambda d: pd.to_numeric(d['InvoiceNo'], errors='coerce'))
    # Drops rows with a missing value in ANY column, not only InvoiceNo.
    .dropna()
    # Keep rows with a non-negative invoice number and a non-negative quantity.
    .loc[lambda d: (d['InvoiceNo'] >= 0) & (d['Quantity'] >= 0)]
    # Later keyword arguments see the columns produced by earlier ones, so the
    # date parts below are derived from the already-converted InvoiceDate.
    .assign(
        InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']),
        CustomerID=lambda d: d['CustomerID'].astype(str),  # ensure consistency
        line_total=lambda d: d['Quantity'] * d['UnitPrice'],
        day=lambda d: d['InvoiceDate'].dt.day_name(),
        month=lambda d: d['InvoiceDate'].dt.month,
        year=lambda d: d['InvoiceDate'].dt.year,
        time_24h=lambda d: d['InvoiceDate'].dt.strftime('%H:%M'),
    )
)

# --- Per-customer aggregates ------------------------------------------------
# Computed on the full cleaned data, i.e. before the row sample taken below.
# Sum of line totals per customer.
total_spend = (
    df.groupby('CustomerID')['line_total']
    .sum()
    .rename('total_spend')
    .reset_index()
)
# Number of distinct invoices per customer.
purchase_freq = (
    df.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .rename('purchase_frequency')
    .reset_index()
)
# One row per customer: total spend, invoice count and spend per invoice.
customer_df = total_spend.merge(purchase_freq, on='CustomerID').assign(
    avg_order_value=lambda c: c['total_spend'] / c['purchase_frequency']
)

# Keep only the first SAMPLE_ROWS cleaned transactions from here on.
df = df.head(SAMPLE_ROWS)
df["month"]

0        12
1        12
2        12
3        12
4        12
         ..
14329    12
14330    12
14331    12
14393    12
14394    12
Name: month, Length: 10000, dtype: int32

In [4]:

# Columns that make up one sample for clustering.
# NOTE(review): InvoiceNo and CustomerID look like identifiers rather than
# measurements; confirm that using them as clustering features is intended.
FEATURE_COLUMNS = ["InvoiceNo", "Quantity", "UnitPrice", "CustomerID", "line_total", "month"]

# CustomerID is stored as str by the cleaning cell, so the selection is cast to
# float64 explicitly. Without the cast, normalize() below only works because
# scikit-learn silently coerces the mixed object array to floats; the explicit
# cast yields the same numbers and fails here, with a clear error, if a value
# is not numeric.
features = df[FEATURE_COLUMNS].dropna().astype('float64')

# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test = train_test_split(features, test_size=0.33, random_state=0)

# normalize() with its defaults rescales each ROW (sample) to unit L2 norm; it
# does not scale columns to a common range.
# NOTE(review): confirm row-wise normalisation is what is wanted here rather
# than per-feature scaling.
X_train_norm = normalize(X_train)
X_test_norm = normalize(X_test)

In [5]:
# Containers shared with the model-fitting cell: `fits` collects fitted
# estimators and `scores` the matching silhouette scores, in the same order.
# K is only assigned in this cell, not used by it.
K = 2
fits, scores = [], []

In [6]:
# Fit a single KMeans model for one cluster count and record its silhouette
# score on the normalised training rows.
#
# To compare several cluster counts instead, reset `fits` and `scores` to empty
# lists and run the statements after `k = ...` inside
# `for k in range(2, 8):` (k = 2 .. 7).
k = 5

# random_state pins the centroid initialisation; n_init=1 runs it only once.
model = KMeans(
    n_clusters=k,
    random_state=1,
    n_init=1,
)
model.fit(X_train_norm)
fits.append(model)

# Silhouette score of the training rows under the fitted cluster labels.
score = silhouette_score(X_train_norm, model.labels_)
scores.append(score)
print(f"k={k}, Silhouette Score={score:.3f}")



k=5, Silhouette Score=0.645


In [8]:
# Export the fitted KMeans estimator to ONNX and write it to model.onnx.
# The graph takes one float tensor named 'float_input' with a free batch
# dimension (None) and one column per feature.
# NOTE(review): only `model` is converted, so the row normalisation applied
# before training is not part of the exported graph; presumably callers must
# normalise their inputs the same way - confirm.
input_width = features.shape[1]
initial_type = [
    ('float_input', FloatTensorType([None, input_width])),
]
onx = convert_sklearn(model, initial_types=initial_type)

# Binary mode: the serialised ONNX model is a bytes object.
with open("model.onnx", "wb") as out_file:
    out_file.write(onx.SerializeToString())
print("✅ ONNX model saved as model.onnx")

✅ ONNX model saved as model.onnx
