# 🧠 Step 2: Feature Engineering – Behavioural Scores
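This notebook creates six behavioural features per customer from their transaction history (Recency, Frequency, Monetary, ProductVariety, Top300Hits and Returns), then log-transforms and min-max scales them and saves the result to `customer_behaviour_scores.csv`.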

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Load the cleaned transactions and coerce the two columns whose dtypes the
# later cells rely on: InvoiceDate is parsed to datetimes (needed for the
# recency subtraction) and Customer ID is cast to int (astype(int) raises if
# any id is missing, so the input file must not contain blank customer ids).
df = pd.read_csv('online_retail_cleaned.csv').assign(**{
    'InvoiceDate': lambda frame: pd.to_datetime(frame['InvoiceDate']),
    'Customer ID': lambda frame: frame['Customer ID'].astype(int),
})

In [4]:
# Compute behavioural features

# Recency is measured against the most recent timestamp in the data itself.
ref_date = df['InvoiceDate'].max()

# The 300 stock codes with the largest summed Quantity over all transactions.
quantity_by_code = df.groupby('StockCode')['Quantity'].sum()
top_300 = quantity_by_code.nlargest(300).index

# Per customer: number of rows whose Quantity is negative.
# NOTE(review): if the cleaned file already excludes negative quantities this
# series is empty and the Returns feature is 0 for everyone. Confirm.
negative_rows = df[df['Quantity'] < 0]
returns = negative_rows.groupby('Customer ID')['Quantity'].count()

# One row per customer. Named aggregation ties each output column to its
# source column and reducer, so no positional renaming is needed afterwards.
behaviour = df.groupby('Customer ID').agg(
    # whole days between the customer's latest invoice date and ref_date
    Recency=('InvoiceDate', lambda dates: (ref_date - dates.max()).days),
    # number of distinct invoices
    Frequency=('Invoice', 'nunique'),
    # sum of TotalPrice over all of the customer's rows
    Monetary=('TotalPrice', 'sum'),
    # number of distinct stock codes
    ProductVariety=('StockCode', 'nunique'),
    # number of rows (not distinct products) whose stock code is in top_300
    Top300Hits=('StockCode', lambda codes: codes.isin(top_300).sum()),
)

# Customers with no negative-quantity rows are absent from `returns`; the map
# leaves NaN for them, which becomes 0.
behaviour['Returns'] = behaviour.index.map(returns).fillna(0)

In [5]:
# Normalize features
# Each feature is compressed with log1p and then min-max scaled, so every
# column of the exported table lies in [0, 1].
log_cols = ['Recency', 'Frequency', 'Monetary', 'ProductVariety', 'Top300Hits', 'Returns']

# log1p yields NaN below -1 and -inf at exactly -1. Monetary is a plain sum of
# TotalPrice, and this notebook filters on Quantity < 0 elsewhere, so a
# net-negative customer is plausible (NOTE(review): confirm against the
# cleaned data). Flooring at 0 keeps the transform defined for every row:
# otherwise NaNs would pass through MinMaxScaler into the CSV, and an -inf
# would make fit_transform raise. For non-negative inputs the clip is a no-op.
behaviour_log = np.log1p(behaviour[log_cols].clip(lower=0))

scaler = MinMaxScaler()
behaviour_scaled = pd.DataFrame(
    scaler.fit_transform(behaviour_log),
    columns=log_cols,
    index=behaviour.index,
).reset_index()  # move 'Customer ID' from the index into a regular column

# index=False: the customer id is already a column, so the positional index
# is not written to the file.
behaviour_scaled.to_csv('customer_behaviour_scores.csv', index=False)
behaviour_scaled.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,ProductVariety,Top300Hits,Returns
0,12346,0.876099,0.353449,0.830763,0.369042,0.219905,0.0
1,12347,0.104938,0.284012,0.599042,0.580476,0.504534,0.0
2,12348,0.65364,0.207449,0.5242,0.358679,0.421458,0.0
3,12349,0.445769,0.173022,0.590176,0.593102,0.477649,0.0
4,12350,0.86848,0.0,0.373283,0.307257,0.219905,0.0
