In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [83]:
df = pd.read_csv('supermarket.csv')
df.tail(3)

Unnamed: 0,SHOP_WEEK,SHOP_DATE,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,PROD_CODE,PROD_CODE_10,PROD_CODE_20,PROD_CODE_30,...,CUST_PRICE_SENSITIVITY,CUST_LIFESTAGE,BASKET_ID,BASKET_SIZE,BASKET_PRICE_SENSITIVITY,BASKET_TYPE,BASKET_DOMINANT_MISSION,STORE_CODE,STORE_FORMAT,STORE_REGION
578079,200637,20061109,5,16,1,4.47,PRD0900396,CL00229,DEP00081,G00027,...,UM,,994103100229855,L,MM,Full Shop,Mixed,STORE00001,LS,E02
578080,200717,20070624,1,10,1,4.92,PRD0903065,CL00229,DEP00081,G00027,...,UM,YF,994106300437841,L,MM,Full Shop,Mixed,STORE00001,LS,E02
578081,200744,20071229,7,14,1,9.27,PRD0901923,CL00229,DEP00081,G00027,...,UM,,994109000359411,L,MM,Top Up,Mixed,STORE00001,LS,E02


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 578082 entries, 0 to 578081
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SHOP_WEEK                 578082 non-null  int64  
 1   SHOP_DATE                 578082 non-null  int64  
 2   SHOP_WEEKDAY              578082 non-null  int64  
 3   SHOP_HOUR                 578082 non-null  int64  
 4   QUANTITY                  578082 non-null  int64  
 5   SPEND                     578082 non-null  float64
 6   PROD_CODE                 578082 non-null  object 
 7   PROD_CODE_10              578082 non-null  object 
 8   PROD_CODE_20              578082 non-null  object 
 9   PROD_CODE_30              578082 non-null  object 
 10  PROD_CODE_40              578082 non-null  object 
 11  CUST_CODE                 492494 non-null  object 
 12  CUST_PRICE_SENSITIVITY    492494 non-null  object 
 13  CUST_LIFESTAGE            433142 non-null  o

In [85]:
# Found missing values on CUST_CODE, CUST_PRICE_SENSITIVITY, CUST_LIFESTAGE columns
df.isna().sum()

SHOP_WEEK                        0
SHOP_DATE                        0
SHOP_WEEKDAY                     0
SHOP_HOUR                        0
QUANTITY                         0
SPEND                            0
PROD_CODE                        0
PROD_CODE_10                     0
PROD_CODE_20                     0
PROD_CODE_30                     0
PROD_CODE_40                     0
CUST_CODE                    85588
CUST_PRICE_SENSITIVITY       85588
CUST_LIFESTAGE              144940
BASKET_ID                        0
BASKET_SIZE                      0
BASKET_PRICE_SENSITIVITY         0
BASKET_TYPE                      0
BASKET_DOMINANT_MISSION          0
STORE_CODE                       0
STORE_FORMAT                     0
STORE_REGION                     0
dtype: int64

In [86]:
# Clean missing values
df.dropna(subset=['CUST_CODE'], inplace=True)
df['CUST_LIFESTAGE'] = df['CUST_LIFESTAGE'].fillna('XX')
df.isna().sum()

SHOP_WEEK                   0
SHOP_DATE                   0
SHOP_WEEKDAY                0
SHOP_HOUR                   0
QUANTITY                    0
SPEND                       0
PROD_CODE                   0
PROD_CODE_10                0
PROD_CODE_20                0
PROD_CODE_30                0
PROD_CODE_40                0
CUST_CODE                   0
CUST_PRICE_SENSITIVITY      0
CUST_LIFESTAGE              0
BASKET_ID                   0
BASKET_SIZE                 0
BASKET_PRICE_SENSITIVITY    0
BASKET_TYPE                 0
BASKET_DOMINANT_MISSION     0
STORE_CODE                  0
STORE_FORMAT                0
STORE_REGION                0
dtype: int64

In [87]:
df[['QUANTITY', 'SPEND']].describe().apply(lambda x: x.apply('{0:.5f}'.format)).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
QUANTITY,492494.0,1.52764,1.57281,1.0,1.0,1.0,1.0,129.0
SPEND,492494.0,1.92788,2.69127,0.01,0.77,1.24,2.11,346.46


In [88]:
# clean data type
df['SHOP_DATE'] = pd.to_datetime(df['SHOP_DATE'], format='%Y%m%d')
df['SHOP_MONTH'] = df['SHOP_DATE'].dt.to_period('m')

In [89]:
print(f"min date :{df['SHOP_DATE'].min()}")
print(f"max date :{df['SHOP_DATE'].max()}")

min date :2006-04-10 00:00:00
max date :2008-07-06 00:00:00


In [90]:
df_2008q1 = df.loc[(df['SHOP_MONTH']>='2008-01') & (df['SHOP_MONTH']<='2008-03')]
df_2008m4 = df.loc[(df['SHOP_MONTH']=='2008-04')][['CUST_CODE']].drop_duplicates()

In [119]:
# create feature df
df_f = df_2008q1[['CUST_CODE']].drop_duplicates().reset_index(drop=True)

df_f = df_f.merge(
    df_2008q1.groupby(['CUST_CODE', 'BASKET_ID']).agg(
        spend = ('SPEND', 'sum'),
        date_min = ('SHOP_DATE', 'min'),
        date_max = ('SHOP_DATE', 'max')).reset_index().groupby('CUST_CODE').agg(bkt_size = ('spend', 'mean'), 
                                                                    num_trans = ('spend', 'count'),
                                                                    total_spend = ('spend', 'sum'),
                                                                    date_min = ('date_min', 'min'),
                                                                    date_max = ('date_max', 'max')
                                                                    ),
    how='left', on='CUST_CODE'
)

df_f = df_f.merge(
    df_2008q1.groupby('CUST_CODE').agg(
        num_date = ('SHOP_DATE', 'nunique'),
        num_week = ('SHOP_WEEK', 'nunique')),
    how='left', on='CUST_CODE'
)

df_f = df_f.merge(
    df_2008q1.loc[df_2008q1['SHOP_MONTH'] > (df_2008q1['SHOP_MONTH'].max() - 1), :].groupby(['CUST_CODE', 'BASKET_ID']).agg(
        spend = ('SPEND', 'sum')).reset_index().groupby('CUST_CODE').agg(bkt_size_1m = ('spend', 'mean'), 
                                                                    num_trans_1m = ('spend', 'count'),
                                                                    total_spend_1m = ('spend', 'sum')
                                                                    ),
    how='left', on='CUST_CODE'
)

df_f = df_f.merge(
    df_2008q1.loc[df_2008q1['SHOP_MONTH'] > (df_2008q1['SHOP_MONTH'].max() - 2), :].groupby(['CUST_CODE', 'BASKET_ID']).agg(
        spend = ('SPEND', 'sum')).reset_index().groupby('CUST_CODE').agg(bkt_size_2m = ('spend', 'mean'), 
                                                                    num_trans_2m = ('spend', 'count'),
                                                                    total_spend_2m = ('spend', 'sum')
                                                                    ),
    how='left', on='CUST_CODE'
)

df_f['tbp'] = (df_f['date_max']-df_f['date_min']).dt.days / df_f['num_trans']
df_f['recency'] = (df_2008q1['SHOP_DATE'].max()-df_f['date_max']).dt.days

df_f.drop(columns=['date_min', 'date_max'], inplace=True)

# fill null values
df_f = df_f.fillna(0)

df_f.tail(3)

Unnamed: 0,CUST_CODE,bkt_size,num_trans,total_spend,num_date,num_week,bkt_size_1m,num_trans_1m,total_spend_1m,bkt_size_2m,num_trans_2m,total_spend_2m,tbp,recency
1302,CUST0000218304,1.54,1,1.54,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67
1303,CUST0000280015,1.11,1,1.11,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75
1304,CUST0000259310,0.03,1,0.03,1,1,0.03,1.0,0.03,0.03,1.0,0.03,0.0,19


In [120]:
df_f.describe()

Unnamed: 0,bkt_size,num_trans,total_spend,num_date,num_week,bkt_size_1m,num_trans_1m,total_spend_1m,bkt_size_2m,num_trans_2m,total_spend_2m,tbp,recency
count,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0
mean,13.655547,5.629119,77.47846,5.245211,3.796935,9.597346,1.996935,28.201241,11.904624,3.817625,53.289027,5.929084,27.231418
std,14.512916,8.344504,122.952007,7.126354,3.780113,14.535293,3.1097,48.500525,14.609769,5.653369,86.149215,7.227038,25.826553
min,0.01,1.0,0.01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.34,1.0,5.64,1.0,1.0,0.0,0.0,0.0,1.37,1.0,1.54,0.0,5.0
50%,9.212222,2.0,25.82,2.0,2.0,3.145,1.0,4.4,6.76,1.0,16.44,3.913043,19.0
75%,19.35,6.0,86.15,6.0,5.0,13.815,2.0,36.59,16.68,4.0,60.94,9.833333,45.0
max,154.87,78.0,870.98,52.0,14.0,107.88,26.0,397.68,123.24,47.0,616.94,43.0,90.0


In [121]:
df_2008m4['purchase'] = 1
df_f = pd.merge(df_f, df_2008m4[['CUST_CODE', 'purchase']], how='left', on='CUST_CODE')
df_f['purchase'] = df_f['purchase'].fillna(0)
df_f.tail(3)

Unnamed: 0,CUST_CODE,bkt_size,num_trans,total_spend,num_date,num_week,bkt_size_1m,num_trans_1m,total_spend_1m,bkt_size_2m,num_trans_2m,total_spend_2m,tbp,recency,purchase
1302,CUST0000218304,1.54,1,1.54,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67,0.0
1303,CUST0000280015,1.11,1,1.11,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75,1.0
1304,CUST0000259310,0.03,1,0.03,1,1,0.03,1.0,0.03,0.03,1.0,0.03,0.0,19,0.0


In [122]:
X = df_f.drop(columns=['CUST_CODE', 'purchase'])
y = df_f['purchase']

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.67      0.74      0.70       122
         1.0       0.75      0.68      0.71       139

    accuracy                           0.71       261
   macro avg       0.71      0.71      0.71       261
weighted avg       0.71      0.71      0.71       261



In [124]:
# Create a Random Forest classifier
clf = RandomForestClassifier()

# Train the classifier on your data
clf.fit(X, y)

# Get the feature importance scores
importance_scores = clf.feature_importances_

# Sort the features based on importance in descending order
sorted_indices = np.argsort(importance_scores)[::-1]

# Print the feature rankings
for i, idx in enumerate(sorted_indices):
    print(f"Rank {i+1}: Feature {df_f.columns[idx+1]}, Importance score: {importance_scores[idx]}")

Rank 1: Feature total_spend, Importance score: 0.13462369141229288
Rank 2: Feature bkt_size, Importance score: 0.1160392709157522
Rank 3: Feature recency, Importance score: 0.0979680190882244
Rank 4: Feature num_trans, Importance score: 0.0814271070082891
Rank 5: Feature tbp, Importance score: 0.079378064155758
Rank 6: Feature num_week, Importance score: 0.07869217885235823
Rank 7: Feature total_spend_2m, Importance score: 0.07793115007424935
Rank 8: Feature bkt_size_2m, Importance score: 0.07396457915449381
Rank 9: Feature num_date, Importance score: 0.06739099285625309
Rank 10: Feature num_trans_2m, Importance score: 0.06230845727818195
Rank 11: Feature total_spend_1m, Importance score: 0.056465607953543474
Rank 12: Feature bkt_size_1m, Importance score: 0.04372170207237619
Rank 13: Feature num_trans_1m, Importance score: 0.030089179178227236
