## using model for sementation


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, āhere's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.

import time, warnings
import datetime as dt

#visualizations
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline
import seaborn as sns

warnings.filterwarnings("ignore")

In [3]:
#load the dataset
retail_df = pd.read_csv('data.csv',encoding="ISO-8859-1",dtype={'CustomerID': str,'InvoiceID': str})
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom


In [4]:
#remove canceled orders
retail_uk = retail_df[retail_df['Quantity']>0]
retail_uk.shape

(531285, 8)

In [5]:
#remove rows where customerID are NA
retail_uk.dropna(subset=['CustomerID'],how='all',inplace=True)
retail_uk.shape

(397924, 8)

In [6]:
#restrict the data to one full year because it's better to use a metric per Months or Years in RFM
retail_uk = retail_uk[retail_uk['InvoiceDate']>= "2010-12-09"]
retail_uk.shape

(199198, 8)

## first imp **

In [8]:
print("Summary..")
#exploring the unique values of each attribute
print("Number of transactions: ", retail_uk['InvoiceNo'].nunique())
print("Number of products bought: ",retail_uk['StockCode'].nunique())
print("Number of customers:", retail_uk['CustomerID'].nunique() )
print("Percentage of customers NA: ", round(retail_uk['CustomerID'].isnull().sum() * 100 / len(retail_df),2),"%" )

Summary..
Number of transactions:  9786
Number of products bought:  3324
Number of customers: 3165
Percentage of customers NA:  0.0 %


## RFM start


In [10]:
retail_uk['InvoiceDate'].max()

'9/9/2011 9:52'

In [11]:
now = dt.date(2011,12,9)


In [12]:
#create a new column called date which contains the date of invoice only
retail_uk['date'] = pd.DatetimeIndex(retail_uk['InvoiceDate']).date

In [13]:
retail_uk.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date
105335,545220,21955,DOORMAT UNION JACK GUNS AND ROSES,2,3/1/2011 8:30,7.95,14620,United Kingdom,2011-03-01
105336,545220,48194,DOORMAT HEARTS,2,3/1/2011 8:30,7.95,14620,United Kingdom,2011-03-01
105337,545220,22556,PLASTERS IN TIN CIRCUS PARADE,12,3/1/2011 8:30,1.65,14620,United Kingdom,2011-03-01
105338,545220,22139,RETROSPOT TEA SET CERAMIC 11 PC,3,3/1/2011 8:30,4.95,14620,United Kingdom,2011-03-01
105339,545220,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,4,3/1/2011 8:30,3.75,14620,United Kingdom,2011-03-01


In [14]:
#group by customers and check last date of purshace
recency_df = retail_uk.groupby(by='CustomerID', as_index=False)['date'].max()
recency_df.columns = ['CustomerID','LastPurshaceDate']
recency_df.head()

Unnamed: 0,CustomerID,LastPurshaceDate
0,12347,2011-08-02
1,12348,2011-09-25
2,12352,2011-09-28
3,12353,2011-05-19
4,12354,2011-04-21


In [15]:

#calculate recency
recency_df['Recency'] = recency_df['LastPurshaceDate'].apply(lambda x: (now - x).days)

In [16]:
#drop LastPurchaseDate as we don't need it anymore
recency_df.drop('LastPurshaceDate',axis=1,inplace=True)

In [17]:
# drop duplicates
retail_uk_copy = retail_uk
retail_uk_copy.drop_duplicates(subset=['InvoiceNo', 'CustomerID'], keep="first", inplace=True)
#calculate frequency of purchases
frequency_df = retail_uk_copy.groupby(by=['CustomerID'], as_index=False)['InvoiceNo'].count()
frequency_df.columns = ['CustomerID','Frequency']
frequency_df.head()

Unnamed: 0,CustomerID,Frequency
0,12347,3
1,12348,2
2,12352,6
3,12353,1
4,12354,1


In [18]:
#create column total cost
retail_uk['TotalCost'] = retail_uk['Quantity'] * retail_uk['UnitPrice']

In [19]:
monetary_df = retail_uk.groupby(by='CustomerID',as_index=False).agg({'TotalCost': 'sum'})
monetary_df.columns = ['CustomerID','Monetary']
monetary_df.head()

Unnamed: 0,CustomerID,Monetary
0,12347,55.16
1,12348,250.0
2,12352,498.8
3,12353,19.9
4,12354,20.8


In [20]:
#merge recency dataframe with frequency dataframe
temp_df = recency_df.merge(frequency_df,on='CustomerID')
temp_df.head()

Unnamed: 0,CustomerID,Recency,Frequency
0,12347,129,3
1,12348,75,2
2,12352,72,6
3,12353,204,1
4,12354,232,1


In [21]:
#merge with monetary dataframe to get a table with the 3 columns
rfm_df = temp_df.merge(monetary_df,on='CustomerID')
#use CustomerID as index
rfm_df.set_index('CustomerID',inplace=True)
#check the head
rfm_df.head()
temp = rfm_df.copy()


Unnamed: 0_level_0,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12347,129,3,55.16
12348,75,2,250.00
12352,72,6,498.80
12353,204,1,19.90
12354,232,1,20.80
...,...,...,...
18280,277,1,23.70
18281,180,1,5.04
18282,126,1,12.75
18283,95,7,35.95


## Second Imp **

In [22]:
temp

Unnamed: 0_level_0,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12347,129,3,55.16
12348,75,2,250.00
12352,72,6,498.80
12353,204,1,19.90
12354,232,1,20.80
...,...,...,...
18280,277,1,23.70
18281,180,1,5.04
18282,126,1,12.75
18283,95,7,35.95


## Loading model and using it for prediction

In [25]:
from tensorflow.keras.models import load_model
model = load_model('RFMSegmentationModel.keras')

In [26]:
predictions = model.predict(temp)



In [28]:
import joblib
# Load the label encoder
le = joblib.load('label_encoder.pkl')

In [29]:
# Get the index of the maximum probability
max_prob_indices = np.argmax(predictions, axis=1)

# Transform the indices back to original form
original_labels = le.inverse_transform(max_prob_indices)

In [30]:
temp['Labels'] =original_labels

## Third Imp ***

In [31]:
temp

Unnamed: 0_level_0,Recency,Frequency,Monetary,Labels
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12347,129,3,55.16,Lost Customers
12348,75,2,250.00,Other
12352,72,6,498.80,Best Customers
12353,204,1,19.90,Lost Customers
12354,232,1,20.80,Lost Customers
...,...,...,...,...
18280,277,1,23.70,Lost Customers
18281,180,1,5.04,Lost Customers
18282,126,1,12.75,Lost Customers
18283,95,7,35.95,Other


In [33]:
temp['Labels'].unique()

array(['Lost Customers', 'Other', 'Best Customers', 'Almost Lost',
       'Loyal Customers', 'Big Spenders'], dtype=object)

In [34]:
unique_count = temp['Labels'].nunique()
print(unique_count)

6


## Fourth Imp ***

In [35]:
unique_counts = temp['Labels'].value_counts()
print(unique_counts)

Labels
Lost Customers     1630
Almost Lost         595
Other               385
Best Customers      381
Big Spenders        110
Loyal Customers      64
Name: count, dtype: int64
