In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' matplotlib style sheet to figures drawn after this call.
plt.style.use('fivethirtyeight')

In [2]:
"""
Given the following dataset, classify customer value based on order frequency. 

You can assign them a score of 1-3, with 1 being least valuable and 3 being most valuable.

For example, a customer who has made 10 purchases is more valuable than one that has made 
2 purchases across the same time period.

For more information regarding customer lifetime value you can read this post.
https://www.shopify.com/blog/customer-lifetime-value
"""

# Load the raw retail export into a DataFrame.
data = pd.read_csv('./data/Online_Retail.csv')
# Preview 10 random rows; the fixed seed makes the preview identical on
# every Restart & Run All instead of changing with each execution.
data.sample(10, random_state=42)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
10209,537235,22659,LUNCH BOX I LOVE LONDON,324,12/6/10 9:45,1.65,17381.0,United Kingdom
26803,538524,20754,RETROSPOT RED WASHING UP GLOVES,4,12/13/10 9:35,4.21,,United Kingdom
41936,539958,22031,BOTANICAL LAVENDER BIRTHDAY CARD,11,12/23/10 13:26,0.43,,United Kingdom
21523,538093,20679,EDWARDIAN PARASOL RED,12,12/9/10 14:49,5.95,12682.0,France
2927,536592,82552,WASHROOM METAL SIGN,1,12/1/10 17:06,2.51,,United Kingdom
52302,540703,20718,RED RETROSPOT SHOPPER BAG,10,1/11/11 9:58,1.25,14135.0,United Kingdom
19083,537830,22423,REGENCY CAKESTAND 3 TIER,1,12/8/10 14:46,12.75,13649.0,United Kingdom
2998,536592,85123A,WHITE HANGING HEART T-LIGHT HOLDER,9,12/1/10 17:06,5.91,,United Kingdom
16231,537645,DOT,DOTCOM POSTAGE,1,12/7/10 15:34,607.96,,United Kingdom
29687,538814,22525,CHILDRENS GARDEN GLOVES PINK,1,12/14/10 12:30,1.25,15983.0,United Kingdom


In [3]:
# (rows, columns) of the raw frame.
data.shape

(65499, 8)

In [4]:
# Per-column dtype and non-null count; a non-null count below the row total
# marks a column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65499 entries, 0 to 65498
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    65499 non-null  object 
 1   StockCode    65499 non-null  object 
 2   Description  65333 non-null  object 
 3   Quantity     65499 non-null  int64  
 4   InvoiceDate  65499 non-null  object 
 5   UnitPrice    65499 non-null  float64
 6   CustomerID   40218 non-null  float64
 7   Country      65499 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 4.0+ MB


In [5]:
# Keep only the rows that carry a CustomerID. The explicit .copy() detaches
# the subset from `data`, so columns can be added to it in later cells.
data_cust = data.loc[data['CustomerID'].notna()].copy()
# Check that every remaining column is fully populated.
data_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40218 entries, 0 to 65101
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    40218 non-null  object 
 1   StockCode    40218 non-null  object 
 2   Description  40218 non-null  object 
 3   Quantity     40218 non-null  int64  
 4   InvoiceDate  40218 non-null  object 
 5   UnitPrice    40218 non-null  float64
 6   CustomerID   40218 non-null  float64
 7   Country      40218 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 2.8+ MB


In [6]:
# Number of rows per customer, largest first. An InvoiceNo can repeat across
# rows (one row per product line), so these are line-item counts and not the
# number of distinct orders a customer placed.
data_cust['CustomerID'].value_counts()

12748.0    695
17841.0    481
14606.0    421
15311.0    418
14911.0    377
          ... 
17720.0      1
16532.0      1
14206.0      1
14608.0      1
18141.0      1
Name: CustomerID, Length: 1204, dtype: int64

In [7]:
# Monetary value of each row: units bought times the price per unit.
data_cust['TotalPurchase'] = data_cust['Quantity'] * data_cust['UnitPrice']
# Parse timestamps such as '12/6/10 9:45' with an explicit
# month/day/two-digit-year layout rather than relying on format inference,
# so a value in any other layout fails loudly instead of being guessed.
# pd.to_datetime returns a new Series, so the input needs no defensive copy.
data_cust['InvoiceDate'] = pd.to_datetime(data_cust['InvoiceDate'], format='%m/%d/%y %H:%M')

In [8]:
# Re-inspect the schema: InvoiceDate should now be datetime64[ns] and the
# new TotalPurchase column should be present.
data_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40218 entries, 0 to 65101
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   InvoiceNo      40218 non-null  object        
 1   StockCode      40218 non-null  object        
 2   Description    40218 non-null  object        
 3   Quantity       40218 non-null  int64         
 4   InvoiceDate    40218 non-null  datetime64[ns]
 5   UnitPrice      40218 non-null  float64       
 6   CustomerID     40218 non-null  float64       
 7   Country        40218 non-null  object        
 8   TotalPurchase  40218 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 3.1+ MB


In [9]:
# Narrow the frame to the customer, quantity, date and price columns.
# The explicit .copy() makes the result an independent frame, so assigning
# to it in later cells does not raise SettingWithCopyWarning.
data_cust_ = data_cust[['CustomerID', 'Quantity', 'InvoiceDate', 'UnitPrice', 'TotalPurchase']].copy()
# Seeded preview so the displayed rows are the same on every run.
data_cust_.sample(20, random_state=42)

Unnamed: 0,CustomerID,Quantity,InvoiceDate,UnitPrice,TotalPurchase
64830,15379.0,10,2011-01-20 14:01:00,1.65,16.5
33440,16316.0,12,2010-12-16 14:26:00,1.95,23.4
28587,17293.0,24,2010-12-13 14:55:00,1.25,30.0
38291,12944.0,4,2010-12-20 10:41:00,3.75,15.0
39738,17961.0,1,2010-12-21 10:26:00,16.95,16.95
23175,16265.0,20,2010-12-10 10:21:00,1.25,25.0
25205,14062.0,48,2010-12-10 15:56:00,1.25,60.0
51598,17406.0,4,2011-01-10 14:57:00,2.95,11.8
43671,13126.0,8,2011-01-05 09:11:00,1.95,15.6
38644,13319.0,2,2010-12-20 13:31:00,7.95,15.9
