In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./dataset/online_retail_cleaned.csv')

In [3]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [4]:
df['Quarter'] = df['InvoiceDate'].dt.to_period('Q')

In [5]:
df['Revenue'] = df['Quantity'] * df['Price']

In [6]:
revenue_per_customer = df.groupby(['Customer ID', 'Quarter'])['Revenue'].sum().reset_index()


In [7]:
order_per_customer =  df.groupby(['Customer ID', 'Quarter'])['Invoice'].count().reset_index()

In [8]:
last_seen = df.groupby(['Customer ID', 'Quarter'])['InvoiceDate'].max().reset_index()

In [9]:
last_seen['Quarter'].dt.end_time

0       2009-12-31 23:59:59.999999999
1       2010-03-31 23:59:59.999999999
2       2010-06-30 23:59:59.999999999
3       2011-03-31 23:59:59.999999999
4       2010-12-31 23:59:59.999999999
                     ...             
18021   2010-06-30 23:59:59.999999999
18022   2010-09-30 23:59:59.999999999
18023   2010-12-31 23:59:59.999999999
18024   2011-06-30 23:59:59.999999999
18025   2011-12-31 23:59:59.999999999
Name: Quarter, Length: 18026, dtype: datetime64[ns]

In [10]:
last_seen['reference_date'] = last_seen['Quarter'].dt.end_time
last_seen['InvoiceDate'] = pd.to_datetime(last_seen['InvoiceDate'])
last_seen['Last Seen'] = (last_seen['reference_date'] - last_seen['InvoiceDate']).dt.days + 1

In [11]:
customer_eval = pd.merge(revenue_per_customer, order_per_customer, on=['Customer ID', 'Quarter'], how='left')
customer_eval = pd.merge(customer_eval, last_seen, on=['Customer ID', 'Quarter'], how='left')
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,InvoiceDate,reference_date,Last Seen
0,12346,2009Q4,113.50,5,2009-12-18 10:55:00,2009-12-31 23:59:59.999999999,14
1,12346,2010Q1,117.05,9,2010-03-02 13:08:00,2010-03-31 23:59:59.999999999,30
2,12346,2010Q2,142.31,19,2010-06-28 13:53:00,2010-06-30 23:59:59.999999999,3
3,12346,2011Q1,77183.60,1,2011-01-18 10:01:00,2011-03-31 23:59:59.999999999,73
4,12347,2010Q4,2035.11,102,2010-12-07 14:57:00,2010-12-31 23:59:59.999999999,25
...,...,...,...,...,...,...,...
18021,18287,2010Q2,1071.61,54,2010-05-17 11:55:00,2010-06-30 23:59:59.999999999,45
18022,18287,2010Q3,892.60,21,2010-09-21 12:19:00,2010-09-30 23:59:59.999999999,10
18023,18287,2010Q4,381.50,10,2010-11-22 11:51:00,2010-12-31 23:59:59.999999999,40
18024,18287,2011Q2,765.28,29,2011-05-22 10:39:00,2011-06-30 23:59:59.999999999,40


In [12]:
customer_eval.drop(['InvoiceDate', 'reference_date'], axis=1, inplace=True)

In [13]:
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,Last Seen
0,12346,2009Q4,113.50,5,14
1,12346,2010Q1,117.05,9,30
2,12346,2010Q2,142.31,19,3
3,12346,2011Q1,77183.60,1,73
4,12347,2010Q4,2035.11,102,25
...,...,...,...,...,...
18021,18287,2010Q2,1071.61,54,45
18022,18287,2010Q3,892.60,21,10
18023,18287,2010Q4,381.50,10,40
18024,18287,2011Q2,765.28,29,40


In [14]:
all_quarters = pd.MultiIndex.from_product([df['Customer ID'].unique(), df['Quarter'].unique()], names=['Customer ID', 'Quarter'])
all_quarters_df = pd.DataFrame(index=all_quarters).reset_index()

In [15]:
all_quarters_df

Unnamed: 0,Customer ID,Quarter
0,13085,2009Q4
1,13085,2010Q1
2,13085,2010Q2
3,13085,2010Q3
4,13085,2010Q4
...,...,...
52924,12713,2010Q4
52925,12713,2011Q1
52926,12713,2011Q2
52927,12713,2011Q3


In [16]:
customer_eval = pd.merge(all_quarters_df, customer_eval, on=['Customer ID', 'Quarter'], how='left')

In [17]:
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,Last Seen
0,13085,2009Q4,1096.80,26.0,29.0
1,13085,2010Q1,920.40,36.0,62.0
2,13085,2010Q2,,,
3,13085,2010Q3,,,
4,13085,2010Q4,,,
...,...,...,...,...,...
52924,12713,2010Q4,,,
52925,12713,2011Q1,,,
52926,12713,2011Q2,,,
52927,12713,2011Q3,,,


In [18]:
def assign_score(row, column):
    non_zero_revenues = customer_eval[customer_eval[column] > 0][column]
    percentiles = non_zero_revenues.quantile([0.8, 0.6, 0.4, 0.2]).to_dict()
    if not row[column] > 0:
        return 0
    elif row[column] > percentiles[0.8]:
        return 5
    elif row[column] > percentiles[0.6]:
        return 4
    elif row[column] > percentiles[0.4]:
        return 3
    elif row[column] > percentiles[0.2]:
        return 2
    else:
        return 1


In [19]:
customer_eval['Revenue Score'] = customer_eval.apply(assign_score, args=('Revenue',), axis=1)
customer_eval['Freq Score'] = customer_eval.apply(assign_score, args=('Invoice',), axis=1)
customer_eval['Recency Score'] = customer_eval.apply(assign_score, args=('Last Seen',), axis=1)

In [20]:
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,Last Seen,Revenue Score,Freq Score,Recency Score
0,13085,2009Q4,1096.80,26.0,29.0,5,3,3
1,13085,2010Q1,920.40,36.0,62.0,4,4,5
2,13085,2010Q2,,,,0,0,0
3,13085,2010Q3,,,,0,0,0
4,13085,2010Q4,,,,0,0,0
...,...,...,...,...,...,...,...,...
52924,12713,2010Q4,,,,0,0,0
52925,12713,2011Q1,,,,0,0,0
52926,12713,2011Q2,,,,0,0,0
52927,12713,2011Q3,,,,0,0,0


In [21]:
customer_eval['RFM Score'] = customer_eval['Revenue Score'].astype(str) + \
                            customer_eval['Freq Score'].astype(str) + \
                            customer_eval['Recency Score'].astype(str)

In [22]:
customer_eval['RFM Score'] = customer_eval['RFM Score'].astype(int)

In [23]:
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,Last Seen,Revenue Score,Freq Score,Recency Score,RFM Score
0,13085,2009Q4,1096.80,26.0,29.0,5,3,3,533
1,13085,2010Q1,920.40,36.0,62.0,4,4,5,445
2,13085,2010Q2,,,,0,0,0,0
3,13085,2010Q3,,,,0,0,0,0
4,13085,2010Q4,,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...
52924,12713,2010Q4,,,,0,0,0,0
52925,12713,2011Q1,,,,0,0,0,0
52926,12713,2011Q2,,,,0,0,0,0
52927,12713,2011Q3,,,,0,0,0,0


In [24]:
# def RFM_to_segment(row):
#     if row['RFM Score'] == 0:
#         row['Segment'] = 'Lost'
#     elif row['RFM Score'] in [555, 554, 544, 545, 454, 455, 445]:
#         row['Segment'] = 'Champion'
#     elif row['RFM Score'] in [543, 444, 435, 355, 354, 345, 344, 335]:
#         row['Segment'] = 'Loyal Customer'
#     elif row['RFM Score'] in [553, 551, 552, 541, 542, 533, 532, 531, 
#                                 452, 451, 442, 441, 431, 453, 433, 432, 
#                                 423, 353, 352, 351, 342, 341, 333, 323]:
#         row['Segment'] = 'Potential Loyalist'
#     elif row['RFM Score'] in [512, 511, 422, 421, 412, 411, 311]:
#         row['Segment'] = 'New Customer'
#     elif row['RFM Score'] in [525, 524, 523, 522, 521, 515, 514, 513, 
#                                 425,424, 413,414,415, 315, 314, 313]:
#         row['Segment'] = 'Promising'
#     elif row['RFM Score'] in [535, 534, 443, 434, 343, 334, 325, 324]:
#         row['Segment'] = 'Need Attention'
#     elif row['RFM Score'] in [155, 154, 144, 214, 215, 115, 114, 113]:
#         row['Segment'] = 'Cannot Lose Them'
#     elif row['RFM Score'] in [331, 321, 312, 221, 213]:
#         row['Segment'] = 'About To Sleep'
#     elif row['RFM Score'] in [255, 254, 245, 244, 253, 252, 243, 242, 
#                                 235, 234, 225, 224, 153, 152, 145, 143, 
#                                 142, 135, 134, 133, 125, 124]:
#         row['Segment'] = 'At Risk'
#     elif row['RFM Score'] in [332, 322, 231, 241, 251, 233, 232, 223, 222, 
#                                 132, 123, 122, 212, 211]:
#         row['Segment'] = 'Hibernating'
#     elif row['RFM Score'] in [111, 112, 121, 131, 141, 151]:
#         row['Segment'] = 'Losing'
#     else:
#         print(row['Customer ID'], row['RFM Score'])

In [25]:
# customer_eval[customer_eval['Customer ID'] == 14103]

In [26]:
# customer_eval.apply(RFM_to_segment, axis=1)

14103 14 \
14827 14 \
13256 13


In [31]:
def RFM_to_segment(row):
    if row['RFM Score'] == 0:
        return 'Lost'
    elif row['RFM Score'] in [555, 554, 544, 545, 454, 455, 445]:
        return 'Champion'
    elif row['RFM Score'] in [543, 444, 435, 355, 354, 345, 344, 335]:
        return 'Loyal Customer'
    elif row['RFM Score'] in [553, 551, 552, 541, 542, 533, 532, 531, 
                                452, 451, 442, 441, 431, 453, 433, 432, 
                                423, 353, 352, 351, 342, 341, 333, 323]:
        return 'Potential Loyalist'
    elif row['RFM Score'] in [512, 511, 422, 421, 412, 411, 311]:
        return 'New Customer'
    elif row['RFM Score'] in [525, 524, 523, 522, 521, 515, 514, 513, 
                                425,424, 413,414,415, 315, 314, 313]:
        return 'Promising'
    elif row['RFM Score'] in [535, 534, 443, 434, 343, 334, 325, 324]:
        return 'Need Attention'
    elif row['RFM Score'] in [155, 154, 144, 214, 215, 115, 114, 113]:
        return 'Cannot Lose Them'
    elif row['RFM Score'] in [331, 321, 312, 221, 213]:
        return 'About To Sleep'
    elif row['RFM Score'] in [255, 254, 245, 244, 253, 252, 243, 242, 
                                235, 234, 225, 224, 153, 152, 145, 143, 
                                142, 135, 134, 133, 125, 124]:
        return 'At Risk'
    elif row['RFM Score'] in [332, 322, 231, 241, 251, 233, 232, 223, 222, 
                                132, 123, 122, 212, 211]:
        return 'Hibernating'
    elif row['RFM Score'] in [111, 112, 121, 131, 141, 151]:
        return 'Losing'
    else:
        return 'Lost'

In [32]:
customer_eval['Segment'] = customer_eval.apply(RFM_to_segment, axis=1)

In [36]:
customer_eval['Next Quarter'] = customer_eval['Quarter'].shift(-1)

In [40]:
customer_eval['Next RFM'] = customer_eval['RFM Score'].shift(-1)

In [41]:
customer_eval['Next Segment'] = customer_eval['Segment'].shift(-1)

In [42]:
customer_eval = customer_eval[customer_eval['Quarter'] != '2011Q4']

In [43]:
customer_eval.to_csv('./dataset/customer_eval.csv')

In [44]:
customer_eval

Unnamed: 0,Customer ID,Quarter,Revenue,Invoice,Last Seen,Revenue Score,Freq Score,Recency Score,RFM Score,Segment,Next Quarter,Next RFM,Next Segment
0,13085,2009Q4,1096.8,26.0,29.0,5,3,3,533,Potential Loyalist,2010Q1,445.0,Champion
1,13085,2010Q1,920.4,36.0,62.0,4,4,5,445,Champion,2010Q2,0.0,Lost
2,13085,2010Q2,,,,0,0,0,0,Lost,2010Q3,0.0,Lost
3,13085,2010Q3,,,,0,0,0,0,Lost,2010Q4,0.0,Lost
4,13085,2010Q4,,,,0,0,0,0,Lost,2011Q1,224.0,At Risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52923,12713,2010Q3,,,,0,0,0,0,Lost,2010Q4,0.0,Lost
52924,12713,2010Q4,,,,0,0,0,0,Lost,2011Q1,0.0,Lost
52925,12713,2011Q1,,,,0,0,0,0,Lost,2011Q2,0.0,Lost
52926,12713,2011Q2,,,,0,0,0,0,Lost,2011Q3,0.0,Lost


In [45]:
customer_eval['Segment'].value_counts()

Segment
Lost                  31585
Potential Loyalist     4792
At Risk                1954
Cannot Lose Them       1738
Hibernating            1676
Need Attention         1077
Loyal Customer          942
About To Sleep          830
Losing                  791
Promising               671
Champion                617
New Customer            375
Name: count, dtype: int64