In [3]:
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [5]:
dtypes = {
    'UniPrice':np.float32,
    'CustomerID':np.int32,
    'Quantity':np.int32
}
retail = pd.read_csv('./Data/onlineRetailCleansed.csv', dtype=dtypes)
retail['InvoiceDate']=pd.to_datetime(retail['InvoiceDate'], 
                                     infer_datetime_format=True)
retail.info()
retail.tail(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397884 entries, 0 to 397883
Data columns (total 10 columns):
Unnamed: 0       397884 non-null int64
InvoiceNo        397884 non-null int64
StockCode        397884 non-null object
Description      397884 non-null object
Quantity         397884 non-null int32
InvoiceDate      397884 non-null datetime64[ns]
UnitPrice        397884 non-null float64
CustomerID       397884 non-null int32
Country          397884 non-null object
CheckoutPrice    397884 non-null float64
dtypes: datetime64[ns](1), float64(2), int32(2), int64(2), object(3)
memory usage: 27.3+ MB


Unnamed: 0.1,Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CheckoutPrice
397882,541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.6
397883,541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France,14.85


## 우수고객 기준?
- 구매회수 기준
- 지불금액 기준

In [6]:
retail.groupby('CustomerID').count()['Quantity'].sort_values(ascending=False)

CustomerID
17841    7847
14911    5675
14096    5111
12748    4595
14606    2700
         ... 
15823       1
15802       1
15753       1
15668       1
12346       1
Name: Quantity, Length: 4338, dtype: int64

In [7]:
retail.groupby('CustomerID').sum()['CheckoutPrice'].sort_values(ascending=False)

CustomerID
14646    280206.02
18102    259657.30
17450    194550.79
16446    168472.50
14911    143825.06
           ...    
16878        13.30
17956        12.75
16454         6.90
14792         6.20
16738         3.75
Name: CheckoutPrice, Length: 4338, dtype: float64

### 사용자 retention 분석
- 월간 사용자 cohort를 바탕으로 월별 재구매율(retention)분석하기

In [9]:
# 사용자 기준으로 최초 구해한 월 연산하기
# monthstart 최초구매한 달
def get_month_as_datetime(date):
    return datetime(date.year, date.month, 1)
retail['Month']=retail['InvoiceDate'].apply(get_month_as_datetime)

In [35]:
month_group = retail.groupby('CustomerID')['Month']
retail['MonthStarted'] = month_group.transform(np.min)
retail.tail(2)

Unnamed: 0.1,Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CheckoutPrice,Month,MonthStarted,MonthPassed
397882,541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.6,2011-12-01,2011-08-01,4
397883,541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France,14.85,2011-12-01,2011-08-01,4


In [34]:
# 기준이 되는 월과 실제 구매월의 차이계산
# 각 구매가 최초구매로 부터 얼마의 월이 지났는지 연산
# 1년이 지났으면 12개월이 지남으로 변환
retail['MonthPassed'] = (retail['Month'].dt.year-retail['MonthStarted'].dt.year)*12\
                        +(retail['Month'].dt.month-retail['MonthStarted'].dt.month)
retail.tail(2)

Unnamed: 0.1,Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CheckoutPrice,Month,MonthStarted,MonthPassed
397882,541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.6,2011-12-01,2011-08-01,4
397883,541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France,14.85,2011-12-01,2011-08-01,4


In [19]:
# 기준월 지난월 기준으로 고객 카운팅
# 기간동안의 고객수 계산
def get_unique_no(x):
    return len(np.unique(x))
cohort_group = retail.groupby(['MonthStarted', 'MonthPassed'])
cohort_df    = cohort_group['CustomerID'].apply(get_unique_no).reset_index()
cohort_df.head()

Unnamed: 0,MonthStarted,MonthPassed,CustomerID
0,2010-12-01,0,885
1,2010-12-01,1,324
2,2010-12-01,2,286
3,2010-12-01,3,340
4,2010-12-01,4,321


In [23]:
# pivot 함수를 이용하여 index는 MonthStarted, columns dmf MonthPassed로 변경
# 첫번째 column 을 기준으로 100분위수 연산
cohort_df = cohort_df.pivot( index='MonthStarted', columns='MonthPassed' )

KeyError: "None of ['MonthStarted', 'MonthPassed'] are in the columns"

In [26]:
cohort_df.head(2)

Unnamed: 0_level_0,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID
MonthPassed,0,1,2,3,4,5,6,7,8,9,10,11,12
MonthStarted,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2010-12-01,885.0,324.0,286.0,340.0,321.0,352.0,321.0,309.0,313.0,350.0,331.0,445.0,235.0
2011-01-01,417.0,92.0,111.0,96.0,134.0,120.0,103.0,101.0,125.0,136.0,152.0,49.0,


In [32]:
customer_cohort = cohort_df.div( cohort_df.iloc[:,0], axis=0)*100
customer_cohort.head(2)

Unnamed: 0_level_0,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID
MonthPassed,0,1,2,3,4,5,6,7,8,9,10,11,12
MonthStarted,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2010-12-01,100.0,36.610169,32.316384,38.418079,36.271186,39.774011,36.271186,34.915254,35.367232,39.548023,37.40113,50.282486,26.553672
2011-01-01,100.0,22.06235,26.618705,23.021583,32.134293,28.776978,24.70024,24.220624,29.976019,32.613909,36.450839,11.7506,


In [31]:
customer_cohort = customer_cohort.round(decimals=1)
customer_cohort

Unnamed: 0_level_0,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID
MonthPassed,0,1,2,3,4,5,6,7,8,9,10,11,12
MonthStarted,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2010-12-01,100.0,36.6,32.3,38.4,36.3,39.8,36.3,34.9,35.4,39.6,37.4,50.3,26.6
2011-01-01,100.0,22.1,26.6,23.0,32.1,28.8,24.7,24.2,30.0,32.6,36.4,11.8,
2011-02-01,100.0,18.7,18.7,28.4,27.1,24.7,25.3,27.9,24.7,30.5,6.8,,
2011-03-01,100.0,15.0,25.2,19.9,22.4,16.8,26.8,23.0,27.9,8.6,,,
2011-04-01,100.0,21.3,20.3,21.0,19.7,22.7,21.7,26.0,7.3,,,,
2011-05-01,100.0,19.0,17.2,17.2,20.8,23.2,26.4,9.5,,,,,
2011-06-01,100.0,17.4,15.7,26.4,23.1,33.5,9.5,,,,,,
2011-07-01,100.0,18.1,20.7,22.3,27.1,11.2,,,,,,,
2011-08-01,100.0,20.7,24.8,24.3,12.4,,,,,,,,
2011-09-01,100.0,23.4,30.1,11.4,,,,,,,,,
