# 顧客流失偵測

根據顧客過去購買的時間間隔，偵測顧客可能流失的情形。如果離顧客上次購買的時間，其間隔已經超過某一個門檻，便提出顧客可能流失的警告。此一門檻是根據顧客每次購買的間隔時間計算得到。

1. 找出購買次數較高的顧客，做為分析的常客群。
2. 統計常客群裡每位顧客所有的購買時間間隔。
3. 計算每位顧客的門檻值（以該顧客購買時間間隔的累積分布，內插取得第 90 百分位的間隔時間）。
4. 計算每位顧客目前與上次購買的時間間隔。
5. 比較門檻值與每位顧客目前與上次購買的時間間隔，找出可能流失的顧客。

## 載入套件與讀取資料

### 載入套件

In [1]:
# 載入所需套件

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
'''
圖形中有中文字型的問題
參考
https://codertw.com/%E7%A8%8B%E5%BC%8F%E8%AA%9E%E8%A8%80/359974/
'''
from matplotlib.font_manager import FontProperties

# Font used for Chinese text in figures. The path points at a Windows font
# directory, so it will not resolve on other platforms.
han_font = FontProperties(fname=r"c:/windows/fonts/msjh.ttc", size=14) # Chinese font

In [3]:
'''
設計圖形呈現的外觀風格
'''
# Apply the seaborn "whitegrid" style to subsequent plots.
sns.set(style="whitegrid")

### 讀取資料

In [4]:
# Load the transaction data from the Excel workbook in the working directory.
df = pd.read_excel('Online Retail.xlsx')

### 資料清理

In [5]:
# Discard records that carry no CustomerID.
df = df[df['CustomerID'].notna()]

In [6]:
from datetime import datetime

# Keep the one-year window 2010-12-09 .. 2011-12-09 (last day inclusive).
# The exclusive upper bound, 2011-12-10 00:00, is the same reference date that
# is later subtracted from each customer's last purchase, so no record newer
# than that reference can slip in and produce a negative elapsed time.
df = df[(df.InvoiceDate >= datetime(2010, 12, 9, 0, 0, 0))
        & (df.InvoiceDate < datetime(2011, 12, 10, 0, 0, 0))]

In [7]:
# Keep purchase rows only, i.e. rows with a positive Quantity
# (the original note says this excludes cancellation records).
df = df.loc[df["Quantity"] > 0]

### 一年中的顧客購買次數

In [8]:
# For every customer, count the distinct invoice numbers, then list the
# customers from most to fewest invoices.
CustomerData = (
    df.groupby("CustomerID")["InvoiceNo"]
    .nunique()
    .reset_index()
    .sort_values(by="InvoiceNo", ascending=False)
)

CustomerData.head(10)

Unnamed: 0,CustomerID,InvoiceNo
322,12748.0,196
1859,14911.0,194
3960,17841.0,121
554,13089.0,94
1644,14606.0,90
2152,15311.0,88
475,12971.0,83
1672,14646.0,74
785,13408.0,60
2668,16029.0,60


### 找出常客群與他們的購買紀錄

In [9]:
# IDs of customers with 20 or more distinct invoices in the period.
MajorCustomers = CustomerData.loc[
    CustomerData["InvoiceNo"] >= 20, "CustomerID"
].values

In [10]:
# Purchase records belonging to the frequent customers.
mc_df = df.loc[df["CustomerID"].isin(MajorCustomers)]

### 統計每位常客的購買時間間隔

In [12]:
# Collapse each invoice to a single row. When the lines of one invoice carry
# different timestamps, the latest one is kept.
mc_df = (
    mc_df.groupby(["CustomerID", "InvoiceNo"])["InvoiceDate"]
    .max()
    .reset_index()
)

In [13]:
# Order the invoices by customer and then chronologically, so that each
# customer's purchases appear in the order they happened.
mc_df = mc_df.sort_values(by=["CustomerID", "InvoiceDate"])

mc_df.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate
0,12415.0,540267,2011-01-06 11:12:00
1,12415.0,540557,2011-01-10 09:58:00
2,12415.0,543989,2011-02-15 09:52:00
3,12415.0,545475,2011-03-03 10:59:00
4,12415.0,548661,2011-04-01 14:28:00
5,12415.0,553546,2011-05-17 15:42:00
6,12415.0,554037,2011-05-20 14:13:00
7,12415.0,556917,2011-06-15 13:37:00
8,12415.0,556918,2011-06-15 13:37:00
9,12415.0,559919,2011-07-13 15:30:00


In [14]:
# Elapsed time between consecutive purchases of the same customer.
# The first purchase of each customer has no predecessor and gets NaT.
mc_freq = mc_df.copy()
mc_freq["TimeDiff"] = mc_freq.groupby("CustomerID")["InvoiceDate"].diff()

mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff
0,12415.0,540267,2011-01-06 11:12:00,NaT
1,12415.0,540557,2011-01-10 09:58:00,3 days 22:46:00
2,12415.0,543989,2011-02-15 09:52:00,35 days 23:54:00
3,12415.0,545475,2011-03-03 10:59:00,16 days 01:07:00
4,12415.0,548661,2011-04-01 14:28:00,29 days 03:29:00
5,12415.0,553546,2011-05-17 15:42:00,46 days 01:14:00
6,12415.0,554037,2011-05-20 14:13:00,2 days 22:31:00
7,12415.0,556917,2011-06-15 13:37:00,25 days 23:24:00
8,12415.0,556918,2011-06-15 13:37:00,0 days 00:00:00
9,12415.0,559919,2011-07-13 15:30:00,28 days 01:53:00


In [15]:
# Express the purchase intervals as a number of minutes.
mc_freq["TimeDiff"] = mc_freq["TimeDiff"] / pd.Timedelta(minutes=1)

mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff
0,12415.0,540267,2011-01-06 11:12:00,
1,12415.0,540557,2011-01-10 09:58:00,5686.0
2,12415.0,543989,2011-02-15 09:52:00,51834.0
3,12415.0,545475,2011-03-03 10:59:00,23107.0
4,12415.0,548661,2011-04-01 14:28:00,41969.0
5,12415.0,553546,2011-05-17 15:42:00,66314.0
6,12415.0,554037,2011-05-20 14:13:00,4231.0
7,12415.0,556917,2011-06-15 13:37:00,37404.0
8,12415.0,556918,2011-06-15 13:37:00,0.0
9,12415.0,559919,2011-07-13 15:30:00,40433.0


In [16]:
# Remove the rows whose TimeDiff is NaN.
mc_freq = mc_freq[mc_freq["TimeDiff"].notna()]

### 計算每位顧客的門檻值

In [17]:
# Order the rows by customer and then by interval length, shortest first.
mc_freq = mc_freq.sort_values(by=["CustomerID", "TimeDiff"])
mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff
8,12415.0,556918,2011-06-15 13:37:00,0.0
10,12415.0,559920,2011-07-13 15:31:00,1.0
14,12415.0,565146,2011-09-01 13:51:00,1.0
16,12415.0,569650,2011-10-05 12:44:00,9.0
20,12415.0,576586,2011-11-15 14:22:00,230.0
18,12415.0,574138,2011-11-03 11:26:00,1403.0
6,12415.0,554037,2011-05-20 14:13:00,4231.0
1,12415.0,540557,2011-01-10 09:58:00,5686.0
11,12415.0,560491,2011-07-19 10:51:00,8360.0
19,12415.0,576394,2011-11-15 10:32:00,17226.0


In [18]:
# Number of purchase intervals recorded for each customer.
mc_intervalcount = (
    mc_freq.groupby("CustomerID")["InvoiceDate"]
    .count()
    .reset_index(name="IntervalCount")
)

mc_intervalcount.head(10)

Unnamed: 0,CustomerID,IntervalCount
0,12415.0,20
1,12471.0,29
2,12569.0,31
3,12621.0,19
4,12681.0,20
5,12682.0,29
6,12720.0,23
7,12748.0,195
8,12841.0,22
9,12901.0,27


In [19]:
# Attach each customer's interval count to every one of their interval rows.
mc_freq = pd.merge(mc_freq, mc_intervalcount, how="left", on="CustomerID")

mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff,IntervalCount
0,12415.0,556918,2011-06-15 13:37:00,0.0,20
1,12415.0,559920,2011-07-13 15:31:00,1.0,20
2,12415.0,565146,2011-09-01 13:51:00,1.0,20
3,12415.0,569650,2011-10-05 12:44:00,9.0,20
4,12415.0,576586,2011-11-15 14:22:00,230.0,20
5,12415.0,574138,2011-11-03 11:26:00,1403.0,20
6,12415.0,554037,2011-05-20 14:13:00,4231.0,20
7,12415.0,540557,2011-01-10 09:58:00,5686.0,20
8,12415.0,560491,2011-07-19 10:51:00,8360.0,20
9,12415.0,576394,2011-11-15 10:32:00,17226.0,20


In [20]:
# Give every interval of a customer the same probability mass, 1 / count.
# The column is already called CumProb because it is accumulated afterwards.
mc_freq = mc_freq.assign(CumProb=lambda frame: 1 / frame["IntervalCount"])

mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff,IntervalCount,CumProb
0,12415.0,556918,2011-06-15 13:37:00,0.0,20,0.05
1,12415.0,559920,2011-07-13 15:31:00,1.0,20,0.05
2,12415.0,565146,2011-09-01 13:51:00,1.0,20,0.05
3,12415.0,569650,2011-10-05 12:44:00,9.0,20,0.05
4,12415.0,576586,2011-11-15 14:22:00,230.0,20,0.05
5,12415.0,574138,2011-11-03 11:26:00,1403.0,20,0.05
6,12415.0,554037,2011-05-20 14:13:00,4231.0,20,0.05
7,12415.0,540557,2011-01-10 09:58:00,5686.0,20,0.05
8,12415.0,560491,2011-07-19 10:51:00,8360.0,20,0.05
9,12415.0,576394,2011-11-15 10:32:00,17226.0,20,0.05


In [21]:
# Running total of the per-interval probabilities within each customer,
# i.e. the cumulative probability of each interval.
mc_freq["CumProb"] = mc_freq.groupby("CustomerID")["CumProb"].cumsum()

mc_freq.head(30)

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,TimeDiff,IntervalCount,CumProb
0,12415.0,556918,2011-06-15 13:37:00,0.0,20,0.05
1,12415.0,559920,2011-07-13 15:31:00,1.0,20,0.1
2,12415.0,565146,2011-09-01 13:51:00,1.0,20,0.15
3,12415.0,569650,2011-10-05 12:44:00,9.0,20,0.2
4,12415.0,576586,2011-11-15 14:22:00,230.0,20,0.25
5,12415.0,574138,2011-11-03 11:26:00,1403.0,20,0.3
6,12415.0,554037,2011-05-20 14:13:00,4231.0,20,0.35
7,12415.0,540557,2011-01-10 09:58:00,5686.0,20,0.4
8,12415.0,560491,2011-07-19 10:51:00,8360.0,20,0.45
9,12415.0,576394,2011-11-15 10:32:00,17226.0,20,0.5


In [23]:
# From the cumulative probabilities and their purchase intervals, find the
# interval length below which 90% of a customer's intervals fall. A customer
# whose current gap exceeds it is treated as likely to churn.
def get_Interpolation(dat_fra, prob=0.9):
    """Interpolate the interval length at cumulative probability ``prob``.

    Parameters
    ----------
    dat_fra : pandas.DataFrame
        Rows of one customer. Only the columns ``CumProb`` (cumulative
        probability of each interval) and ``TimeDiff`` (interval length)
        are read.
    prob : float, default 0.9
        Cumulative probability at which the threshold is taken. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.Series
        One-element Series (label 0) holding the interpolated ``TimeDiff``,
        in the same unit as the input column.
    """
    cum_prob = np.asarray(dat_fra.CumProb, dtype=float)
    time_diff = np.asarray(dat_fra.TimeDiff, dtype=float)
    # np.interp does not verify that xp is increasing and silently returns
    # meaningless values otherwise, so order the points here instead of
    # depending on the caller having sorted the frame beforehand.
    order = np.argsort(cum_prob, kind="stable")
    Threshold = np.interp(x=prob, xp=cum_prob[order], fp=time_diff[order])
    return pd.Series(Threshold)

# Churn threshold (maximum expected purchase interval) for every customer.
mc_churn = (
    mc_freq.groupby("CustomerID")
    .apply(get_Interpolation)
    .reset_index()
    .rename(columns={0: "MaxTimeThreshold"})
)

mc_churn.head(10)

Unnamed: 0,CustomerID,MaxTimeThreshold
0,12415.0,48884.0
1,12471.0,43041.8
2,12569.0,29931.7
3,12621.0,48216.1
4,12681.0,48882.0
5,12682.0,38880.1
6,12720.0,40209.2
7,12748.0,7071.0
8,12841.0,31445.4
9,12901.0,31105.8


### 計算每位顧客目前與上次購買的時間間隔

In [24]:
# Timestamp of each customer's most recent purchase.
mc_todate = mc_df.groupby("CustomerID")["InvoiceDate"].max().reset_index()
mc_todate.head(10)

Unnamed: 0,CustomerID,InvoiceDate
0,12415.0,2011-11-15 14:22:00
1,12471.0,2011-12-07 15:43:00
2,12569.0,2011-12-07 16:24:00
3,12621.0,2011-12-08 11:25:00
4,12681.0,2011-11-25 11:33:00
5,12682.0,2011-12-06 10:00:00
6,12720.0,2011-12-07 08:03:00
7,12748.0,2011-12-09 12:20:00
8,12841.0,2011-12-05 11:13:00
9,12901.0,2011-12-01 10:07:00


In [26]:
# Time elapsed between each customer's last purchase and 2011-12-10 00:00.
mc_todate = mc_todate.assign(
    ToDate=datetime(2011, 12, 10, 0, 0, 0) - mc_todate["InvoiceDate"]
)
mc_todate.head(10)

Unnamed: 0,CustomerID,InvoiceDate,ToDate
0,12415.0,2011-11-15 14:22:00,24 days 09:38:00
1,12471.0,2011-12-07 15:43:00,2 days 08:17:00
2,12569.0,2011-12-07 16:24:00,2 days 07:36:00
3,12621.0,2011-12-08 11:25:00,1 days 12:35:00
4,12681.0,2011-11-25 11:33:00,14 days 12:27:00
5,12682.0,2011-12-06 10:00:00,3 days 14:00:00
6,12720.0,2011-12-07 08:03:00,2 days 15:57:00
7,12748.0,2011-12-09 12:20:00,0 days 11:40:00
8,12841.0,2011-12-05 11:13:00,4 days 12:47:00
9,12901.0,2011-12-01 10:07:00,8 days 13:53:00


In [27]:
# Express the time since the last purchase as a number of minutes.
mc_todate["ToDate"] = mc_todate["ToDate"] / pd.Timedelta(minutes=1)

mc_todate.head(10)

Unnamed: 0,CustomerID,InvoiceDate,ToDate
0,12415.0,2011-11-15 14:22:00,35138.0
1,12471.0,2011-12-07 15:43:00,3377.0
2,12569.0,2011-12-07 16:24:00,3336.0
3,12621.0,2011-12-08 11:25:00,2195.0
4,12681.0,2011-11-25 11:33:00,20907.0
5,12682.0,2011-12-06 10:00:00,5160.0
6,12720.0,2011-12-07 08:03:00,3837.0
7,12748.0,2011-12-09 12:20:00,700.0
8,12841.0,2011-12-05 11:13:00,6527.0
9,12901.0,2011-12-01 10:07:00,12353.0


In [None]:
### 找出可能流失的顧客

In [28]:
# Bring in each customer's churn threshold from mc_churn; customers missing
# from either table are dropped by the inner join.
mc_todate = pd.merge(mc_todate, mc_churn, how="inner", on="CustomerID")

mc_todate.head(10)

Unnamed: 0,CustomerID,InvoiceDate,ToDate,MaxTimeThreshold
0,12415.0,2011-11-15 14:22:00,35138.0,48884.0
1,12471.0,2011-12-07 15:43:00,3377.0,43041.8
2,12569.0,2011-12-07 16:24:00,3336.0,29931.7
3,12621.0,2011-12-08 11:25:00,2195.0,48216.1
4,12681.0,2011-11-25 11:33:00,20907.0,48882.0
5,12682.0,2011-12-06 10:00:00,5160.0,38880.1
6,12720.0,2011-12-07 08:03:00,3837.0,40209.2
7,12748.0,2011-12-09 12:20:00,700.0,7071.0
8,12841.0,2011-12-05 11:13:00,6527.0,31445.4
9,12901.0,2011-12-01 10:07:00,12353.0,31105.8


In [None]:
# 如果顧客目前與上次購買的時間間隔大於門檻值，便是可能流失的顧客

In [29]:
# Customers whose time since the last purchase exceeds their own threshold.
mc_todate.loc[mc_todate["ToDate"] > mc_todate["MaxTimeThreshold"]]

Unnamed: 0,CustomerID,InvoiceDate,ToDate,MaxTimeThreshold
61,16029.0,2011-11-01 10:27:00,55533.0,21637.3
66,16422.0,2011-11-22 14:00:00,25080.0,16771.0
72,16729.0,2011-11-01 14:29:00,55291.0,24562.0
94,17961.0,2011-11-18 16:55:00,30665.0,25119.0
