In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv(r'G:\traindata\RFMtest\Retail_Data_Transactions.csv')
df.head()

Unnamed: 0,customer_id,trans_date,tran_amount
0,CS5295,11-Feb-13,35
1,CS4768,15-Mar-15,39
2,CS2122,26-Feb-13,52
3,CS1217,16-Nov-11,99
4,CS1850,20-Nov-13,78


In [3]:
df['trans_date'] = pd.to_datetime(df['trans_date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   customer_id  125000 non-null  object        
 1   trans_date   125000 non-null  datetime64[ns]
 2   tran_amount  125000 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 2.9+ MB


In [4]:
R_today = dt.datetime(2015,3,17)

In [5]:
df['R_diff'] = (R_today-df['trans_date']).dt.days
df.head()

Unnamed: 0,customer_id,trans_date,tran_amount,R_diff
0,CS5295,2013-02-11,35,764
1,CS4768,2015-03-15,39,2
2,CS2122,2013-02-26,52,749
3,CS1217,2011-11-16,99,1217
4,CS1850,2013-11-20,78,482


In [6]:
R = df.groupby(by = ['customer_id'])['R_diff']
R.head()

0          764
1            2
2          749
3         1217
4          482
          ... 
124305     876
124326     793
124395      66
124602     252
124878     992
Name: R_diff, Length: 34439, dtype: int64

In [7]:
R = R.agg([('R_diff','min')])
R.head()

Unnamed: 0_level_0,R_diff
customer_id,Unnamed: 1_level_1
CS1112,62
CS1113,36
CS1114,33
CS1115,12
CS1116,204


In [8]:
R.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6889 entries, CS1112 to CS9000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   R_diff  6889 non-null   int64
dtypes: int64(1)
memory usage: 107.6+ KB


In [9]:
F = df.groupby(by = ['customer_id'])['customer_id'].agg([('F_fre','count')])
M = df.groupby(by = ['customer_id'])['tran_amount'].agg([('M_sum',sum)])
RFM = R.join(F).join(M)
RFM.head()

Unnamed: 0_level_0,R_diff,F_fre,M_sum
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CS1112,62,15,1012
CS1113,36,20,1490
CS1114,33,19,1432
CS1115,12,22,1659
CS1116,204,13,857


In [10]:
RFM.describe()

Unnamed: 0,R_diff,F_fre,M_sum
count,6889.0,6889.0,6889.0
mean,81.538249,18.144869,1179.269705
std,85.382526,5.193014,465.832609
min,1.0,4.0,149.0
25%,23.0,14.0,781.0
50%,54.0,18.0,1227.0
75%,112.0,22.0,1520.0
max,858.0,39.0,2933.0


In [11]:
rfm_n = RFM.loc[:,['R_diff','F_fre','M_sum']]
clf = KMeans(n_clusters=8,random_state=0)
clf.fit(rfm_n)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [12]:
rfm_n['label'] = clf.labels_
rfm_n.head()

Unnamed: 0_level_0,R_diff,F_fre,M_sum,label
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CS1112,62,15,1012,0
CS1113,36,20,1490,5
CS1114,33,19,1432,5
CS1115,12,22,1659,1
CS1116,204,13,857,7


In [13]:
clf.cluster_centers_

array([[  85.31256713,   15.64661654, 1007.03759398],
       [  54.93422405,   24.00719424, 1712.68448099],
       [  78.49240506,   10.83037975,  456.97848101],
       [  78.94240838,   18.04861631, 1246.84068811],
       [  50.7475    ,   28.3       , 2040.835     ],
       [  63.28847641,   20.93348801, 1469.82830626],
       [ 354.61711712,   10.33333333,  473.42342342],
       [  85.29056204,   14.53446448,  710.53446448]])

In [14]:
r = pd.DataFrame(clf.cluster_centers_,columns = ['R','F','M'])
r

Unnamed: 0,R,F,M
0,85.312567,15.646617,1007.037594
1,54.934224,24.007194,1712.684481
2,78.492405,10.83038,456.978481
3,78.942408,18.048616,1246.840688
4,50.7475,28.3,2040.835
5,63.288476,20.933488,1469.828306
6,354.617117,10.333333,473.423423
7,85.290562,14.534464,710.534464


In [15]:
rmd = r['R'].median()
fmd = r['F'].median()
mmd = r['M'].median()
rmd,fmd,mmd

(78.71740672012726, 16.8476164232571, 1126.939141046333)

In [16]:
cluster=[]
for i in range(len(r)):
    if r.iloc[i,0] > rmd and r.iloc[i,1] >fmd and r.iloc[i,2] >mmd:
        cluster.append('高价值客户')
    elif r.iloc[i,0] < rmd and r.iloc[i,1] > fmd and r.iloc[i,2] >mmd:
        cluster.append('重点保持客户')
    elif r.iloc[i,0] > rmd and r.iloc[i,1] < fmd and r.iloc[i,2] >mmd:
        cluster.append('重点发展客户')
    elif r.iloc[i,0] < rmd and r.iloc[i,1] < fmd and r.iloc[i,2] > mmd:
        cluster.append('重点挽留客户')
    elif r.iloc[i,0] > rmd and r.iloc[i,1] > fmd and r.iloc[i,2] < mmd:
        cluster.append('一般价值客户')
    elif r.iloc[i,0] < rmd and r.iloc[i,1] > fmd and r.iloc[i,2] < mmd:
        cluster.append('一般保持客户')
    elif r.iloc[i,0] > rmd and r.iloc[i,1] < fmd and r.iloc[i,2] < mmd:
        cluster.append('一般发展客户')
    else:
        cluster.append('潜在客户')

In [17]:
r['客户分类']=cluster
r
s = r.reset_index()
s

Unnamed: 0,index,R,F,M,客户分类
0,0,85.312567,15.646617,1007.037594,一般发展客户
1,1,54.934224,24.007194,1712.684481,重点保持客户
2,2,78.492405,10.83038,456.978481,潜在客户
3,3,78.942408,18.048616,1246.840688,高价值客户
4,4,50.7475,28.3,2040.835,重点保持客户
5,5,63.288476,20.933488,1469.828306,重点保持客户
6,6,354.617117,10.333333,473.423423,一般发展客户
7,7,85.290562,14.534464,710.534464,一般发展客户


In [18]:
result = pd.merge(rfm_n,s[['index','客户分类']],how='inner',left_on='label',right_on='index')
result.sort_index()
result.head(20)

Unnamed: 0,R_diff,F_fre,M_sum,label,index,客户分类
0,62,15,1012,0,0,一般发展客户
1,3,15,1011,0,0,一般发展客户
2,26,12,885,0,0,一般发展客户
3,21,14,998,0,0,一般发展客户
4,28,16,1074,0,0,一般发展客户
5,8,16,1117,0,0,一般发展客户
6,5,15,1004,0,0,一般发展客户
7,90,14,1075,0,0,一般发展客户
8,27,16,1040,0,0,一般发展客户
9,112,14,916,0,0,一般发展客户


In [19]:
result.tail(20)
result

Unnamed: 0,R_diff,F_fre,M_sum,label,index,客户分类
0,62,15,1012,0,0,一般发展客户
1,3,15,1011,0,0,一般发展客户
2,26,12,885,0,0,一般发展客户
3,21,14,998,0,0,一般发展客户
4,28,16,1074,0,0,一般发展客户
...,...,...,...,...,...,...
6884,331,9,453,6,6,一般发展客户
6885,227,12,502,6,6,一般发展客户
6886,249,14,552,6,6,一般发展客户
6887,262,14,543,6,6,一般发展客户


In [22]:
person = result.groupby(by = ['客户分类'])['客户分类'].agg([('客户分类','count')])
person.head(9)

Unnamed: 0_level_0,客户分类
客户分类,Unnamed: 1_level_1
一般发展客户,2096
潜在客户,790
重点保持客户,2660
高价值客户,1343
