In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# 清洗数据表
df = pd.read_csv('CDNOW_master.txt', sep='\s+', header=None)
df.columns = ['id','date','quantity','amount']
df['date'] = pd.to_datetime(df['date'].astype(str))

In [3]:
df.head(10)

Unnamed: 0,id,date,quantity,amount
0,1,1997-01-01,1,11.77
1,2,1997-01-12,1,12.0
2,2,1997-01-12,5,77.0
3,3,1997-01-02,2,20.76
4,3,1997-03-30,2,20.76
5,3,1997-04-02,2,19.54
6,3,1997-11-15,5,57.45
7,3,1997-11-25,4,20.96
8,3,1998-05-28,1,16.99
9,4,1997-01-01,2,29.33


In [4]:
# 消费金额
ms = df.groupby('id')['amount'].sum()
ms.name = 'ms_amount'
ms.head()

id
1     11.77
2     89.00
3    156.46
4    100.50
5    385.61
Name: ms_amount, dtype: float64

In [5]:
# 消费次数
fs = df.groupby('id')['amount'].count()
fs.name = 'fs_times'
fs.head()

id
1     1
2     2
3     6
4     4
5    11
Name: fs_times, dtype: int64

In [6]:
# 最后一次消费时间距截止时间的时间间隔
rs = df['date'].max() - df.groupby('id')['date'].max() 
rs = rs.dt.days
rs.name = 'rs_delta'
rs.head()

id
1    545
2    534
3     33
4    200
5    178
Name: rs_delta, dtype: int64

In [7]:
# 合并ms,fs,rs
df = pd.concat([ms,fs,rs], axis=1)
df.head()

Unnamed: 0_level_0,ms_amount,fs_times,rs_delta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11.77,1,545
2,89.0,2,534
3,156.46,6,33
4,100.5,4,200
5,385.61,11,178


In [16]:
# 数据预处理：数据标准化
from sklearn import preprocessing
StandardScaler = preprocessing.StandardScaler().fit(df)
date = StandardScaler.transform(df)
data[0:5,:]

array([[-0.39145938, -0.41284215,  0.98107709],
       [-0.07089664, -0.20171391,  0.92037314],
       [ 0.20911322,  0.64279906, -1.84441563],
       [-0.02316297,  0.22054257, -0.92281937],
       [ 1.1602585 ,  1.69844027, -1.04422727]])

In [9]:
# 构建聚类模型
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters = 3,n_jobs=4)
fit_kmeans = kmeans_model.fit(data)

In [10]:
# 贴聚类标签
s = pd.Series(fit_kmeans.labels_, index=df.index)
s.name = 'label'
df1 = df.join(s)
df1.head()

Unnamed: 0_level_0,ms_amount,fs_times,rs_delta,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,11.77,1,545,0
2,89.0,2,534,0
3,156.46,6,33,2
4,100.5,4,200,2
5,385.61,11,178,2


In [11]:
# 对聚类标签计数
s1 = df1['label'].value_counts()
s1.name = 'count'
s1

0    16237
2     7183
1      150
Name: count, dtype: int64

In [12]:
# 整合为聚类中心表
df2 = pd.DataFrame(fit_kmeans.cluster_centers_, index=s1.index, columns=df.columns).join(s1)
df2

Unnamed: 0,ms_amount,fs_times,rs_delta,count
0,-0.260981,-0.328882,0.62308,16237
2,7.807827,7.056875,-1.774146,7183
1,0.426893,0.596065,-1.371408,150


In [13]:
# 对聚类中心表排名
df2.rank(ascending=False)

Unnamed: 0,ms_amount,fs_times,rs_delta,count
0,3.0,3.0,1.0,1.0
2,1.0,1.0,3.0,2.0
1,2.0,2.0,2.0,3.0


In [15]:
# 雷达图展示聚类结果
from pyecharts import Radar

# 配置雷达图标签、值及其他
v1 = [list(df2.iloc[0,:3])]
v2 = [list(df2.iloc[1,:3])]
v3 = [list(df2.iloc[2,:3])]
c_schema= [{"name": "M", "max": 12, "min": -1},
           {"name": "F", "max": 10, "min": -1},
           {"name": "R", "max": 1, "min": -2},]

radar = Radar()
radar.config(c_schema=c_schema, shape='circle')
radar.add('客户群1', v1, item_color="#ff0000", symbol=None)
radar.add('客户群2', v2, item_color="#477725", symbol=None)
radar.add('客户群3', v3, item_color="#66ff00", symbol=None)

# 显示雷达图
radar