In [32]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
# Use the D2Coding font family for all figure text.
# NOTE(review): presumably chosen so the Korean labels in this dataset render;
# confirm the font is installed on the machine running the notebook.
rc('font', family='D2Coding')
# Draw minus signs as an ASCII hyphen instead of the Unicode minus glyph.
# NOTE(review): likely needed because the selected font lacks that glyph - confirm.
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

from sklearn.cluster import KMeans

In [31]:
# Load the member table (EUC-KR encoded CSV) and discard the three columns
# that are not used further, in a single chained expression.
df = (
    pd.read_csv('../data/Member_data4.csv', encoding='euc-kr')
    .drop(columns=['결제등록카드', '유입경로', 'rfm'])
)
df

Unnamed: 0,고객번호,성별,결혼유무,거주지역,연령,자녀여부,recendancy,frequency,monetary,총구매금액,첫주문일,가입일,첫주문까지일,물품대분류,chun,첫결제금액
0,201812310749735,여,기혼,인천,37.0,Yes,0.4,0.2,0.4,54083,2019-05-05,20181231,125,수유용품,1,16314
1,201812310749770,여,기혼,제주,33.0,Yes,0.0,0.2,0.2,3800,2019-02-28,20181231,59,티슈,0,3800
2,201812310749771,여,기혼,부산,32.0,Yes,0.0,0.2,0.4,47922,2019-05-13,20181231,133,3단계,0,47922
3,201812310749774,여,미혼,경남,34.0,No,1.0,0.2,0.6,116532,2019-05-09,20181231,129,3단계,1,39788
4,201812310749780,여,기혼,서울,32.0,Yes,0.0,0.2,0.2,19900,2019-01-30,20181231,30,이벤트,0,19900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89016,201912281058004,여,미혼,부산,30.0,No,0.6,0.2,0.2,9351,2019-12-28,20191228,0,이벤트,1,9351
89017,201912281058006,여,기혼,충남,34.0,Yes,0.2,0.2,0.2,8900,2019-12-28,20191228,0,맘큐,1,3000
89018,201912281058007,여,기혼,대전,29.0,Yes,0.6,0.2,0.4,52065,2019-12-28,20191228,0,아이_생활용품,1,52065
89019,201912281058011,여,기혼,충북,31.0,Yes,0.4,0.2,0.2,7524,2020-04-10,20191228,104,이벤트,0,7524


# RFM

In [None]:
# Grid of KDE plots showing how the combined RFM "grade" is distributed for
# every combination of R / F / M weights.
#
# NOTE(review): `df_rfm` is not created in any cell visible here; this cell
# relies on it already existing in the kernel with columns 'R', 'F' and 'M'.

# Candidate weights: R and F are multipliers, M is an integer divisor.
R_range = [i * 5 for i in range(1, 5)]
F_range = [i * 5 for i in range(1, 5)]
M_range = [i * 1000 for i in range(1, 5)]

# 4 x 4 x 4 = 64 weight combinations, which fill the 8 x 8 grid exactly.
fig, axes = plt.subplots(8, 8, figsize=(100, 100))
ax = axes.ravel()

idx = 0

for R in R_range:
    df_grade = pd.DataFrame()
    # R enters the grade with a negative sign.
    df_grade['R'] = -df_rfm['R'] * R
    for F in F_range:
        df_grade['F'] = df_rfm['F'] * F
        for M in M_range:
            # Floor-divide M so it is bucketed into steps of size M.
            df_grade['M'] = df_rfm['M'] // M
            df_grade['grade'] = df_grade['R'] + df_grade['F'] + df_grade['M']

            ax[idx].set_title(f'R={R}, F={F}, M={M}')
            sns.kdeplot(data=df_grade, x='grade', ax=ax[idx])
            idx += 1

# Lay out the figure BEFORE showing it: once plt.show() has rendered the
# figure, a later tight_layout() call no longer affects what was displayed.
fig.tight_layout()
plt.show()

# sns.boxplot(data=df_grade, y='grade', ax=ax)
# df_grade[df_grade['grade'] >= 1000]

# K-Means clustering

In [32]:
# Standardise every column (zero mean, unit variance) before clustering.
#
# The K-Means cell below writes a 'ClusterKmeans' column back into `df`.
# Dropping it here (errors='ignore' makes this a no-op on the first run) keeps
# this cell safe to re-run: old cluster labels are never scaled and fed back
# in as a feature.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(
    df.drop(columns=['ClusterKmeans'], errors='ignore')
)

# fit_transform returns a NumPy array by default, so the split is positional:
# the first column goes to df_y, every remaining column to df_x.
df_y = df_scaled[:, 0]
df_x = df_scaled[:, 1:]

In [33]:
# Partition the scaled features into 5 clusters.
# random_state is pinned so that centroid initialisation - and therefore the
# cluster assignments and their numbering - are the same on every run;
# without it the per-cluster summary below changes between executions.
clus = KMeans(n_clusters=5, random_state=42)
clus.fit(df_x)

# Shift the 0-based labels to 1-based (vectorised) and attach them to `df`
# so later cells can group by cluster.
cluster_kmeans = (clus.labels_ + 1).tolist()
df['ClusterKmeans'] = cluster_kmeans
df.head()

Unnamed: 0,구매시월령(수정),구매금액,수량추정,할인율,물품대분류_1단계,물품대분류_2단계,물품대분류_3단계,물품대분류_4단계,물품대분류_5단계,물품대분류_6단계,...,물품대분류_아이_생활용품,물품대분류_어른_생활용품,물품대분류_음식,물품대분류_이벤트,물품대분류_장난감,물품대분류_주방용품,물품대분류_청결용품,물품대분류_티슈,물품대분류_화장품,ClusterKmeans
0,4.0,25800,2.0,0.37,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,14.0,51200,1.155756,0.26,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
2,66.0,59520,1.125142,0.19,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,5.0,50640,1.0,0.21,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,8.0,101280,3.421622,0.24,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [37]:
# Per-cluster min / mean / max of every column; the result has two-level
# column labels of the form (column, statistic).
ds_summary_km = df.groupby('ClusterKmeans').agg(['min', 'mean', 'max'])

# Flatten the two column levels into single "<column>_<statistic>" names.
level0, level1 = (
    ds_summary_km.columns.get_level_values(depth) for depth in (0, 1)
)
ds_summary_km.columns = [
    column + '_' + statistic for column, statistic in zip(level0, level1)
]

# Turn the cluster id from the index back into an ordinary column.
ds_summary_km = ds_summary_km.reset_index()
ds_summary_km.round(3)

Unnamed: 0,ClusterKmeans,구매시월령(수정)_min,구매시월령(수정)_mean,구매시월령(수정)_max,구매금액_min,구매금액_mean,구매금액_max,수량추정_min,수량추정_mean,수량추정_max,...,물품대분류_주방용품_max,물품대분류_청결용품_min,물품대분류_청결용품_mean,물품대분류_청결용품_max,물품대분류_티슈_min,물품대분류_티슈_mean,물품대분류_티슈_max,물품대분류_화장품_min,물품대분류_화장품_mean,물품대분류_화장품_max
0,1,-10.0,18.476,1439.0,500,28247.523,199000,1.0,1.42,38.571,...,0,0,0.06,1,0,0.197,1,0,0.055,1
1,2,-10.0,9.204,1214.0,500,9426.569,149900,1.0,1.049,10.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0
2,3,-10.0,12.049,566.0,500,61643.09,539000,1.0,1.228,10.0,...,0,0,0.0,1,0,0.0,1,0,0.0,1
3,4,-10.0,13.121,478.0,1600,62757.078,307500,1.0,1.182,6.429,...,0,0,0.0,0,0,0.0,0,0,0.0,0
4,5,-10.0,22.174,733.0,1600,15176.688,237000,1.0,1.341,10.636,...,1,0,0.0,0,0,0.0,0,0,0.0,0


Unnamed: 0,주문번호,구매일,배송시작일,배송완료일,구매시월령(수정),고객번호,구매금액,결제금액,물품대분류,상품명,수량추정,구매월,할인율,거주지역,출고기간,배송기간,구매요일
0,201905052342970,2019-05-06,2019-05-07,2019-05-11,4.0,201812310749735,25800,16314,수유용품,더블하트 SofTouch 모유실감 젖꼭지 M (2입),2.000000,201905,0.37,인천,1,4,0
1,202002203424450,2020-02-20,2020-02-21,2020-02-23,14.0,201812310749735,51200,37769,5단계,NEW 하기스 맥스드라이 팬티 5단계 남아 80매(팬티형),1.155756,202002,0.26,인천,1,2,3
2,201905132367003,2019-05-13,2019-05-14,2019-05-16,66.0,201812310749771,59520,47922,3단계,하기스 네이처메이드 밤부 3단계 여아 144매(밴드형),1.125142,201905,0.19,부산,1,2,0
3,201905092356247,2019-05-10,2019-05-11,2019-05-13,5.0,201812310749774,50640,39788,3단계,하기스 에어솔솔 썸머기저귀 3단계 공용 144매(밴드형),1.000000,201905,0.21,경남,1,2,4
4,201907282649358,2019-07-29,2019-07-30,2019-08-02,8.0,201812310749774,101280,76744,3단계,[2019년형] 하기스 에어솔솔 썸머기저귀 3단계 공용 144매(밴드형),3.421622,201907,0.24,경남,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166352,201912283245106,2019-12-29,2019-12-30,2020-01-01,0.0,201912281058006,3000,3000,이벤트,맘큐 허그박스,1.000000,201912,0.00,충남,1,2,6
166353,20200521172454-86563196093,2020-05-22,2020-05-23,2020-05-25,3.0,201912281058006,5900,5900,3단계,NEW 하기스 맥스드라이 3단계 공용 20매(밴드형/체험팩) (EA),1.000000,202005,0.00,충남,1,2,4
166354,202004103645638,2020-04-10,2020-04-11,2020-04-13,3.0,201912281058011,9500,7524,이벤트,더블하트 다기능 빨대 브러쉬,1.000000,202004,0.21,충북,1,2,4
166355,201912283245152,2019-12-29,2019-12-30,2020-01-02,-3.0,201912281058012,3000,3000,이벤트,맘큐 허그박스,1.000000,201912,0.00,서울,1,3,6
