In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rc('font', family='NanumGothic')
matplotlib.rc('axes', unicode_minus=False)
%matplotlib inline

In [3]:
df = pd.read_csv('./data/new_sales4.csv', encoding='euc-kr')
df.columns

Index(['주문번호', '구매일', '배송시작일', '배송완료일', '구매시월령(수정)', '고객번호', '구매금액', '결제금액',
       '물품대분류', '상품명', '수량추정', '구매월', '할인율'],
      dtype='object')

In [4]:
recendancy = df.groupby('고객번호')['주문번호'].max()
frequency = df.groupby('고객번호')['구매일'].nunique()
monetary = df.groupby('고객번호')['결제금액'].sum()

In [20]:
df2 = pd.concat([recendancy, frequency, monetary], axis=1).reset_index()
df2.columns = ['customer','recendancy','frequency','monetary']

In [21]:
df2.head()

Unnamed: 0,customer,recendancy,frequency,monetary
0,201812310749735,202002203424450,2,54083
1,201812310749770,201902282095385,1,3800
2,201812310749771,201905132367003,1,47922
3,201812310749774,201907282649358,2,116532
4,201812310749780,201901301958105,1,19900


In [22]:
df2['recendancy'] = df2['recendancy'].apply(str).apply(lambda x:x[:4] + '-' + x[4:6] + '-' + x[6:8])
today = np.datetime64(df2.recendancy.max(), 'D')
df2['recendancy'] = df2.recendancy.apply(lambda x:today-np.datetime64(x, 'D')).dt.days
df2.head()

Unnamed: 0,customer,recendancy,frequency,monetary
0,201812310749735,168,2,54083
1,201812310749770,525,1,3800
2,201812310749771,451,1,47922
3,201812310749774,375,2,116532
4,201812310749780,554,1,19900


In [23]:
def dummy_recendancy(x, interval):
    for i in range(1, 6):
        if i-1 < x <= interval*i:
            return (i/5) # 0 to 1
    return (0)

df2.recendancy = df2.recendancy.apply(lambda x:dummy_recendancy(x, 90))

In [24]:
# 0 is no grade 5 is highest grade
df2['recendancy'].value_counts()

0.8    16880
0.0    14517
0.6    14422
1.0    13543
0.2    11605
0.4    10088
Name: recendancy, dtype: int64

In [25]:
def dummy_frequency(x, interval):
    for i in range(1, 6):
        if i-1 < x <= interval*i:
            return (i/5)
    return (1)

df2.frequency = df2.frequency.apply(lambda x:dummy_frequency(x, 2))

In [34]:
df2.frequency.value_counts()

0.2    62710
0.4     9895
0.6     4106
1.0     2368
0.8     1976
Name: frequency, dtype: int64

In [42]:
interval = [30000, 100000, 250000, 500000, 1000000]
def dummy_monetary(x, interval):
    for i in range(1, 6):
        if i == 1:
            if x <= interval[i-1]:
                return (i/5)
            else:
                continue        
        if interval[i-2] < x <= interval[i-1]:
            return (i/5)
    return (1)

df2.monetary = df2.monetary.apply(lambda x:dummy_monetary(x, interval))

In [44]:
df2.monetary.value_counts()

0.2    45541
0.4    23655
0.6     8813
0.8     2559
1.0      487
Name: monetary, dtype: int64

In [88]:
rfm = df2.recendancy + df2.frequency + (1.5*(df2.monetary))
df2['rfm'] = rfm

In [89]:
rfm.describe()

count    81055.000000
mean         1.289953
std          0.480121
min          0.500000
25%          1.000000
50%          1.300000
75%          1.600000
max          3.200000
dtype: float64

In [90]:
rfm_interval = [0.9, 1.35, 1.5, 1.9, 2.3]
def dummy_rfm(x, interval):
    for i in range(1, 6):
        if i == 1:
            if x <= interval[i-1]:
                return (i)
            else:
                continue
        if (interval[i-2] < x <= interval[i-1]):
            return (i)
    return (5)

df2['rfm'] = df2.rfm.apply(lambda x:dummy_rfm(x, rfm_interval))

In [118]:
df2['rfm'].value_counts().sort_index()/len(df2.rfm)

1    0.202023
2    0.357486
3    0.178607
4    0.188970
5    0.072913
Name: rfm, dtype: float64

In [None]:
pd.read_csv('./data/Member_data2.csv')

In [101]:
df2 = df2.set_index('customer')
df2['총구매금액'] = monetary
df2 = df2.reset_index()
df2 = df2.rename(columns = {'customer':'고객번호'})

In [112]:
df3 = pd.merge(pd.read_csv('./data/Member_data2.csv'), df2, on='고객번호')

In [113]:
df3.head()

Unnamed: 0,고객번호,성별,결혼유무,거주지역,연령,결제등록카드,유입경로,자녀여부,recendancy,frequency,monetary,rfm,총구매금액
0,201812310749735,여,기혼,인천,37.0,국민은행,인스타그램,Yes,0.4,0.2,0.4,2,54083
1,201812310749770,여,기혼,제주,33.0,국민은행,검색광고,Yes,0.0,0.2,0.2,1,3800
2,201812310749771,여,기혼,부산,32.0,하나은행,네이버블로그,Yes,0.0,0.2,0.4,1,47922
3,201812310749774,여,미혼,경남,34.0,기업은행,지인추천,No,1.0,0.2,0.6,5,116532
4,201812310749780,여,기혼,서울,32.0,신한은행,직접검색,Yes,0.0,0.2,0.2,1,19900


In [114]:
df3.to_csv('./data/Member_data3.csv', index=False, encoding='euc-kr')