# 프로모션 효율 예측 (Random Forest)

Random Forest를 이용하여, 프로모션에 반응할 고객을 예측 <br>
고객 데이터와 거래 데이터를 통합 활용

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
## 한글 폰트 설정
import os
os.name

import os
if os.name == 'posix' :
    plt.rc('font', family='AppleGothic')    # 맥
else:
    plt.rc('font', family='Malgun Gothic')  # 윈도우

# 마이너스 폰트 깨지는 문제애 대한 대처
plt.rc('axes', unicode_minus=False)

# 레티나 설정을 해주면 글씨가 더 선명해짐
%config InlineBackend.figure_font = 'retina'

In [3]:
# 고객 데이터 불러오기
mem = pd.read_csv("member.csv")

print(mem.shape)
mem.head()

(64000, 6)


Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion
0,906145,10,Surburban,0,Phone,0
1,184478,6,Rural,1,Web,0
2,394235,7,Surburban,1,Web,0
3,130152,9,Rural,1,Web,0
4,940352,2,Urban,0,Web,0


In [4]:
# 거래 데이터 불러오기
tran = pd.read_csv("transaction.csv")

print(tran.shape)
tran.head()

(196836, 3)


Unnamed: 0,id,num_item,total_amount
0,906145,5,34000
1,906145,1,27000
2,906145,4,33000
3,184478,4,29000
4,394235,4,33000


In [5]:
# 변수 확인
mem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           64000 non-null  int64 
 1   recency      64000 non-null  int64 
 2   zip_code     64000 non-null  object
 3   is_referral  64000 non-null  int64 
 4   channel      64000 non-null  object
 5   conversion   64000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.9+ MB


In [6]:
tran.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196836 entries, 0 to 196835
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   id            196836 non-null  int64
 1   num_item      196836 non-null  int64
 2   total_amount  196836 non-null  int64
dtypes: int64(3)
memory usage: 4.5 MB


In [7]:
# 수치형 데이터 확인
mem.describe()

Unnamed: 0,id,recency,is_referral,conversion
count,64000.0,64000.0,64000.0,64000.0
mean,550694.137797,5.763734,0.50225,0.146781
std,259105.689773,3.507592,0.499999,0.35389
min,100001.0,1.0,0.0,0.0
25%,326772.0,2.0,0.0,0.0
50%,551300.0,6.0,1.0,0.0
75%,774914.5,9.0,1.0,0.0
max,999997.0,12.0,1.0,1.0


In [8]:
tran.describe()

Unnamed: 0,id,num_item,total_amount
count,196836.0,196836.0,196836.0
mean,550557.552932,3.078365,21837.102969
std,259254.795613,1.478408,8218.005565
min,100001.0,1.0,8000.0
25%,326719.0,2.0,15000.0
50%,550918.0,3.0,22000.0
75%,774916.0,4.0,29000.0
max,999997.0,6.0,38000.0


Feature Engineering

In [9]:
# 개별 상품의 평균 금액
tran['avg_price'] = tran['total_amount'] / tran['num_item']

tran.head()

Unnamed: 0,id,num_item,total_amount,avg_price
0,906145,5,34000,6800.0
1,906145,1,27000,27000.0
2,906145,4,33000,8250.0
3,184478,4,29000,7250.0
4,394235,4,33000,8250.0


In [10]:
# id별 평균
tran_mean = tran.groupby('id').mean()
tran_mean

Unnamed: 0_level_0,num_item,total_amount,avg_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100001,3.500000,26000.000000,7500.000000
100008,5.000000,26000.000000,5200.000000
100032,2.666667,20666.666667,9366.666667
100036,3.000000,25800.000000,13273.333333
100070,3.250000,21250.000000,8537.500000
...,...,...,...
999932,5.000000,32000.000000,6400.000000
999981,2.000000,22750.000000,12875.000000
999990,3.000000,28000.000000,10388.888889
999995,2.000000,27000.000000,13500.000000


In [11]:
# id별 거래 횟수
tran_cnt = tran['id'].value_counts()
tran_cnt

id
691067    5
720147    5
422265    5
670720    5
154620    5
         ..
881780    1
154471    1
480462    1
126129    1
156423    1
Name: count, Length: 64000, dtype: int64

In [12]:
# id별 평균 + id별 거래 횟수 데이터 merge
tran_df = pd.concat([tran_mean, tran_cnt], axis=1)
tran_df

Unnamed: 0_level_0,num_item,total_amount,avg_price,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,3.500000,26000.000000,7500.000000,2
100008,5.000000,26000.000000,5200.000000,1
100032,2.666667,20666.666667,9366.666667,3
100036,3.000000,25800.000000,13273.333333,5
100070,3.250000,21250.000000,8537.500000,4
...,...,...,...,...
999932,5.000000,32000.000000,6400.000000,1
999981,2.000000,22750.000000,12875.000000,4
999990,3.000000,28000.000000,10388.888889,3
999995,2.000000,27000.000000,13500.000000,1


최종 데이터

In [13]:
# 고객 데이터 + 거래 내역 데이터 결합
mem.set_index('id', inplace=True)

df = mem.join(tran_mean)

print(df.shape)
df.head()

(64000, 8)


Unnamed: 0_level_0,recency,zip_code,is_referral,channel,conversion,num_item,total_amount,avg_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
906145,10,Surburban,0,Phone,0,3.333333,31333.333333,14016.666667
184478,6,Rural,1,Web,0,4.0,29000.0,7250.0
394235,7,Surburban,1,Web,0,4.0,20500.0,5125.0
130152,9,Rural,1,Web,0,1.75,20750.0,14875.0
940352,2,Urban,0,Web,0,3.0,31000.0,10333.333333


In [14]:
df.isna().sum()

recency         0
zip_code        0
is_referral     0
channel         0
conversion      0
num_item        0
total_amount    0
avg_price       0
dtype: int64

One-Hot Encoding

In [15]:
df['zip_code'].unique()

array(['Surburban', 'Rural', 'Urban'], dtype=object)

In [16]:
df['channel'].unique()

array(['Phone', 'Web', 'Multichannel'], dtype=object)

In [17]:
df = pd.get_dummies(df, columns=['zip_code', 'channel'], drop_first=True).applymap(lambda x : int(x) if isinstance(x, bool) else x)
df

Unnamed: 0_level_0,recency,is_referral,conversion,num_item,total_amount,avg_price,zip_code_Surburban,zip_code_Urban,channel_Phone,channel_Web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
906145,10,0,0,3.333333,31333.333333,14016.666667,1,0,1,0
184478,6,1,0,4.000000,29000.000000,7250.000000,0,0,0,1
394235,7,1,0,4.000000,20500.000000,5125.000000,1,0,0,1
130152,9,1,0,1.750000,20750.000000,14875.000000,0,0,0,1
940352,2,0,0,3.000000,31000.000000,10333.333333,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
838295,10,0,0,3.500000,26000.000000,8012.500000,0,1,0,1
547316,5,1,0,1.800000,17800.000000,11300.000000,0,1,1,0
131575,6,1,0,4.000000,30500.000000,7833.333333,0,1,1,0
603659,1,1,0,3.200000,21600.000000,7583.333333,1,0,0,0


Train & Test Split

In [18]:
from sklearn.model_selection import train_test_split

X = df.drop('conversion', axis=1)
y = df['conversion']

X_train, X_test, y_train, y_test = train_test_split(X
                                                    , y
                                                    , test_size=0.3
                                                    , random_state=100)

모델 학습

In [19]:
from sklearn.ensemble import RandomForestClassifier

# 객체 생성
model = RandomForestClassifier(max_depth=10
                               , random_state=100)

# 모델 학습
model.fit(X_train, y_train)

# 예측
pred_y = model.predict(X_train)