In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree



### **Step 1 : 建模**
##### M1 = 9447 ~ 9477 / M2 = 9477 ~ 9507 / M3 = 9507 ~ 9537 / M4 = 9537 ~ 9567

#### **1. 匯入所有商品30~60(m3)有無購買資料, 基本屬性資料, 網址資料**

In [3]:
fx_m3 = pd.read_csv('./data/fx_m3.csv')
cc_m3 = pd.read_csv('./data/cc_m3.csv')
ln_m3 = pd.read_csv('./data/ln_m3.csv')
wm_m3 = pd.read_csv('./data/wm_m3.csv')
cif_use = pd.read_csv('./data/cif_use.csv')
web_train = pd.read_csv('./data/web_train.csv')
# cif_use.drop(['CUST_START_DT'], axis = 1, inplace = True)

In [4]:
cif_use.shape

(187679, 7)

#### **2. 處理信用卡資料**

In [5]:
tb_cc = pd.read_csv('./data/TBN_CC_APPLY.csv')
tb_cc.rename(columns={'TXN_DT':'CC_DT'}, inplace = True)

In [6]:
# 計算30天以上,120天以內購買次數(m1~m3, 9447~9537)
cc_90 = tb_cc.loc[tb_cc['CC_DT'] <= 9537,['CUST_NO', 'CC_DT']]
cc_90.reset_index(drop=True, inplace=True)
cc_90['CC_DT'] = 1
cc_90 = cc_90.groupby('CUST_NO').sum().reset_index(level=0)
cc_90.columns = ['CUST_NO','CC_count']

In [7]:
# 篩出0~30天(m4)有無購買(Y)
tb_cc['CC_Y'] = [1 if i >= 9537 else 0 for i in tb_cc['CC_DT']]
cc_y = tb_cc.loc[tb_cc['CC_Y'] == 1,['CUST_NO', 'CC_Y']].drop_duplicates(subset = 'CUST_NO')

In [8]:
# 合併cc所需資料
cc_train = cc_90.merge(cc_y, on = 'CUST_NO', how = 'left')
cc_train = cc_train.fillna(0)

#### **3. 以基本屬性為底,合併網址、信用卡、其他商品m3有無購買之資料**

In [13]:
cc_forecast = cif_use.merge(web_train, on = 'CUST_NO', how = 'left').merge(cc_train, on = 'CUST_NO', how = 'left').merge(cc_m3, on = 'CUST_NO', how = 'left')\
.merge(ln_m3, on = 'CUST_NO', how = 'left').merge(wm_m3, on = 'CUST_NO', how = 'left').merge(fx_m3, on = 'CUST_NO', how = 'left')

In [15]:
cc_forecast = cc_forecast.fillna(0)
cc_forecast_list = [i for i in cc_forecast.columns if i !='CUST_NO' and i!='GENDER_CODE' and i != 'AGE']
cc_forecast[cc_forecast_list] = cc_forecast[cc_forecast_list].apply(lambda x:x.astype(int))

In [16]:
cc_forecast['CC_M3'].value_counts()

0    181549
1      6130
Name: CC_M3, dtype: int64

#### 4. **決策樹**

In [17]:
X = cc_forecast.drop(['CUST_NO', 'CC_Y'], axis = 1)
Y = cc_forecast['CC_Y']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30,random_state=0)

In [18]:
print('X_train :',X_train.shape)
print('Y_train', Y_train.shape)
print('X_test :', X_test.shape)
print('Y_test', Y_test.shape)

X_train : (131375, 18)
Y_train (131375,)
X_test : (56304, 18)
Y_test (56304,)


In [19]:
cc_tree = DecisionTreeClassifier(criterion = "entropy",
                                 max_depth=3, min_samples_leaf=5)
cc_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
cc_predict = cc_tree.predict(X_test)

In [21]:
pd.Series(cc_predict.tolist()).value_counts()

0    56304
dtype: int64

In [22]:
# print(classification_report(Y_test,fx_predict))
print("Accuracy is ", accuracy_score(Y_test, cc_predict)*100)
# print(confusion_matrix(y_test,predictions))

Accuracy is  99.52401250355216


### **Step 2 : Load 預測資料集**

In [23]:
Y_zero  = pd.read_csv('./data/TBN_Y_ZERO.csv')
# Y_zero  = pd.read_csv('./data/TBN_Y_ZERO.csv', usecols = ['CUST_NO', 'FX_IND'])

#### **1. 匯入所有商品0~30(m4)有無購買,網址資料(m3,m4)**

In [24]:
fx_m4 = pd.read_csv('./data/fx_m4.csv')
cc_m4 = pd.read_csv('./data/cc_m4.csv')
ln_m4 = pd.read_csv('./data/ln_m4.csv')
wm_m4 = pd.read_csv('./data/wm_m4.csv')
web_test = pd.read_csv('./data/web_test.csv')

In [25]:
web_test.shape

(106655, 8)

#### **2. 處理信用卡資料**

In [26]:
# 計算0~90天以內購買次數(m2~m4, 9477~9567)
cc_test = tb_cc.loc[tb_cc['CC_DT'] > 9477,['CUST_NO', 'CC_DT']]
cc_test.reset_index(drop=True, inplace=True)
cc_test['CC_DT'] = 1
cc_test = cc_test.groupby('CUST_NO').sum().reset_index(level=0)
cc_test.columns = ['CUST_NO','CC_count']

#### **3. 以Y為底，合併基本屬性,網址、外匯、其他商品m3有無購買之資料**

In [27]:
Y_cc = Y_zero[['CUST_NO']].merge(cif_use, on = 'CUST_NO', how = 'left').merge(web_test, on = 'CUST_NO', how = 'left').merge(cc_test, on = 'CUST_NO', how = 'left')\
.merge(cc_m4, on = 'CUST_NO', how = 'left').merge(wm_m4, on = 'CUST_NO', how = 'left').merge(fx_m4, on = 'CUST_NO', how = 'left').merge(ln_m4, on = 'CUST_NO', how = 'left')

In [30]:
Y_cc.loc[Y_cc['GENDER_CODE'].isnull(),:].shape

(146, 19)

##### **3.1 處裡基本屬性資料**

In [35]:
Y_cc.isnull().any()

CUST_NO              False
AGE                  False
CHILDREN_CNT         False
EDU_CODE             False
GENDER_CODE          False
INCOME_RANGE_CODE    False
WORK_MTHS            False
web_1                False
web_2                False
web_3                False
web_4                False
web_5                False
web_6                False
web_7                False
CC_count             False
CC_M4                False
WM_M4                False
FX_M4                False
LN_M4                False
dtype: bool

In [32]:
m_edu = round(Y_cc.loc[:,'EDU_CODE'].mean())
m_income = round(Y_cc.loc[:, 'INCOME_RANGE_CODE'].mean())
m_work = round(Y_cc.loc[:, 'WORK_MTHS'].mean())
m_age = round(Y_cc.loc[:, 'AGE'].mean())
name_ = Y_cc.columns.tolist()[1:7]
mean_ = [m_age, 0, m_edu, 2, m_income, m_work] 
for i,j in zip(name_, mean_):
    Y_cc.loc[:,i] = Y_cc.loc[:, i].fillna(j)

In [34]:
Y_cc = Y_cc.fillna(0)
Y_cc_list = [i for i in Y_cc.columns if i !='CUST_NO']
Y_cc[Y_cc_list] = Y_cc[Y_cc_list].apply(lambda x:x.astype(int))

#### **4. 決策樹**

In [36]:
X_train = cc_forecast.drop(['CUST_NO', 'CC_Y'], axis = 1)
Y_train = cc_forecast['CC_Y']
X_test = Y_cc.drop(['CUST_NO'], axis = 1)

In [37]:
print('X_train :',X_train.shape)
print('Y_train', Y_train.shape)
print('X_test :', X_test.shape)
# print('Y_test', Y_test.shape)

X_train : (187679, 18)
Y_train (187679,)
X_test : (30000, 18)


In [38]:
cc_tree = DecisionTreeClassifier(criterion = "entropy",
                               max_depth=3, min_samples_leaf=5)
cc_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [39]:
cc_predict = cc_tree.predict(X_test).tolist()

In [29]:
# pd.Series(cc_predict.tolist()).value_counts()

In [40]:
Y_zero['CC_IND'] = cc_predict

In [43]:
# Y_zero[Y_zero['CC_IND'] == 0]

In [47]:
TBN_Y_CC = Y_zero.loc[:,['CUST_NO', 'CC_IND']]
TBN_Y_CC.head()
TBN_Y_CC.to_csv('./data/TBN_Y_CC.csv', index = False)