### **優惠券預測**
[攻略](https://www.twblogs.net/a/5c160498bd9eee5e41842891)
[攻略1](https://github.com/wepe/O2O-Coupon-Usage-Forecast)

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [2]:
data_path = 'data/'
dfoff = pd.read_csv(data_path + 'train_offline.csv')
dftest = pd.read_csv(data_path + 'test_offline.csv')
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
# dfoff.head(20)

(1160742, 7)
(306313, 6)


In [4]:
import gc
gc.collect()

51

In [5]:
pd.Timedelta(15, 'D')

Timedelta('15 days 00:00:00')

#### **1. 資料轉換**

In [6]:
dfoff['Date'] = pd.to_datetime(dfoff['Date'], format='%Y%m%d')
dfoff['Date_received'] = pd.to_datetime(dfoff['Date_received'], format='%Y%m%d')
dftest['Date_received'] = pd.to_datetime(dftest['Date_received'], format='%Y%m%d')

### **2.特徵工程**

#### Creat target label 

In [16]:
#有優惠券且有消費之index
dfoff.index.isin(dfoff[(~dfoff['Date_received'].isnull()) & (~dfoff['Date'].isnull())].index)

array([False, False, False, ...,  True, False, False])

In [7]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(df):
    df['label'] = [1 if i <= pd.Timedelta(15, 'D') else 0 for i in df['Date'] - df['Date_received']]
    df.loc[df['Date_received'].isnull(),'label'] = -1
    df['Writeoff_days'] = (df['Date'] - df['Date_received']).dt.days
    df.loc[df['Date_received'].isnull(), 'Writeoff_days'] = np.nan

label(dfoff)

In [8]:
print(dfoff.Date_received.min(),\
dfoff.Date_received.max(),\
dftest.Date_received.min(),\
dftest.Date_received.max())

2016-01-01 00:00:00 2016-04-30 00:00:00 2016-05-01 00:00:00 2016-06-15 00:00:00


#### Generate features - weekday acquired coupon (新增時間變數)

In [9]:
dfoff['Date_received_weekday'] = dfoff['Date_received'].dt.weekday + 1
dftest['Date_received_weekday'] = dftest['Date_received'].dt.weekday + 1

In [10]:
# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = [1 if i in [6,7] else 0 for i in dfoff['Date_received_weekday']]
dftest['weekday_type'] = [1 if i in [6,7] else 0 for i in dftest['Date_received_weekday']]

# change weekday to one-hot encoding 
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
#print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['Date_received_weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['Date_received_weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

#### **切分 train valid**

In [11]:
df_train = dfoff[(dfoff['label'] != -1) & (dfoff['Date_received']  < '20160416') ].copy().reset_index(drop = True)
df_valid = dfoff[(dfoff['label'] != -1) & (dfoff['Date_received'] >= '20160416')].copy().reset_index(drop = True)
print("Train size: {}, #positive: {}".format(len(df_train), df_train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(df_valid), df_valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


#### **優惠券相關的特徵**
- 優惠券類型(直接優惠為0, 滿減為1)
- 優惠券折率
- 滿減優惠券的最低消費
- 歷史出現次數
- 歷史核銷次數
- 歷史核銷率
- 歷史核銷時間率
- 領取優惠券是一周的第幾天
- 領取優惠券是一月的第幾天
- 歷史上用戶領取該優惠券次數
- 歷史上用戶消費該優惠券次數
- 歷史上用戶對該優惠券的核銷率

In [12]:
# Generate features - coupon discount and distance
#優惠券類型
def getDiscountType(row):
    if row == 'nan':
        return 'nan'
    elif ':' in row:
        return 1
    else:
        return 0

#優惠券折扣率
def convertRate(row):
    """Convert discount to rate"""
    if row == 'nan':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

#滿減優惠的最低消費    
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

#滿減優惠的折扣費用
def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    
    # convert distance,將na改為距離很遠
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

df_train = processData(df_train)
df_valid = processData(df_valid)
dfoff = processData(dfoff)
dftest = processData(dftest)

In [13]:
# 優惠券歷史出現次數/核銷次數 (核銷率) 注意要用df_train ,dfoff
def Coupon(df,df_m):
    d = df.merge(df_m.groupby(['Coupon_id'])[['Coupon_id']].count().rename(columns = {'Coupon_id':'Coupon_count'}).reset_index(),\
                 on = 'Coupon_id', how = 'left').merge(df_m[df_m['label'] == 1].groupby(['Coupon_id'])[['Coupon_id']].count()\
                .rename(columns = {'Coupon_id':'Coupon_buy'}).reset_index(), on = 'Coupon_id', how = 'left')
    return d

df_train = Coupon(df_train, df_train)
df_valid = Coupon(df_valid, df_train)
dftest = Coupon(dftest, dfoff)

#核銷率
def write_off(df):
    df['Coupon_writeoff'] = df['Coupon_buy']/df['Coupon_count']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [36]:
# 歷史核銷時間率, 每個優惠券被領取後，平均核銷天數
# write_off_days
def write_off_days(df, df_m):
    d = df.merge(df_m.groupby(['Coupon_id'])[['Writeoff_days']].mean()\
               .rename(columns = {'Writeoff_days':'Mean_Writeoff_days'}).reset_index(),on = 'Coupon_id', how = 'left')
    return d

df_train = write_off_days(df_train, df_train)
df_valid = write_off_days(df_valid, df_train)
dftest = write_off_days(dftest, dfoff)

In [44]:
# 領取優惠券是該月第幾天
def day(df):
    df['day'] = df['Date_received'].dt.day
day(df_train)
day(df_valid)
day(dftest)

In [45]:
df_train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'Writeoff_days',
       'Date_received_weekday', 'weekday_type', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'discount_type', 'discount_rate', 'discount_man', 'discount_jian',
       'Coupon_count', 'Coupon_buy', 'Coupon_writeoff', 'Mean_Writeoff_days',
       'day'],
      dtype='object')

#### **用戶特徵**

- 用戶領取優惠券次數
- 用戶獲得優惠券但沒有消費的次數
- 用戶獲得優惠券並核銷次數
- 用戶領取優惠券後進行核銷率
- 用戶個別優惠券核銷率 # 個別優惠券領取/核銷數 核銷率 (- 用戶核銷過的不同優惠券數量，及其占所有不同優惠券的比重)
- 用戶核銷滿100, 150, 200, 300 減的優惠券佔所有核銷優惠券的比重 
- 用戶核銷過優惠券的不同商家數量，及其占所有不同商家的比重
- 歷史核銷時間率, 每個用戶領取優惠券後，平均核銷天數


#- 用戶核銷優惠券的平均/最低/最高消費折率

In [58]:
#用戶領取優惠券次數/核銷次數/未使用次數
def User(df, df_m):
    d = df.merge(df_m[df_m['label'] != -1].groupby(['User_id'])[['User_id']].count().rename(columns = {'User_id':'User_count'}).reset_index(),\
                 on = 'User_id', how = 'left').merge(df_m[df_m['label'] == 1].groupby(['User_id'])[['User_id']].count()\
                 .rename(columns = {'User_id':'User_buy'}).reset_index(), on = 'User_id', how = 'left')\
                 .merge(df_m[df_m['label'] == 0].groupby(['User_id'])[['User_id']].count()\
                 .rename(columns = {'User_id':'User_unbuy'}).reset_index(), on = 'User_id', how = 'left')
    return d

df_train = User(df_train, df_train)
df_valid = User(df_valid, df_train)
dftest = User(dftest, dfoff)

#核銷率
def write_off(df):
    df['User_writeoff'] = df['User_buy']/df['User_count']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [86]:
# 用戶個別優惠券領取次數/核銷數 核銷率
def User_Coupon(df, df_m):
    d = df.merge(df_m[df_m['label'] != -1].groupby(['User_id', 'Coupon_id'])[['Coupon_id']].count()\
                 .rename(columns = {'Coupon_id':'User_coupon_count'}).reset_index(),on = ['User_id','Coupon_id'], how = 'left')\
                 .merge(df_m[df_m['label'] == 1].groupby(['User_id', 'Coupon_id'])[['Coupon_id']].count()\
                 .rename(columns = {'Coupon_id':'User_Coupon_buy'}).reset_index(), on = ['User_id','Coupon_id'], how = 'left')
    return d

df_train = User_Coupon(df_train, df_train)
df_valid = User_Coupon(df_valid, df_train)
dftest = User_Coupon(dftest, dfoff)

#用戶個別優惠券領取核銷率
def write_off(df):
    df['User_coupon_writeoff'] = df['User_Coupon_buy']/df['User_coupon_count']    
    
write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [133]:
# - 用戶核銷滿 100, 150, 200, 300 減的優惠券佔所有核銷優惠券的比重 
# - 用戶核銷滿 5,20, 30, 50 減的優惠券佔所有核銷優惠券的比重 
def man(df,df_m):
    d = df.merge(df_m[(df_train['label'] == 1) & df_m['discount_man'].isin(['100', '150', '200', '300'])]\
          .groupby(['User_id'])[['User_id']].count().rename(columns = {'User_id':'Man_100_buy'}).reset_index(),\
          on = 'User_id', how = 'left')\
          .merge(df_m[(df_train['label'] == 1) & df_m['discount_man'].isin(['5', '20', '30', '50'])]\
          .groupby(['User_id'])[['User_id']].count().rename(columns = {'User_id':'Man_5_buy'}).reset_index(),\
          on = 'User_id', how = 'left')
    return d
    
df_train = man(df_train, df_train)
df_valid = man(df_valid, df_train)
dftest = man(dftest, dfoff)

def write_off(df):
    df['User_Man_100_write_off'] = df['Man_100_buy']/df['User_buy']    
    df['User_Man_5_write_off'] = df['Man_5_buy']/df['User_buy']
    
write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [136]:
# - 用戶個別核銷過的優惠券佔用戶總核銷優惠券比重
def write_off(df):
    df['User_coupon_writeoff_proportion'] = df['User_Coupon_buy']/df['User_buy']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [149]:
# - 用戶核銷過優惠券的不同商家數量，及其占所有不同商家的比重
def write_off(df,df_m):
    d = df.merge(df_m[df_m['label'] == 1].groupby(['User_id','Merchant_id'])[['Merchant_id']].count()\
                .rename(columns = {'Merchant_id':'User_Merchant_buy'}).reset_index(), on = ['User_id', 'Merchant_id'], how = 'left')
    return d

df_train = write_off(df_train,df_train)
df_valid = write_off(df_valid,df_train)
dftest = write_off(dftest,dfoff)

def write_off(df):
    df['User_Merchant_writeoff_propotion'] = df['User_Merchant_buy']/df['User_buy']    
    
write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [191]:
# - 歷史核銷時間率, 每個用戶領取優惠券後，平均核銷天數
def write_off_days(df, df_m):
    d = df.merge(df_m[df_m['Writeoff_days'] >= 0].groupby(['User_id'])[['Writeoff_days']].mean()\
                 .rename(columns = {'Writeoff_days':'User_Mean_Writeoff_days'}).reset_index()
                 ,on = 'User_id', how = 'left')
    return d

df_train = write_off_days(df_train,df_train)
df_valid = write_off_days(df_valid, df_train)
dftest = write_off_days(dftest, dfoff)

#### **商家特徵**

- 商家優惠券被領取次數 (Merchant_count)
- 商家優惠券被領取後不核銷次數 (Merchant_unbuy)
- 商家優惠券被領取後核銷次數(Merchant_buy)
- 商家優惠券被領取後核銷率 (Merchant_writeoff)
- 核銷商家優惠券的不同用戶數量，及其占領取不同的用戶比重 (商家優惠券核銷用戶數 Merchant_User_buy 及領取用戶數 Merchant_User_count)
- 商家優惠券平均每個用戶核銷多少張 (Merchant_buy/Merchant_User_buy)
- 商家被核銷過的不同優惠券數量 (Merhcant_Coupon_buy)
- 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重 (Merchant_Coupon_writeoff : M_C_Buy/M_C_Count)
- 歷史核銷時間率, 每個商家的優惠券被領取後，平均核銷天數

#- 商家優惠券核銷的平均/最小/最大消費折率 

In [228]:
#商家優惠券被領取次數 / 不核銷次數 / 核銷次數
def Merchant(df, df_m):
    d = df.merge(df_m[df_m['label'] != -1].groupby(['Merchant_id'])[['Merchant_id']].count()\
                 .rename(columns = {'Merchant_id':'Merchant_count'}).reset_index(),on = 'Merchant_id', how = 'left')\
                 .merge(df_m[df_m['label'] == 1].groupby(['Merchant_id'])[['Merchant_id']].count()\
                 .rename(columns = {'Merchant_id':'Merchant_buy'}).reset_index(), on = 'Merchant_id', how = 'left')\
                 .merge(df_m[df_m['label'] == 0].groupby(['Merchant_id'])[['Merchant_id']].count()\
                 .rename(columns = {'Merchant_id':'Merchant_unbuy'}).reset_index(), on = 'Merchant_id', how = 'left')
    return d

df_train = Merchant(df_train, df_train)
df_valid = Merchant(df_valid, df_train)
dftest = Merchant(dftest, dfoff)

#核銷率
def write_off(df):
    df['Merchant_writeoff'] = df['Merchant_buy']/df['Merchant_count']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [245]:
# 商家優惠券核銷用戶數 Merchant_User_buy 及領取用戶數 Merchant_User_count
def Merchant_user(df, df_m):
    d = df.merge(df_m[df_m['label'] != -1].groupby(['Merchant_id'])[['User_id']].nunique()\
                 .rename(columns = {'User_id':'Merchant_User_count'}).reset_index(),on = 'Merchant_id', how = 'left')\
                 .merge(df_m[df_m['label'] == 1].groupby(['Merchant_id'])[['User_id']].nunique()\
                 .rename(columns = {'User_id':'Merchant_User_buy'}).reset_index(), on = 'Merchant_id', how = 'left')
    return d

df_train = Merchant_user(df_train, df_train)
df_valid = Merchant_user(df_valid, df_train)
dftest = Merchant_user(dftest, dfoff)

#核銷率
def write_off(df):
    df['Merchant_User_writeoff'] = df['Merchant_User_buy']/df['Merchant_User_count']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [248]:
# 商家優惠券平均每個用戶核銷多少張 (Merchant_buy/Merchant_User_buy)
def Merchant_avguser(df):
    df['Merchant_Avguser_buy'] = df['Merchant_buy'] / df['Merchant_User_buy']

Merchant_avguser(df_train)
Merchant_avguser(df_valid)
Merchant_avguser(dftest)

In [253]:
# - 商家被核銷過的不同優惠券數量 (Merhcant_Coupon_buy)
# - 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重 (Merchant_Coupon_writeoff : M_C_Buy/M_C_Count)
def Merchant_coupon(df, df_m):
    d = df.merge(df_m[df_m['label'] != -1].groupby(['Merchant_id', 'Coupon_id'])[['Coupon_id']].count().rename(columns = \
          {'Coupon_id':'Merhcant_Coupon_count'}).reset_index(), on = ['Merchant_id', 'Coupon_id'], how = 'left')\
          .merge(df_train[df_train['label'] == 1].groupby(['Merchant_id', 'Coupon_id'])[['Coupon_id']].count().rename(columns =\
          {'Coupon_id':'Merhcant_Coupon_buy'}).reset_index(), on = ['Merchant_id', 'Coupon_id'], how = 'left')
    return d

df_train = Merchant_coupon(df_train, df_train)
df_valid = Merchant_coupon(df_valid, df_train)
dftest = Merchant_coupon(dftest, dfoff)

#核銷率
def write_off(df):
    df['Merchant_Coupon_writeoff'] = df['Merhcant_Coupon_buy']/df['Merhcant_Coupon_count']

write_off(df_train)
write_off(df_valid)
write_off(dftest)

In [255]:
# - 歷史核銷時間率, 每個商家的優惠券被領取後，平均核銷天數
def write_off_days(df, df_m):
    d = df.merge(df_m[df_m['Writeoff_days'] >= 0].groupby(['Merchant_id'])[['Writeoff_days']].mean()\
                 .rename(columns = {'Writeoff_days':'Merchant_Mean_Writeoff_days'}).reset_index()
                 ,on = 'Merchant_id', how = 'left')
    return d

df_train = write_off_days(df_train,df_train)
df_valid = write_off_days(df_valid, df_train)
dftest = write_off_days(dftest, dfoff)

#### **用戶商家交互特徵**
- 用戶領取商家的優惠券次數
- 用戶領取商家的優惠券後不核銷次數
- 用戶領取商家的優惠券後核銷次數
- 用戶領取商家的優惠券後核銷率
- 用戶對每個商家的不核銷次數佔用戶總的不核銷次數的比重
- 用戶對每個商家的優惠券核銷次數佔用戶總的核銷次數的比重
- 用戶對每個商家的不核銷次數佔商家總的不核銷次數的比重
- 用戶對每個商家的優惠券核銷次數佔商家總的核銷次數的比重

### **1.1輸出處理好之資料**

In [257]:
# df_train.to_csv('df_train.csv',index = False)
# df_valid.to_csv('df_valid.csv',index = False)
# dfoff.to_csv('dfoff.csv',index = False)
# dftest.to_csv('dftest.csv',index = False)

### **2. 建模**

### **2.1 輸入處理好之資料**

In [13]:
dfoff = pd.read_csv('dfoff.csv')
dftest = pd.read_csv('dftest.csv')

In [262]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'Date_received_weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'Date_received_weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [503]:
features = ['Merchant_id', 'Coupon_id','Date_received_weekday', 'Distance', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 
            'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'discount_type', 'discount_rate', 'discount_man', 'discount_jian',
            'Coupon_count', 'Coupon_buy', 'Coupon_writeoff', 'Mean_Writeoff_days', 'day', 'User_count', 'User_buy', 'User_unbuy', 
            'User_writeoff', 'User_coupon_count', 'User_Coupon_buy', 'User_coupon_writeoff', 'Man_100_buy', 'Man_5_buy', 
            'User_Man_100_write_off', 'User_Man_5_write_off', 'User_coupon_writeoff_proportion', 'User_Merchant_buy', 
            'User_Merchant_writeoff_propotion', 'User_Mean_Writeoff_days', 'Merchant_writeoff', 'Merchant_count', 
            'Merchant_buy', 'Merchant_unbuy', 'Merchant_User_count', 'Merchant_User_buy', 'Merchant_User_writeoff', 
            'Merchant_Avguser_buy', 'Merhcant_Coupon_count', 'Merhcant_Coupon_buy','Merchant_Coupon_writeoff', 
            'Merchant_Mean_Writeoff_days']

In [358]:
na_find = ['Coupon_count', 'Coupon_buy', 'Coupon_writeoff', 'Mean_Writeoff_days', 'day', 'User_count', 'User_buy', 
 'User_unbuy', 'User_writeoff', 'User_coupon_count', 'User_Coupon_buy', 'User_coupon_writeoff', 'Man_100_buy', 'Man_5_buy', 
 'User_Man_100_write_off', 'User_Man_5_write_off', 'User_coupon_writeoff_proportion', 'User_Merchant_buy', 
 'User_Merchant_writeoff_propotion', 'User_Mean_Writeoff_days', 'Merchant_writeoff', 'Merchant_count', 
 'Merchant_buy', 'Merchant_unbuy', 'Merchant_User_count', 'Merchant_User_buy', 'Merchant_User_writeoff', 
 'Merchant_Avguser_buy', 'Merhcant_Coupon_count', 'Merhcant_Coupon_buy','Merchant_Coupon_writeoff', 
 'Merchant_Mean_Writeoff_days']

na_col = []
for i,j in  zip(na_find,df_train[na_find].isnull().any()):
    if j == True:
        na_col.append(i)
        
na_0 = list(set(na_col)-set(['Mean_Writeoff_days', 'User_Mean_Writeoff_days', 'Merchant_Mean_Writeoff_days'])) 
na_1 = ['Mean_Writeoff_days', 'User_Mean_Writeoff_days', 'Merchant_Mean_Writeoff_days']

In [394]:
def fillna(df):
    df[na_0] = df[na_0].fillna(0)
    df[na_1] = df[na_1].fillna(-1)

fillna(df_train)
fillna(df_valid)
fillna(dftest)

* Naive model

In [515]:
predictors = features
print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

['Merchant_id', 'Coupon_id', 'Date_received_weekday', 'Distance', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'discount_type', 'discount_rate', 'discount_man', 'discount_jian', 'Coupon_count', 'Coupon_buy', 'Coupon_writeoff', 'Mean_Writeoff_days', 'day', 'User_count', 'User_buy', 'User_unbuy', 'User_writeoff', 'User_coupon_count', 'User_Coupon_buy', 'User_coupon_writeoff', 'Man_100_buy', 'Man_5_buy', 'User_Man_100_write_off', 'User_Man_5_write_off', 'User_coupon_writeoff_proportion', 'User_Merchant_buy', 'User_Merchant_writeoff_propotion', 'User_Mean_Writeoff_days', 'Merchant_writeoff', 'Merchant_count', 'Merchant_buy', 'Merchant_unbuy', 'Merchant_User_count', 'Merchant_User_buy', 'Merchant_User_writeoff', 'Merchant_Avguser_buy', 'Merhcant_Coupon_count', 'Merhcant_Coupon_buy', 'Merchant_Coupon_writeoff', 'Merchant_Mean_Writeoff_days']


In [516]:
model = check_model(df_train, predictors)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.8min finished


In [517]:
#預測 0416 ~ 0430, validation
df_valid_pred = model.predict_proba(df_valid[predictors].fillna(0))
df_valid_1 = df_valid.copy()
df_valid_1['pred_prob'] = df_valid_pred[:, 1]

In [518]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true = df_valid.label, y_score = df_valid_pred[:,1])
acc = accuracy_score(y_true = df_valid.label, y_pred = df_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.807, Accuracy: 0.950


In [519]:
# 預測 test set
dftest_1 = dftest.copy()
print(dftest_1.shape)
dftest_1 = dftest_1[~dftest_1.Coupon_id.isna()].reset_index(drop=True) #篩出要預測的(有領優惠券)
testset = dftest_1[predictors].copy() #先選部分變數

df_test_pred = model.predict_proba(testset[predictors].fillna(0)) #預測0501~0630
test1 = testset.copy()
test1['pred_prob'] = df_test_pred[:, 1]
print(test1.shape)

(306313, 51)
(306313, 49)


In [520]:
# 轉成 df 檔
output = pd.concat((dftest_1[['User_id', 'Coupon_id', 'Date_received']], test1['pred_prob']), axis=1)
print(output.shape)
output['Date_received'] = output['Date_received'].dt.strftime('%Y%m%d')
output.loc[:, 'User_id'] = output['User_id'].apply(lambda x:str(int(x)))
output.loc[:, 'Coupon_id'] = output['Coupon_id'].apply(lambda x:str(int(x)))
output.loc[:, 'Date_received'] = output['Date_received'].apply(lambda x:str(int(x)))
output['uid'] = output[['User_id', 'Coupon_id', 'Date_received']].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [521]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("Feature_1.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,1.98995e-29
1,1000020_8192_20160513,2.105922e-30
2,1000065_1455_20160527,7.148922e-14
3,1000085_8067_20160513,2.162863e-30
4,1000086_2418_20160613,8.892678e-30


* XGBoost

In [468]:
from xgboost import  XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [469]:
Y = df_train['label']
X_train, X_test, Y_train, Y_test = train_test_split(df_train[predictors], Y, test_size = 0.30, random_state = 0)

In [470]:
print('X_train :', X_train.shape )
print('Y_train :', Y_train.shape )
print('X_test :', X_test.shape )
print('Y_test :', Y_test.shape )

X_train : (467427, 46)
Y_train : (467427,)
X_test : (200326, 46)
Y_test : (200326,)


In [475]:
df_train['label'].values.tolist().count(0)/df_train['label'].values.tolist().count(1)

19.563962798718897

train

In [476]:
xgb = XGBClassifier(scale_pos_weight = 19.5640, min_child_weight = 1
                    ,max_depth = 3,learning_rate = 0.1, random_state = 1000,
                    objective='binary:logistic', n_estimators= 100)

In [477]:
xgb.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1000,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=19.564, seed=None,
              silent=None, subsample=1, verbosity=1)

In [479]:
Y_predict_prob = xgb.predict_proba(X_test)[:,1]

In [None]:
Y_predict = pd.Series(Y_predict_prob).copy()
Y_predict[Y_predict<0.5] = 0
Y_predict[Y_predict>=0.5] = 1
print('Accuracy is ', accuracy_score(Y_test, Y_predict)*100)

In [482]:
from sklearn.metrics import precision_score, recall_score, f1_score
scorePrecision = precision_score(Y_test.values, Y_predict.values)
scoreRecall = recall_score(Y_test.values, Y_predict.values)
scoreF1 = f1_score(Y_test.values, Y_predict.values)
print(f"Precision score = {scorePrecision:.4f}")
print(f"Recall score = {scoreRecall:.4f}")
print(f"F1 score = {scoreF1:.4f}")
print("Roc AUC: ", roc_auc_score(Y_test, Y_predict_prob,
              average='macro'))
print('Accuracy is ', accuracy_score(Y_test, Y_predict)*100)

Precision score = 0.8502
Recall score = 0.9989
F1 score = 0.9186
Roc AUC:  0.9994959792539468
Accuracy is  99.13690684184779


predict validation

In [483]:
# predict validation
Y_valid_prob = xgb.predict_proba(df_valid[predictors].fillna(0))[:,1]
Y_valid = pd.Series(Y_valid_prob).copy()
Y_valid[Y_valid<0.5] = 0
Y_valid[Y_valid>=0.5] = 1
print('Accuracy is ', accuracy_score(df_valid.label, Y_valid)*100)

Accuracy is  94.91138153908302


In [488]:
from sklearn.metrics import precision_score, recall_score, f1_score
scorePrecision = precision_score(df_valid.label.values, Y_valid.values)
scoreRecall = recall_score(df_valid.label.values, Y_valid.values)
scoreF1 = f1_score(df_valid.label.values, Y_valid.values)
print(f"Precision score = {scorePrecision:.4f}")
print(f"Recall score = {scoreRecall:.4f}")
print(f"F1 score = {scoreF1:.4f}")
print("Roc AUC: ", roc_auc_score(df_valid.label, Y_valid_prob,
              average='macro'))
print('Accuracy is ', accuracy_score(df_valid.label, Y_valid)*100)

Precision score = 0.3873
Recall score = 0.0892
F1 score = 0.1451
Roc AUC:  0.5868802046673433
Accuracy is  94.91138153908302


predict test

In [491]:
Y_pred_test_prob = xgb.predict_proba(dftest[predictors].fillna(0))[:,1]

In [492]:
# 轉成 df 檔
output = dftest_1[['User_id', 'Coupon_id', 'Date_received']].copy()
output['pred_prob'] = Y_pred_test_prob
print(output.shape)
output['Date_received'] = output['Date_received'].dt.strftime('%Y%m%d')
output.loc[:, 'User_id'] = output['User_id'].apply(lambda x:str(int(x)))
output.loc[:, 'Coupon_id'] = output['Coupon_id'].apply(lambda x:str(int(x)))
output.loc[:, 'Date_received'] = output['Date_received'].apply(lambda x:str(int(x)))
output['uid'] = output[['User_id', 'Coupon_id', 'Date_received']].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [497]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).sum()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("XGB_Example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,2.2e-05
1,1000020_8192_20160513,2.2e-05
2,1000065_1455_20160527,9.5e-05
3,1000085_8067_20160513,2.2e-05
4,1000086_2418_20160613,2.2e-05
