In [1]:
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [2]:
data_path = 'data/'

df_train = pd.read_csv(data_path + 'train_offline.csv')
df_test = pd.read_csv(data_path + 'test_offline.csv')

df_test = df_test[~df_test.Coupon_id.isna()]  # 排除 Coupon_id 為空的資料
df_test.reset_index(drop=True, inplace=True)

print(df_train.shape)
print(df_test.shape)

df_train.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
df_test.head(20)  # df_test 為測試資料，沒有 date 欄位（購買商品時間）

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0
5,2747744,6901,1097.0,50:10,,20160606.0
6,196342,1579,10698.0,20:1,1.0,20160606.0
7,253750,6901,2366.0,30:5,0.0,20160518.0
8,343660,4663,11002.0,150:20,,20160528.0
9,1113008,3621,2705.0,20:5,0.0,20160524.0


In [4]:
## Creat target label

"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

df_train["label"] = df_train.apply(label, axis=1)  # 增加一個 label 欄位
df_train["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [5]:
df_train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0
6,73611,2099,12034.0,100:10,,20160207.0,,0
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0


In [6]:
# Generate features - weekday acquired coupon

def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

df_train['weekday'] = df_train['Date_received'].apply(getWeekday)  # 新增一個 weekday 欄位，用 Date_received 來轉
df_test['weekday'] = df_test['Date_received'].apply(getWeekday)  # 新增一個 weekday 欄位，用 Date_received 來轉

In [7]:
# weekday_type (weekend = 1)
df_train['weekday_type'] = df_train['weekday'].apply(lambda x: 1 if x in [6, 7]  else 0) # apply to trainset
df_test['weekday_type'] = df_test['weekday'].apply(lambda x: 1 if x in [6, 7]  else 0) # apply to testset

In [8]:
df_train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0
6,73611,2099,12034.0,100:10,,20160207.0,,0,7.0,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0


In [9]:
df_test.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,1
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0
5,2747744,6901,1097.0,50:10,,20160606.0,1,0
6,196342,1579,10698.0,20:1,1.0,20160606.0,1,0
7,253750,6901,2366.0,30:5,0.0,20160518.0,3,0
8,343660,4663,11002.0,150:20,,20160528.0,6,1
9,1113008,3621,2705.0,20:5,0.0,20160524.0,2,0


In [48]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

# 在機器學習建模前要先做資料預處理，如果遇到類別特徵（Categorical features），要先轉換成數值，但因為類別沒有順序之分，
# 所以不可以轉換成有大小差別的數值，而要轉換成虛擬變數（Dummy variable），轉換的方法就叫做 one-hot encoding（獨熱編碼）。

tmpdf = pd.get_dummies(df_train['weekday'].replace(-1, np.nan)) # one-hot encoding 轉換 weekday 欄位資料，資料為 -1 時轉成 Nan
tmpdf.columns = weekdaycols
df_train[weekdaycols] = tmpdf  # 新增一個 weekdaycols，把 tmpdf 帶進來

tmpdf = pd.get_dummies(df_test['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df_test[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [11]:
df_train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0,0,0,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,0,0,1,0,0,0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,0,0,0,0,0,1,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,0,0,0,0,1,0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,0,0,0,0,1,0,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,0,0,0,0,1,0,0
6,73611,2099,12034.0,100:10,,20160207.0,,0,7.0,1,0,0,0,0,0,0,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,0,0,0,1,0,0,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,0,0,0,0,0,1,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,0,1,0,0,0,0,0


In [12]:
df_test.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,1,0,0,0,0,0,1,0
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,0,0,0,0,0
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,0,0,0,0,0
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,0,0,0,0,0
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,0,1,0,0,0
5,2747744,6901,1097.0,50:10,,20160606.0,1,0,1,0,0,0,0,0,0
6,196342,1579,10698.0,20:1,1.0,20160606.0,1,0,1,0,0,0,0,0,0
7,253750,6901,2366.0,30:5,0.0,20160518.0,3,0,0,0,1,0,0,0,0
8,343660,4663,11002.0,150:20,,20160528.0,6,1,0,0,0,0,0,1,0
9,1113008,3621,2705.0,20:5,0.0,20160524.0,2,0,0,1,0,0,0,0,0


In [13]:
# Generate features - coupon discount and distance

def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1  # 有折扣
    else:
        return 0  # 沒有折扣 

def convertRate(row):  # 折扣率
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])  # 如 1-(20 / 200) = 0.9
    else:
        return float(row)

def getDiscountMan(row):   # 購物滿 x 元
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):  # 減 y 元
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

df_train = processData(df_train)
df_test = processData(df_test)

In [14]:
df_train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.5,10,5,1
6,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,1,...,0,0,0,0,0,1,0.9,100,10,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,...,0,0,1,0,0,0,0.85,200,30,1
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,...,0,0,0,0,1,0,0.9,200,20,1
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,1,0,0,0,0,0,0.9,200,20,1


In [15]:
df_test.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,1,0,0,0,0,0,1,0,0.866667,150,20,1
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,0,0,0,0,0,0.95,20,1,1
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,0,0,0,0,0,0.95,20,1,1
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,0,0,0,0,0,0.833333,30,5,1
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,0,1,0,0,0,0.95,20,1,1
5,2747744,6901,1097.0,50:10,99.0,20160606.0,1,0,1,0,0,0,0,0,0,0.8,50,10,1
6,196342,1579,10698.0,20:1,1.0,20160606.0,1,0,1,0,0,0,0,0,0,0.95,20,1,1
7,253750,6901,2366.0,30:5,0.0,20160518.0,3,0,0,0,1,0,0,0,0,0.833333,30,5,1
8,343660,4663,11002.0,150:20,99.0,20160528.0,6,1,0,0,0,0,0,1,0,0.866667,150,20,1
9,1113008,3621,2705.0,20:5,0.0,20160524.0,2,0,0,1,0,0,0,0,0,0.75,20,5,1


In [16]:
## Naive model

def split_train_valid(row, date_cut="20160408"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = df_train[df_train['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)

train = df[df["is_train"]]
valid = df[~df["is_train"]]

train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 634547, #positive: 31237
Valid size: 112422, #positive: 5067


In [17]:
train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,is_train
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,1,0,0,0,0,0.95,20,1,1,True
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,1,0,0.95,20,1,1,True
2,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.9,200,20,1,True
3,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.5,10,5,1,True
4,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,1,...,0,0,0,0,1,0.9,100,10,1,True
5,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,...,0,0,0,1,0,0.9,200,20,1,True
6,253750,8390,7531.0,20:5,0.0,20160327.0,,0,7.0,1,...,0,0,0,0,1,0.75,20,5,1,True
7,376492,1041,13490.0,30:5,2.0,20160127.0,,0,3.0,0,...,1,0,0,0,0,0.833333,30,5,1,True
8,1964720,7884,6704.0,20:1,10.0,20160215.0,,0,1.0,0,...,0,0,0,0,0,0.95,20,1,1,True
9,1113008,1041,11197.0,30:5,2.0,20160114.0,,0,4.0,0,...,0,1,0,0,0,0.833333,30,5,1,True


In [18]:
valid.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,is_train
0,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,1,0,0,0.9,200,20,1,False
1,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,...,0,1,0,0,0,0.85,200,30,1,False
2,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,0,0,0,0,0,0.9,200,20,1,False
3,4061024,3381,7610.0,200:20,10.0,20160426.0,,0,2.0,0,...,0,0,0,0,0,0.9,200,20,1,False
4,4061024,7555,9871.0,30:5,10.0,20160409.0,,0,6.0,1,...,0,0,0,1,0,0.833333,30,5,1,False
5,106443,450,3732.0,30:5,99.0,20160429.0,,0,5.0,0,...,0,0,1,0,0,0.833333,30,5,1,False
6,114747,1569,5054.0,200:30,9.0,20160426.0,,0,2.0,0,...,0,0,0,0,0,0.85,200,30,1,False
7,6038208,1569,5054.0,200:30,0.0,20160417.0,,0,7.0,1,...,0,0,0,0,1,0.85,200,30,1,False
8,623832,4442,7643.0,100:50,99.0,20160429.0,,0,5.0,0,...,0,0,1,0,0,0.5,100,50,1,False
9,918744,7555,9871.0,30:5,99.0,20160415.0,,0,5.0,0,...,0,0,1,0,0,0.833333,30,5,1,False


In [37]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols

print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [38]:
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        average=False,
        epsilon=0.0001,
        max_iter=2000,
        eta0=0.0,
        power_t=0.2,
        learning_rate='optimal',
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        random_state=None,
        shuffle=True, 
        n_jobs=1,
        class_weight=None,
        verbose=0, 
        warm_start=False)

    # 在 StandardScaler 的构造函数中设置 with_mean=False 或者 with_std=False，可以禁止均值中心化（centering）和归一化（scaling）。
    
    model = Pipeline(steps=[
        ('ss', StandardScaler(copy=True, with_mean=False, with_std=False)),
        ('en', classifier())        
    ])

    parameters = {
        'en__alpha': [0.00001, 0.0001, 0.001],
        'en__l1_ratio': [0.00001, 0.0001, 0.001]
    }

    folder = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1,
        scoring=None, 
        iid=True, 
        refit=True,
        pre_dispatch='2*n_jobs',
        error_score='raise')
    
    grid_result = grid_search.fit(data[predictors], data['label'])
    
    print(grid_result.best_params_)
    
    return grid_result

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [39]:
model = check_model(train, predictors)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  2.5min finished


{'en__alpha': 1e-05, 'en__l1_ratio': 1e-05}


In [40]:
y_valid_pred = model.predict_proba(valid[predictors])  # 基于 GridSearchCV 对验证集做出预测，y_valid_pred 为预测的概率
print(y_valid_pred)

valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(10)

[[9.99987441e-01 1.25590552e-05]
 [9.99990659e-01 9.34053794e-06]
 [9.99981682e-01 1.83183482e-05]
 ...
 [9.99992819e-01 7.18113990e-06]
 [9.99993696e-01 6.30420893e-06]
 [9.09425870e-01 9.05741304e-02]]


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,is_train,pred_prob
0,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,1,0,0,0.9,200,20,1,False,1.3e-05
1,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,...,1,0,0,0,0.85,200,30,1,False,9e-06
2,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,0,0,0,0,0.9,200,20,1,False,1.8e-05
3,4061024,3381,7610.0,200:20,10.0,20160426.0,,0,2.0,0,...,0,0,0,0,0.9,200,20,1,False,1.5e-05
4,4061024,7555,9871.0,30:5,10.0,20160409.0,,0,6.0,1,...,0,0,1,0,0.833333,30,5,1,False,0.060585
5,106443,450,3732.0,30:5,99.0,20160429.0,,0,5.0,0,...,0,1,0,0,0.833333,30,5,1,False,0.005296
6,114747,1569,5054.0,200:30,9.0,20160426.0,,0,2.0,0,...,0,0,0,0,0.85,200,30,1,False,1.1e-05
7,6038208,1569,5054.0,200:30,0.0,20160417.0,,0,7.0,1,...,0,0,0,1,0.85,200,30,1,False,7e-06
8,623832,4442,7643.0,100:50,99.0,20160429.0,,0,5.0,0,...,0,1,0,0,0.5,100,50,1,False,3.3e-05
9,918744,7555,9871.0,30:5,99.0,20160415.0,,0,5.0,0,...,0,1,0,0,0.833333,30,5,1,False,0.005296


In [41]:
from sklearn.metrics import roc_auc_score, accuracy_score

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))

print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.763, Accuracy: 0.955


In [24]:
targetset = df_test.copy()

print(targetset.shape)

targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])

test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]

print(test1.shape)

(306313, 19)
(306313, 15)


In [25]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)

print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [26]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label

out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("baseline.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.098141
1,1000020_8192_20160513,0.077435
2,1000065_1455_20160527,0.055963
3,1000085_8067_20160513,0.050333
4,1000086_2418_20160613,0.040375
