In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

DATA_ROOT = "./data/ml100marathon-02-01/"

In [2]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [5]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [6]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [7]:
def processCoupon(df):
    t =  df[['User_id']]
    t['all_coupon_count'] = 1
    t = t.groupby('User_id').agg('sum').reset_index()
    df = pd.merge(df, t, on='User_id', how='left')

    t = df[['User_id','Coupon_id']]
    t['same_coupon_count'] = 1
    t = t.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
    df = pd.merge(df, t, on=['User_id','Coupon_id'], how='left')
    
    df = df[~np.isnan(df['Coupon_id'])].copy()
    df[np.isnan(df['all_coupon_count'])] = 0
    df[np.isnan(df['same_coupon_count'])] = 0
    
    return df

dfoff = processCoupon(dfoff)
dftest = processCoupon(dftest)

In [8]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [9]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [10]:
from sklearn.metrics import roc_auc_score, accuracy_score

def make_validation(model, valid, predictors):
    y_valid_pred = model.predict_proba(valid[predictors])
    auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
    acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
    print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

In [11]:
def make_output(out_file, model, dftest, predictors):
    targetset = dftest.copy()
    print('targetset.shape:', targetset.shape)
    targetset = targetset[~targetset.Coupon_id.isna()]
    targetset.reset_index(drop=True, inplace=True)

    y_test_pred = model.predict_proba(targetset[predictors])
    test1 = targetset[predictors].copy()
    test1['pred_prob'] = y_test_pred[:, 1]
    print('test1.shape:', test1.shape)
    
    output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
    print('output.shape:', output.shape)

    output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
    output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)
    
    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]
    out.to_csv(out_file, header=["uid", "label"], index=False) # submission format

In [12]:
def check_model_SGD(data, predictors):
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [13]:
def check_model_GDBT_RS(data, predictors, n_iter, cv):
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import RandomizedSearchCV
    
    clf = GradientBoostingClassifier()
    
    param_dist = {'n_estimators': [250, 500, 1000], 
                 'learning_rate': [0.01, 0.03, 0.05],
                 'min_samples_split': [2, 4, 6],
                 'min_samples_leaf': [3, 5, 7]}
    
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter, cv=cv, iid=False, verbose=20, n_jobs=-1)
    random_search.fit(data[predictors], data['label'])
    return random_search

In [14]:
def check_model_LR(data, predictors):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
    lr.fit(data[predictors], data['label'])
    return lr

In [15]:
def check_model_RF(data, predictors):
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                                max_features='sqrt', max_depth=6, bootstrap=True)
    rf.fit(data[predictors], data['label'])
    return rf

In [16]:
def check_model_GDBT(data, predictors):
    from sklearn.ensemble import GradientBoostingClassifier
    gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=100,
                                      max_depth=6, learning_rate=0.03)
    gdbt.fit(data[predictors], data['label'])
    return gdbt

In [17]:
def check_model_XGB(data, predictors):
    from xgboost import  XGBClassifier
    xgb =XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
           colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
           max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
           n_estimators=5, n_jobs=1, nthread=None, objective='binary:logistic',
           random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
           seed=None, silent=None, subsample=1, verbosity=1)
    xgb.fit(data[predictors], data['label'])
    return xgb

In [18]:
def check_model_Stacking(data, predictors, classifiers):
    from sklearn.ensemble import GradientBoostingClassifier
    from mlxtend.classifier import StackingClassifier
    meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.70, n_estimators=50, 
                                               max_features='sqrt', max_depth=4, learning_rate=0.3)
    stacking = StackingClassifier(classifiers=classifiers, 
                                  use_probas=True, 
                                  average_probas=False, 
                                  meta_classifier=meta_estimator)
    stacking.fit(data[predictors], data['label'])
    return stacking

In [52]:
predictors = [
    'Merchant_id',
    'Coupon_id',
    'discount_rate',
    'discount_type',
    'discount_man', 
    'discount_jian',
    'Distance', 
    'weekday', 
    'weekday_type',
    'all_coupon_count',
    'same_coupon_count'
]

In [41]:
def check_model_XGB2(data, predictors):
    from xgboost import  XGBClassifier
    xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
           colsample_bynode=0.9, colsample_bytree=0.9, gamma=0, learning_rate=0.1,
           max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
           n_estimators=250, n_jobs=-1, objective='binary:logistic',
           random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
           seed=None, silent=None, subsample=0.9, verbosity=1)
    xgb.fit(data[predictors], data['label'])
    return xgb

In [42]:
name = 'xgb2'
model = check_model_XGB2(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
xgb = model

Validation AUC: 0.848, Accuracy: 0.953
targetset.shape: (306313, 21)
test1.shape: (306313, 11)
output.shape: (306313, 4)


In [53]:
name = 'sdg'
model = check_model_SGD(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
sdg = model

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   10.7s finished


Validation AUC: 0.820, Accuracy: 0.952
targetset.shape: (306313, 21)
test1.shape: (306313, 12)
output.shape: (306313, 4)


In [54]:
name = 'lr'
model = check_model_LR(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
lr = model

Validation AUC: 0.723, Accuracy: 0.952
targetset.shape: (306313, 21)
test1.shape: (306313, 12)
output.shape: (306313, 4)


In [55]:
name = 'rf'
model = check_model_RF(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
rf = model

Validation AUC: 0.842, Accuracy: 0.953
targetset.shape: (306313, 21)
test1.shape: (306313, 12)
output.shape: (306313, 4)


In [56]:
name = 'gdbt'
model = check_model_GDBT(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
gdbt = model

Validation AUC: 0.841, Accuracy: 0.954
targetset.shape: (306313, 21)
test1.shape: (306313, 12)
output.shape: (306313, 4)


In [57]:
name = 'xgb'
model = check_model_XGB(train, predictors)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
xgb = model

Validation AUC: 0.821, Accuracy: 0.953
targetset.shape: (306313, 21)
test1.shape: (306313, 12)
output.shape: (306313, 4)


In [46]:
name = 'stacking_rf_gdbt'
classifiers = [rf, gdbt]
model = check_model_Stacking(train, predictors, classifiers)
make_validation(model, valid, predictors)
make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)

Validation AUC: 0.829, Accuracy: 0.952
targetset.shape: (306313, 21)
test1.shape: (306313, 11)
output.shape: (306313, 4)


In [34]:
predictors = [
    'discount_rate',
    'discount_type',
    'discount_man', 
    'discount_jian',
    'Distance', 
    'weekday', 
    'weekday_type',
    'all_coupon_count',
    'same_coupon_count'
]

In [48]:
# name = 'gdbt_rs'
# model = check_model_GDBT_RS(train, predictors, 5, 3)
# print(model.best_params_)
# make_validation(model, valid, predictors)
# make_output('Day_051_{}.csv'.format(name), model, dftest, predictors)
# gdbt_rs = model

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  6.1min
[Parallel(n_jobs=4)]: Done   9 out of  15 | elapsed:  9.0min remaining:  6.0min
[Parallel(n_jobs=4)]: Done  10 out of  15 | elapsed: 11.8min remaining:  5.9min
[Parallel(n_jobs=4)]: Done  11 out of  15 | elapsed: 11.8min remaining:  4.3min
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed: 11.9min remaining:  3.0min
[Parallel(n_jobs=4)]: Done  13 out of  15 | elapsed: 12.0min remaining:  1.8min
[Parallel(n_jobs=4)]: Done  15 out of

{'n_estimators': 250, 'min_samples_split': 6, 'min_samples_leaf': 5, 'learning_rate': 0.03}
Validation AUC: 0.784, Accuracy: 0.952
targetset.shape: (306313, 19)
test1.shape: (306313, 15)
output.shape: (306313, 4)
