In [18]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# Load the offline training and test tables from the working directory.
dfoff = pd.read_csv('train_offline.csv')
dftest = pd.read_csv('test_offline.csv')

# Keep only the test rows whose Coupon_id is not NaN, then renumber the index from 0.
dftest = dftest[dftest['Coupon_id'].notna()].reset_index(drop=True)

# Print (rows, columns) of each table, one per line.
print(dfoff.shape, dftest.shape, sep='\n')
#dftest.head(20)

(1160742, 7)
(306313, 6)


In [19]:
## Create target label
"""
According to the definition:
1) coupon received and used within 15 days (inclusive) ==> 1
2) coupon received but used after more than 15 days, or never used ==> 0
3) purchase without a coupon (no Date_received) ==> -1 (we don't care)
"""
def label(row):
    """Return the target label for one offline record.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide 'Date_received' and 'Date'. Each is either missing
        (NaN/None) or a date encoded as a YYYYMMDD number such as 20160528
        or 20160528.0.

    Returns
    -------
    int
        -1 if no coupon was received ('Date_received' is missing),
         1 if a purchase happened no more than 15 days after the coupon
           was received,
         0 otherwise (purchase later than 15 days, or no purchase at all).
    """
    if pd.isna(row['Date_received']):
        return -1
    if pd.isna(row['Date']):
        return 0

    # The values may be floats (the columns hold NaN), and a float stringifies
    # as '20160528.0', which does not match '%Y%m%d'. Going through int() first
    # yields a clean 8-digit string regardless of the input's numeric type.
    received = pd.to_datetime(str(int(row['Date_received'])), format='%Y%m%d')
    used = pd.to_datetime(str(int(row['Date'])), format='%Y%m%d')

    return 1 if used - received <= pd.Timedelta(15, 'D') else 0

# Compute the label for every training row; label() is called once per row (axis=1).
dfoff["label"] = dfoff.apply(label, axis=1)
# Display how many rows received each label value.
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [33]:
# Generate features - weekday on which the coupon was received
def getWeekday(row):
    """Convert a YYYYMMDD-encoded date into a weekday number 1..7.

    Parameters
    ----------
    row : number
        A single date value such as 20160528 or 20160528.0. Missing values
        (NaN/None) and the sentinel -1 are passed through unchanged.

    Returns
    -------
    int, or the unchanged input
        1 for Monday through 7 for Sunday; the input itself when it is
        missing or equal to -1.
    """
    if pd.isna(row) or row == -1:
        return row
    # int() drops a trailing '.0' so float-typed dates still match '%Y%m%d'.
    parsed = pd.to_datetime(str(int(row)), format="%Y%m%d")
    # dayofweek is 0 (Monday) .. 6 (Sunday); shift to 1 .. 7.
    return parsed.dayofweek + 1

# Weekday (1 = Monday .. 7 = Sunday) on which each coupon was received.
dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type: 1 when the coupon was received on Saturday (6) or Sunday (7), else 0.
# The comparison is numeric rather than on str() forms: the train column shows up as
# floats (7.0, 6.0, ...) while the test column was matched as ints, and isin()
# treats 6/6.0 and 7/7.0 alike, so one expression works for both frames.
# Missing weekdays (NaN) are never in the list and therefore map to 0.
dfoff['weekday_type'] = dfoff['weekday'].isin([6, 7]).astype(int)  # train set
dftest['weekday_type'] = dftest['weekday'].isin([6, 7]).astype(int)  # test set

In [30]:
# Display how many training rows fall on each weekday value.
dfoff['weekday'].value_counts()

7.0    151239
5.0    125726
1.0    119318
6.0    108135
4.0     91726
3.0     76305
2.0     74520
Name: weekday, dtype: int64

In [31]:
# Weekend flag for the training set: 1 for Saturday (6) or Sunday (7), else 0.
# Numeric isin() matches 6/6.0 and 7/7.0 alike, so it does not depend on whether
# the weekday column is stored as float or int; NaN weekdays map to 0.
dfoff['weekday_type'] = dfoff['weekday'].isin([6, 7]).astype(int)

In [35]:
# Display how many test rows are flagged as weekend (1) versus not (0).
dftest['weekday_type'].value_counts()

0    212862
1     93451
Name: weekday_type, dtype: int64