### Data preprocessing

In [1]:
# package which we need while importing data
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# set data_path
dir_data = './data/midterm-data/'
train_data_path = os.path.join(dir_data, 'train_offline.csv')
test_data_path = os.path.join(dir_data, 'test_offline.csv')

# read data from data_path
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
test_data = test_data[~test_data.Coupon_id.isna()]
test_data.reset_index(drop=True, inplace=True)
print("train_data shape: ", train_data.shape)
print("test_data shape: ", test_data.shape)

train_data shape:  (1160742, 7)
test_data shape:  (306313, 6)


In [3]:
train_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [4]:
test_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0


In [6]:
## creat target label 
def createLabel(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        interval = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if interval <= pd.Timedelta(15, 'D'):
            return 1
    return 0

train_data["label"] = train_data.apply(createLabel, axis=1)
train_data["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

### Feature Engineering

In [7]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

train_data['weekday'] = train_data['Date_received'].apply(getWeekday)
test_data['weekday'] = test_data['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
train_data['weekday_type'] = train_data['weekday'].apply(lambda x: 1 if x == 6 or x == 7 else 0 ) # apply to trainset
test_data['weekday_type'] = test_data['weekday'].apply(lambda x: 1 if x == 6 or x == 7 else 0 ) # apply to testset

In [8]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]

tmpdf = pd.get_dummies(train_data['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
train_data[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(test_data['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
test_data[weekdaycols] = tmpdf

In [9]:
train_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0,0,0,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,0,0,1,0,0,0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,0,0,0,0,0,1,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,0,0,0,0,1,0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,0,0,0,0,1,0,0


In [10]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

train_data = processData(train_data)
test_data = processData(test_data)

In [11]:
train_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1


### Fit the model

In [12]:
train_data = train_data[~train_data.label<0]

In [None]:
feature_chosen = ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 
                  'Distance', 'weekday', 'weekday_type', 
                  'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']

In [13]:
train_Y = train_data['label']
train_X = train_data[feature_chosen]

In [14]:
test_X = test_data[feature_chosen]

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=100, max_features=10,
                                  max_depth=6, learning_rate=0.03)
rf = RandomForestClassifier(n_estimators=80, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)

In [16]:
lr.fit(train_X, train_Y)
lr_pred = lr.predict_proba(test_X)[:,1]
lr_pred



array([0.01249483, 0.07370941, 0.07370941, ..., 0.0656722 , 0.04363549,
       0.07627815])

In [17]:
gdbt.fit(train_X, train_Y)
gdbt_pred = gdbt.predict_proba(test_X)[:,1]
gdbt_pred

array([0.02197406, 0.12877703, 0.12877703, ..., 0.04595844, 0.00804889,
       0.22314247])

In [18]:
rf.fit(train_X, train_Y)
rf_pred = rf.predict_proba(test_X)[:,1]
rf_pred

array([0.01510726, 0.13813094, 0.13813094, ..., 0.05405339, 0.01330937,
       0.19711022])

### Save the result

In [19]:
def createUid(row):
    return str(row['User_id']) + '_' + str(int(row['Coupon_id'])) + '_' + str(int(row['Date_received']))

submit = pd.DataFrame()
submit['uid'] = test_data.apply(createUid, axis=1)
predict = lr_pred * 0.4 + gdbt_pred * 0.3 + rf_pred * 0.3                          
submit['label'] = predict

submit = submit.groupby("uid", as_index=False).mean()
submit.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.170692
1,1000020_8192_20160513,0.142802
2,1000065_1455_20160527,0.103743
3,1000085_8067_20160513,0.105389
4,1000086_2418_20160613,0.10887


In [20]:
submit.shape

(304096, 2)

In [21]:
submit.to_csv(os.path.join(dir_data, 'midterm.csv'), index=False)