In [20]:
import utils
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from transformer import DataFrameSelecter
from transformer import DiscountConverter
from transformer import DateToWeekConverter
from transformer import CategoryConverter

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Machine learning models
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC

# model selection
from sklearn.model_selection import cross_val_predict

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

## Acquire data

In [2]:
train = utils.load_data('ccf_offline_stage1_train.csv')
test = utils.load_data('ccf_offline_stage1_test_revised.csv')

In [3]:
train.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,
5,1439408,2632,,,0.0,,20160516.0
6,1439408,2632,8591.0,20:1,0.0,20160516.0,20160613.0
7,1832624,3381,7610.0,200:20,0.0,20160429.0,
8,2029232,3381,11951.0,200:20,1.0,20160129.0,
9,2029232,450,1532.0,30:5,0.0,20160530.0,


In [4]:
train['Date_received'] =  pd.to_datetime(train['Date_received'], format='%Y%m%d')
test['Date_received'] =  pd.to_datetime(train['Date_received'], format='%Y%m%d')
train['Date'] =  pd.to_datetime(train['Date'], format='%Y%m%d')

## Make labels

In [5]:
train['label'] = 1
train.loc[train['Date'].isnull() | train['Coupon_id'].isnull(), 'label'] = 0
train.loc[(train['Date'] - train['Date_received']).dt.days > 15, 'label'] = 0

In [6]:
train.loc[train['label'] == 1].head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
33,1113008,1361,11166.0,20:1,0.0,2016-05-15,2016-05-21,1
38,2881376,8390,7531.0,20:5,0.0,2016-03-21,2016-03-29,1
69,114747,6901,2366.0,30:5,0.0,2016-05-23,2016-06-05,1
76,114747,5341,111.0,30:5,0.0,2016-02-07,2016-02-18,1
77,114747,5341,7751.0,50:10,0.0,2016-01-27,2016-01-28,1


## Build custom transformers for data preprocessing pipelines

**Convert "Date_received" to weekdays, and then to onehot matrix.**

In [8]:
date_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Date_received'])),
    ('week_converter', DateToWeekConverter()),
    ('one_hot', OneHotEncoder(categories="auto")),
])


**Fill NA with 0 in "Discount_rate" feature,  make elements in the same format, and scale them.**

In [9]:
discount_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Discount_rate'])),
    ('discount_converter', DiscountConverter()),
    ('std_scaler', StandardScaler()),
])

In [10]:
Merchant_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Merchant_id'])),
    ('Category_converter', CategoryConverter(scale=1800)),
    ('one_hot', OneHotEncoder(categories="auto")),
])

In [11]:
distance_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Distance'])),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [12]:
full_pipeline = FeatureUnion(transformer_list=[
    ("date_pipeline", date_pipeline),
    ("discount_pipeline", discount_pipeline),
    ("merchant_pipeline", Merchant_pipeline),
    ("distance_pipeline", distance_pipeline),
])

In [13]:
y_train = train["label"]
X_train = full_pipeline.fit_transform(train)

## Select and train a model
**Linear Regression--Underfitting**

In [15]:
log_reg = LogisticRegression(solver='lbfgs')
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
y_train_pred = cross_val_predict(log_reg, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

array([[1690489,       0],
       [  64395,       0]])

In [17]:
# valid predict
y_valid_pred = log_reg.predict_proba(X_train)
train_copy = train.copy()
train_copy['Probability'] = y_valid_pred[:, 1]
train_copy.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Probability
0,1439408,2632,,,0.0,NaT,2016-02-17,0,1.3e-05
1,1439408,4663,11002.0,150:20,1.0,2016-05-28,NaT,0,0.069155
2,1439408,2632,8591.0,20:1,0.0,2016-02-17,NaT,0,0.078463
3,1439408,2632,1078.0,20:1,0.0,2016-03-19,NaT,0,0.064781
4,1439408,2632,8591.0,20:1,0.0,2016-06-13,NaT,0,0.066553


In [18]:
# avgAUC calculation
vg = train_copy.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1] 
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['Probability'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

0.5527043392107402


In [None]:
# test prediction for submission
y_test_pred = log_reg.predict_proba(full_pipeline.fit_transform(test))
test_copy = test[['User_id','Coupon_id','Date_received']].copy()
test_copy['Probability'] = y_test_pred[:,1]
test_copy.head(5)

In [None]:
test_copy.to_csv('submit1.csv', index=False, header=False)

**Try XGBoost**

In [32]:
params = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'min_child_weight': 5,
    'learning_rate': 0.2,
    'silent': False,
    'n_estimater': 140,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,
}

In [33]:
xgb_cls = XGBClassifier(**params)
xgb_cls.fit(X_train, y_train)

[11:45:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:14] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:14] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:45:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:17] /work

[11:45:50] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[11:45:51] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[11:45:51] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[11:45:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[11:45:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:45:53] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[11:45:54] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:45:54] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[11:45:55] /work

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=5, min_child_weight=5, missing=None, n_estimater=140,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=False,
       subsample=0.8)

In [34]:
y_train_pred = cross_val_predict(xgb_cls, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

[11:46:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=5
[11:46:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:46:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:46:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:46:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:46:14] /work

[11:46:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[11:46:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[11:46:37] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:37] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[11:46:37] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[11:46:38] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:38] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[11:46:38] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:46:39] /work

[11:47:05] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:06] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:06] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:47:06] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:47:07] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:07] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:07] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:08] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:08] /work

[11:47:34] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[11:47:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:47:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:47:37] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[11:47:37] /work

[11:47:59] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[11:48:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:48:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[11:48:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[11:48:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[11:48:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[11:48:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[11:48:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[11:48:02] /work

array([[1690489,       0],
       [  64395,       0]])

In [35]:
# valid predict
y_valid_pred = xgb_cls.predict_proba(X_train)
train_copy = train.copy()
train_copy['Probability'] = y_valid_pred[:, 1]
train_copy.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Probability
0,1439408,2632,,,0.0,NaT,2016-02-17,0,4e-06
1,1439408,4663,11002.0,150:20,1.0,2016-05-28,NaT,0,0.006796
2,1439408,2632,8591.0,20:1,0.0,2016-02-17,NaT,0,0.184095
3,1439408,2632,1078.0,20:1,0.0,2016-03-19,NaT,0,0.157576
4,1439408,2632,8591.0,20:1,0.0,2016-06-13,NaT,0,0.168905


In [36]:
# avgAUC calculation
vg = train_copy.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1] 
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['Probability'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

0.5560653150065362


In [37]:
# test prediction for submission
y_test_pred = log_reg.predict_proba(full_pipeline.fit_transform(test))
test_copy = test[['User_id','Coupon_id','Date_received']].copy()
test_copy['Probability'] = y_test_pred[:,1]
test_copy.head(5)
test_copy.to_csv('submit1.csv', index=False, header=False)