In [1]:
import utils
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from transformer import DataFrameSelecter
from transformer import DiscountConverter
from transformer import DateToWeekConverter
from transformer import CategoryConverter

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# model selection
from sklearn.model_selection import cross_val_predict

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

## Acquire data

In [2]:
train = utils.load_data('ccf_offline_stage1_train.csv')
test = utils.load_data('ccf_offline_stage1_test_revised.csv')

In [3]:
train.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,
5,1439408,2632,,,0.0,,20160516.0
6,1439408,2632,8591.0,20:1,0.0,20160516.0,20160613.0
7,1832624,3381,7610.0,200:20,0.0,20160429.0,
8,2029232,3381,11951.0,200:20,1.0,20160129.0,
9,2029232,450,1532.0,30:5,0.0,20160530.0,


In [4]:
train['Date_received'] =  pd.to_datetime(train['Date_received'], format='%Y%m%d')
test['Date_received'] =  pd.to_datetime(train['Date_received'], format='%Y%m%d')
train['Date'] =  pd.to_datetime(train['Date'], format='%Y%m%d')

## Make labels

In [5]:
train['label'] = 1
train.loc[train['Date'].isnull() | train['Coupon_id'].isnull(), 'label'] = 0
train.loc[(train['Date'] - train['Date_received']).dt.days > 15, 'label'] = 0

In [6]:
train.loc[train['label'] == 1].head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
33,1113008,1361,11166.0,20:1,0.0,2016-05-15,2016-05-21,1
38,2881376,8390,7531.0,20:5,0.0,2016-03-21,2016-03-29,1
69,114747,6901,2366.0,30:5,0.0,2016-05-23,2016-06-05,1
76,114747,5341,111.0,30:5,0.0,2016-02-07,2016-02-18,1
77,114747,5341,7751.0,50:10,0.0,2016-01-27,2016-01-28,1


## Build custom transformers for data preprocessing pipelines

**Convert "Date_received" to weekdays, and then to onehot matrix.**

In [7]:
date_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Date_received'])),
    ('week_converter', DateToWeekConverter()),
    ('one_hot', OneHotEncoder(categories="auto")),
])


**Fill NA with 0 in "Discount_rate" feature,  make elements in the same format, and scale them.**

In [8]:
discount_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Discount_rate'])),
    ('discount_converter', DiscountConverter()),
    ('std_scaler', StandardScaler()),
])

In [9]:
Merchant_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Merchant_id'])),
    ('Category_converter', CategoryConverter(scale=1800)),
    ('one_hot', OneHotEncoder(categories="auto")),
])

In [10]:
distance_pipeline = Pipeline([
    ('selecter', DataFrameSelecter(attribute_names=['Distance'])),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])



In [11]:
full_pipeline = FeatureUnion(transformer_list=[
    ("date_pipeline", date_pipeline),
    ("discount_pipeline", discount_pipeline),
    ("merchant_pipeline", Merchant_pipeline),
    ("distance_pipeline", distance_pipeline),
])

In [12]:
y_train = train["label"]
X_train = full_pipeline.fit_transform(train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## Select and train a model
**Linear Regression--Underfitting**

In [13]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
y_train_pred = cross_val_predict(log_reg, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)



array([[1690489,       0],
       [  64395,       0]])

In [15]:
# valid predict
y_valid_pred = log_reg.predict_proba(X_train)
train_copy = train.copy()
train_copy['Probability'] = y_valid_pred[:, 1]
train_copy.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Probability
0,1439408,2632,,,0.0,NaT,2016-02-17,0,1.4e-05
1,1439408,4663,11002.0,150:20,1.0,2016-05-28,NaT,0,0.069154
2,1439408,2632,8591.0,20:1,0.0,2016-02-17,NaT,0,0.078456
3,1439408,2632,1078.0,20:1,0.0,2016-03-19,NaT,0,0.064777
4,1439408,2632,8591.0,20:1,0.0,2016-06-13,NaT,0,0.066549


In [16]:
# avgAUC calculation
vg = train_copy.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1] 
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['Probability'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

0.5527043392107402


In [19]:
# test prediction for submission
y_test_pred = log_reg.predict_proba(full_pipeline.fit_transform(test))
test_copy = test[['User_id','Coupon_id','Date_received']].copy()
test_copy['Probability'] = y_test_pred[:,1]
test_copy.head(5)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,User_id,Coupon_id,Date_received,Probability
0,4129537,9983,NaT,6e-06
1,6949378,3429,2016-05-28,0.113239
2,2166529,6928,2016-02-17,0.033596
3,2166529,1808,2016-03-19,0.027505
4,6172162,6500,2016-06-13,0.039128


In [20]:
test_copy.to_csv('submit1.csv', index=False, header=False)