In [2]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# Directory (relative to the notebook's working directory) that holds the
# train/valid/test CSV files read below and receives the results.csv output.
DATA_ROOT = "./ml100marathon-02-01/"

In [3]:
# Read the training and validation splits from DATA_ROOT.
train, valid = (
    pd.read_csv(os.path.join(DATA_ROOT, csv_name))
    for csv_name in ('train.csv', 'valid.csv')
)

# Quick sanity check: dimensions and first rows of each split.
for frame in (train, valid):
    print(frame.shape)
    print(frame.head())

(667753, 22)
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
0  1439408         2632     8591.0          20:1       0.0     20160217.0   
1  1439408         2632     1078.0          20:1       0.0     20160319.0   
2  2029232         3381    11951.0        200:20       1.0     20160129.0   
3  2223968         3381     9776.0          10:5       2.0     20160129.0   
4    73611         2099    12034.0        100:10      99.0     20160207.0   

   Date  label  weekday  weekday_type  ...  weekday_3  weekday_4  weekday_5  \
0   NaN      0      3.0             0  ...          1          0          0   
1   NaN      0      6.0             0  ...          0          0          0   
2   NaN      0      5.0             0  ...          0          0          1   
3   NaN      0      5.0             0  ...          0          0          1   
4   NaN      0      7.0             0  ...          0          0          0   

   weekday_6  weekday_7  discount_rate  discount_

In [4]:
# Indicator column names weekday_1 .. weekday_7.
weekdaycols = ['weekday_{}'.format(day) for day in range(1, 8)]

# Base feature columns; the weekday indicator columns are appended afterwards.
original_feature = [
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'Distance',
    'weekday',
    'weekday_type',
]
original_feature.extend(weekdaycols)

print(len(original_feature), original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [5]:
# Columns fed to every model below. This is an alias of original_feature
# (the same list object, not a copy).
predictors = original_feature

# Random forest estimator, referenced inside check_model
from sklearn.ensemble import RandomForestClassifier

# Logistic regression estimator, referenced inside check_model
# (already imported in the first cell; repeated here)
from sklearn.linear_model import LogisticRegression

def check_model(data, predictors, estimator='sgd', random_state=None):
    """Cross-validate a standardised classifier pipeline and return the search.

    The pipeline is ``StandardScaler`` followed by the chosen estimator. It is
    evaluated with 5-fold stratified cross-validation scored by ROC AUC via
    ``GridSearchCV``. The parameter grid is empty by default, so a single
    configuration is cross-validated and then refit on all of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``predictors`` plus a ``'label'``
        column used as the target.
    predictors : list of str
        Feature column names.
    estimator : {'sgd', 'rf', 'lr'}, default 'sgd'
        Which classifier to put in the pipeline: SGD with logistic loss,
        random forest, or logistic regression. The default reproduces the
        pipeline this function always built.
    random_state : int or None, default None
        Seed forwarded to the estimator and to the fold shuffling. ``None``
        keeps the previous non-deterministic behaviour.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported keys.
    """
    # Factories (not instances) so only the selected estimator is constructed.
    factories = {
        'sgd': lambda: SGDClassifier(
            # Logistic-regression loss. NOTE: newer scikit-learn releases
            # spell this 'log_loss'; switch the literal when upgrading.
            loss='log',
            penalty='l2',
            fit_intercept=True,
            # `n_iter=5` used to be passed here as well. That argument was
            # deprecated and later removed from SGDClassifier, and while it
            # was still accepted it took precedence over max_iter.
            max_iter=1350,
            alpha=0.001,
            l1_ratio=0.01,
            eta0=0.0,
            average=False,
            shuffle=True,
            n_jobs=4,
            class_weight=None,
            early_stopping=False,
            epsilon=0.1,
            learning_rate='optimal',
            power_t=0.5,
            random_state=random_state,
            warm_start=False,
            tol=0.001,
            verbose=1
        ),
        'rf': lambda: RandomForestClassifier(
            n_estimators=1000,
            max_depth=10,
            min_samples_split=75,
            min_samples_leaf=10,
            criterion='entropy',
            oob_score=True,
            n_jobs=4,
            random_state=random_state,
            verbose=1
        ),
        'lr': lambda: LogisticRegression(
            C=1.0,
            max_iter=1000,
            class_weight=None,
            fit_intercept=True,
            penalty='l2',
            n_jobs=4,
            tol=0.001,
            random_state=random_state,
            verbose=1
        ),
    }
    if estimator not in factories:
        raise ValueError(
            "estimator must be one of {}, got {!r}".format(sorted(factories), estimator)
        )

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', factories[estimator]())
    ])

    # Empty grid: one configuration is evaluated. Add 'en__<param>' entries to
    # search, for example:
    #   SGD: 'en__alpha': [0.001, 0.01, 0.1], 'en__max_iter': range(100, 2000, 250)
    #   RF:  'en__n_estimators': range(100, 200, 20), 'en__max_depth': range(10, 100, 10)
    parameters = {}

    folder = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1,
        scoring='roc_auc')
    grid_search = grid_search.fit(data[predictors], data['label'])
    print(grid_search.best_params_, grid_search.best_score_)

    return grid_search

In [8]:
print(train.head())
print(predictors)
# Disabled alternative: model = check_model(train, predictors)

# Keras pieces needed for the feed-forward network
import keras
from keras.models import Sequential
from keras.layers import Dense

# Both hidden layers are as wide as the input.
n_features = len(predictors)

# Network: input -> relu hidden -> relu hidden -> single sigmoid output unit
model = Sequential([
    Dense(units=n_features, kernel_initializer='uniform', activation='relu', input_dim=n_features),
    Dense(units=n_features, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

# Binary cross-entropy objective optimised with Adam; accuracy reported per epoch
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split (last expression, so the History object is shown)
model.fit(train[predictors], train['label'], batch_size=100, epochs=10)

   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
0  1439408         2632     8591.0          20:1       0.0     20160217.0   
1  1439408         2632     1078.0          20:1       0.0     20160319.0   
2  2029232         3381    11951.0        200:20       1.0     20160129.0   
3  2223968         3381     9776.0          10:5       2.0     20160129.0   
4    73611         2099    12034.0        100:10      99.0     20160207.0   

   Date  label  weekday  weekday_type  ...  weekday_3  weekday_4  weekday_5  \
0   NaN      0      3.0             0  ...          1          0          0   
1   NaN      0      6.0             0  ...          0          0          0   
2   NaN      0      5.0             0  ...          0          0          1   
3   NaN      0      5.0             0  ...          0          0          1   
4   NaN      0      7.0             0  ...          0          0          0   

   weekday_6  weekday_7  discount_rate  discount_man  discount

<keras.callbacks.History at 0x29b72faa828>

In [12]:
# Score the validation split. `predict` returns the sigmoid output of the
# network's single output unit; the older `Sequential.predict_proba` was a
# thin wrapper around it and is no longer part of the Keras API.
y_valid_pred = model.predict(valid[predictors], verbose=0)

# Work on a copy so `valid` itself is left unchanged.
valid1 = valid.copy()
# The prediction has one column per output unit (one here); flatten it so the
# new DataFrame column is an ordinary 1-D series.
valid1['pred_prob'] = np.asarray(y_valid_pred).ravel()
print(valid1.head())

   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
0  1832624         3381     7610.0        200:20       0.0     20160429.0   
1   163606         1569     5054.0        200:30      10.0     20160421.0   
2  4061024         3381     7610.0        200:20      10.0     20160426.0   
3   106443          450     3732.0          30:5      99.0     20160429.0   
4   114747         1569     5054.0        200:30       9.0     20160426.0   

   Date  label  weekday  weekday_type  ...  weekday_4  weekday_5  weekday_6  \
0   NaN      0      5.0             0  ...          0          1          0   
1   NaN      0      4.0             0  ...          1          0          0   
2   NaN      0      2.0             0  ...          0          0          0   
3   NaN      0      5.0             0  ...          0          1          0   
4   NaN      0      2.0             0  ...          0          0          0   

   weekday_7  discount_rate  discount_man  discount_jian  disc

In [14]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Flatten the network output to a 1-D vector of positive-class probabilities.
valid_prob = np.asarray(y_valid_pred).ravel()

auc_score = roc_auc_score(y_true=valid.label, y_score=valid_prob)

# The network has a single sigmoid output, so the predicted class comes from
# thresholding the probability at 0.5. (argmax over axis 1 of a one-column
# array is always 0, which would just report the share of negative labels.)
valid_class = (valid_prob > 0.5).astype(int)
acc = accuracy_score(y_true=valid.label, y_pred=valid_class)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.778, Accuracy: 0.952


In [16]:
# Load the test split and keep only rows that have a Coupon_id, with a fresh
# 0..n-1 index so later column-wise concatenation lines up row by row.
dftest = (
    pd.read_csv(os.path.join(DATA_ROOT, 'test.csv'))
    .dropna(subset=['Coupon_id'])
    .reset_index(drop=True)
)

# `targetset` keeps every column (the id columns are needed for the submission);
# `testset` holds only the model inputs.
targetset = dftest.copy()
print(targetset.shape)
testset = targetset[predictors].copy()

# `predict` returns the sigmoid output; it replaces the older
# `Sequential.predict_proba`, which is no longer part of the Keras API.
y_test_pred = model.predict(testset, verbose=0)

# Attach the flattened probabilities as a regular 1-D column.
test1 = testset.assign(pred_prob=np.asarray(y_test_pred).ravel())
print(test1.shape)

(306313, 19)
(306313, 15)


In [17]:
# Columns that together identify one submission row.
id_cols = ["User_id", "Coupon_id", "Date_received"]

# Put the identifier columns next to the predicted probability. Both frames
# share the same 0..n-1 index, so the concat is a row-by-row alignment.
output = pd.concat((targetset[id_cols], test1["pred_prob"]), axis=1)
print(output.shape)

# Render each identifier as an integer string (e.g. 8591.0 -> "8591").
# Plain column assignment replaces the column and its dtype, whereas
# `.loc[:, col] = ...` tries to write strings into the numeric column in place.
for col in id_cols:
    output[col] = output[col].astype('int64').astype(str)

# uid = "<User_id>_<Coupon_id>_<Date_received>", built with vectorised string
# concatenation instead of a Python-level join per row.
output["uid"] = output["User_id"] + "_" + output["Coupon_id"] + "_" + output["Date_received"]
output = output.reset_index(drop=True)
print(output.head())

(306313, 4)
   User_id Coupon_id Date_received  pred_prob                     uid
0  1439408     11002      20160528   0.017568  1439408_11002_20160528
1  1439408      8591      20160613   0.123949   1439408_8591_20160613
2  1439408      8591      20160516   0.123949   1439408_8591_20160516
3  2029232      1532      20160530   0.140006   2029232_1532_20160530
4  2029232     12737      20160519   0.151966  2029232_12737_20160519


In [18]:
### NOTE: the submission file must have exactly these columns: uid, label
# Several rows can share a uid; average their predicted probabilities.
# Only `pred_prob` is aggregated: the other columns hold strings, and taking
# the mean of string columns is an error in current pandas.
out = (
    output.groupby("uid", as_index=False)["pred_prob"]
    .mean()
    .rename(columns={"pred_prob": "label"})
)
# Columns are already named uid/label, so the default header is the submission format.
out.to_csv(os.path.join(DATA_ROOT, "results.csv"), index=False)
print(out.head())

                     uid     label
0  1000020_2705_20160519  0.193877
1  1000020_8192_20160513  0.153006
2  1000065_1455_20160527  0.086505
3  1000085_8067_20160513  0.106734
4  1000086_2418_20160613  0.140006
