In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./ml100marathon-02-01/"

In [2]:
train = pd.read_csv(os.path.join(DATA_ROOT,'train.csv'))
valid = pd.read_csv(os.path.join(DATA_ROOT,'valid.csv'))
print(train.shape)
print(train.head())
print(valid.shape)
print(valid.head())

(667753, 22)
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
0  1439408         2632     8591.0          20:1       0.0     20160217.0   
1  1439408         2632     1078.0          20:1       0.0     20160319.0   
2  2029232         3381    11951.0        200:20       1.0     20160129.0   
3  2223968         3381     9776.0          10:5       2.0     20160129.0   
4    73611         2099    12034.0        100:10      99.0     20160207.0   

   Date  label  weekday  weekday_type  ...  weekday_3  weekday_4  weekday_5  \
0   NaN      0      3.0             0  ...          1          0          0   
1   NaN      0      6.0             0  ...          0          0          0   
2   NaN      0      5.0             0  ...          0          0          1   
3   NaN      0      5.0             0  ...          0          0          1   
4   NaN      0      7.0             0  ...          0          0          0   

   weekday_6  weekday_7  discount_rate  discount_

In [3]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
original_feature = [ 'discount_rate',
                            'discount_type',
                            'discount_man', 
                            'discount_jian',
                            'Distance', 
                            'weekday', 
                            'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
predictors = original_feature

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [4]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(train[predictors].astype(float))
X_test = sc_X.transform(valid[predictors].astype(float))

In [5]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
#pca = PCA(n_components = 7)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
print(pd.DataFrame(explained_variance))

               0
0   1.720632e-01
1   1.507219e-01
2   9.997941e-02
3   9.361897e-02
4   8.910377e-02
5   8.684127e-02
6   8.536624e-02
7   8.524878e-02
8   7.542036e-02
9   5.508153e-02
10  6.554548e-03
11  3.451848e-29
12  7.429750e-30
13  5.180427e-35


In [6]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial', n_jobs=4, max_iter=1000)
classifier.fit(X_train, train['label'])

In [7]:
# Fitting Random Forest to the Training set
#from sklearn.ensemble import RandomForestClassifier

#classifier = RandomForestClassifier(
        #n_estimators = 100, 
        #max_depth = 10,
        #min_samples_split = 75,
        #min_samples_leaf =10,
        #max_features = 10,
        #criterion = 'entropy',
        #oob_score = True,
        #n_jobs = 4,
        #verbose = 1
    #)
#classifier.fit(X_train, train['label'])

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    4.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [8]:
# Predicting the Test set results
valid_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(valid['label'], valid_pred)
print(cm)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


[[75356    28]
 [ 3830     2]]


In [9]:
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)

targetset = dftest.copy()
print(targetset.shape)
print(targetset.head())

targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()
y_test = pca.transform(testset[predictors])

y_test_pred = classifier.predict_proba(y_test)
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)
print(test1.head())

(306313, 19)
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
0  1439408         4663    11002.0        150:20       1.0     20160528.0   
1  1439408         2632     8591.0          20:1       0.0     20160613.0   
2  1439408         2632     8591.0          20:1       0.0     20160516.0   
3  2029232          450     1532.0          30:5       0.0     20160530.0   
4  2029232         6459    12737.0          20:1       0.0     20160519.0   

   weekday  weekday_type  weekday_1  weekday_2  weekday_3  weekday_4  \
0        6             0          0          0          0          0   
1        1             0          1          0          0          0   
2        1             0          1          0          0          0   
3        1             0          1          0          0          0   
4        4             0          0          0          0          1   

   weekday_5  weekday_6  weekday_7  discount_rate  discount_man  \
0          0          1 

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


(306313, 15)
   discount_rate  discount_type  discount_man  discount_jian  Distance  \
0       0.866667              1           150             20       1.0   
1       0.950000              1            20              1       0.0   
2       0.950000              1            20              1       0.0   
3       0.833333              1            30              5       0.0   
4       0.950000              1            20              1       0.0   

   weekday  weekday_type  weekday_1  weekday_2  weekday_3  weekday_4  \
0        6             0          0          0          0          0   
1        1             0          1          0          0          0   
2        1             0          1          0          0          0   
3        1             0          1          0          0          0   
4        4             0          0          0          0          1   

   weekday_5  weekday_6  weekday_7  pred_prob  
0          0          1          0   0.086022  
1          0 

In [10]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)
print(output.head())

(306313, 4)
   User_id Coupon_id Date_received  pred_prob                     uid
0  1439408     11002      20160528   0.086022  1439408_11002_20160528
1  1439408      8591      20160613   0.013800   1439408_8591_20160613
2  1439408      8591      20160516   0.013800   1439408_8591_20160516
3  2029232      1532      20160530   0.054679   2029232_1532_20160530
4  2029232     12737      20160519   0.136206  2029232_12737_20160519


In [11]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv(os.path.join(DATA_ROOT,"results.csv"), header=["uid", "label"], index=False) # submission format
print(out.head())

                     uid     label
0  1000020_2705_20160519  0.063223
1  1000020_8192_20160513  0.065458
2  1000065_1455_20160527  0.054612
3  1000085_8067_20160513  0.023103
4  1000086_2418_20160613  0.054679
