In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
import random
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import gc
from scipy.stats import skew, boxcox
from scipy.stats.mstats import gmean
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
# Train/test feature tables and the label file all sit under data_path.
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BrandenMurray.csv')
test_X = pd.read_csv(data_path + 'test_BrandenMurray.csv')

# The labels are read as a DataFrame and flattened into a 1-D array.
labels_frame = pd.read_csv(data_path + 'labels_BrandenMurray.csv')
train_y = np.ravel(labels_frame)

# Row count of the training table; used later to split stacked train+test rows.
ntrain = len(train_X)
print('{0} {1} {2}'.format(train_X.shape, test_X.shape, train_y.shape))

(49352, 285) (74659, 285) (49352L,)


In [3]:
# Stack train on top of test so the transforms below are fitted on all rows.
full_data = pd.concat([train_X, test_X])
features_to_use = train_X.columns.values
# NOTE(review): every column of train_X is transformed below, including any
# identifier-like column (e.g. listing_id) if one is present - confirm intended.

# Per-column sample skewness, computed on the non-missing values only.
column_skew = full_data[features_to_use].apply(lambda values: skew(values.dropna()))

SSL = preprocessing.StandardScaler()

# Columns whose skewness exceeds 0.25 get a Box-Cox transform. Each one is
# shifted first so that its minimum becomes 1 (Box-Cox needs positive input).
skewed_cols = column_skew[column_skew > 0.25].index.values
for skewed_col in skewed_cols:
    shifted = full_data[skewed_col] - full_data[skewed_col].min() + 1
    full_data[skewed_col], lam = boxcox(shifted)

# Standardise column by column on the stacked data, then write the first
# ntrain rows back into train_X and the remaining rows back into test_X.
# This overwrites train_X / test_X in place, so the cell is not idempotent.
for col in features_to_use:
    column_2d = full_data[col].values.reshape(-1, 1)
    full_data[col] = SSL.fit_transform(column_2d)
    train_X[col] = full_data[col].iloc[:ntrain]
    test_X[col] = full_data[col].iloc[ntrain:]

# The stacked frame is not needed after the write-back.
del full_data



In [4]:
# Keep 80% of the training rows for fitting; the rest form the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    train_X, train_y,
    train_size=0.8,
    random_state=1234,
)
for split_part in (X_train, X_val):
    print(split_part.shape)

(39481, 285)
(9871, 285)


In [5]:
# One-vs-rest logistic regression fitted with the lbfgs solver.
logreg = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_ovr_lbfgs = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_ovr_lbfgs))

score: 0.607339530294	C: 0.010	Time: 20.939s
score: 0.606411472469	C: 0.030	Time: 28.818s
score: 0.606160340445	C: 0.100	Time: 49.535s
C:0.100


In [6]:
# One-vs-rest logistic regression fitted with the sag solver.
logreg = LogisticRegression(
    solver='sag',
    multi_class='ovr',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [3e-2, 1e-1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_ovr_sag = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_ovr_sag))

score: 0.606421331593	C: 0.030	Time: 45.458s
score: 0.606149658324	C: 0.100	Time: 94.441s
C:0.100


In [7]:
# One-vs-rest logistic regression fitted with the newton-cg solver.
logreg = LogisticRegression(
    solver='newton-cg',
    multi_class='ovr',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_ovr_newton = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_ovr_newton))

score: 0.607339756681	C: 0.010	Time: 19.949s
score: 0.606409001558	C: 0.030	Time: 23.836s
score: 0.606157874057	C: 0.100	Time: 33.457s
C:0.100


In [8]:
# One-vs-rest logistic regression fitted with the liblinear solver.
logreg = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1, 3e-1, 1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_ovr_liblinear = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_ovr_liblinear))

score: 0.609210424517	C: 0.010	Time: 11.418s
score: 0.606736376324	C: 0.030	Time: 14.851s
score: 0.606199866825	C: 0.100	Time: 23.248s
score: 0.606955690143	C: 0.300	Time: 35.063s
score: 0.608300467155	C: 1.000	Time: 51.217s
C:0.100


In [9]:
# Multinomial logistic regression fitted with the lbfgs solver.
logreg = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1, 3e-1, 1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_multinomial_lbfgs = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_multinomial_lbfgs))

score: 0.603533947713	C: 0.010	Time: 41.598s
score: 0.603452939182	C: 0.030	Time: 62.785s
score: 0.604198477269	C: 0.100	Time: 92.314s
score: 0.605567583798	C: 0.300	Time: 151.256s
score: 0.607752640094	C: 1.000	Time: 227.016s
C:0.030


In [10]:
# Multinomial logistic regression fitted with the sag solver.
logreg = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1, 3e-1, 1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_multinomial_sag = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_multinomial_sag))

score: 0.6035345059	C: 0.010	Time: 19.847s
score: 0.603457205755	C: 0.030	Time: 44.085s
score: 0.604154703692	C: 0.100	Time: 102.722s
score: 0.605499331947	C: 0.300	Time: 251.266s
score: 0.607534921899	C: 1.000	Time: 698.357s
C:0.030


In [11]:
# Multinomial logistic regression fitted with the newton-cg solver.
logreg = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    tol=1e-4,
    max_iter=10000000,
    n_jobs=-1,
    random_state=seed,
)

# Try each value of C and keep the one with the lowest validation log loss.
# best_score starts at 100, i.e. above any score expected here.
best_score = 100
for C in [1e-2, 3e-2, 1e-1, 3e-1, 1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train, y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        best_score, C_multinomial_newton = score, C
    # Elapsed time covers fitting, prediction and scoring for this C.
    elapsed = time.time() - start
    print('score: {0}\tC: {1:.3f}\tTime: {2:.3f}s'.format(score, C, elapsed))
print('C:{0:.3f}'.format(C_multinomial_newton))

score: 0.603527422588	C: 0.010	Time: 34.088s
score: 0.603450329015	C: 0.030	Time: 44.021s
score: 0.604181498409	C: 0.100	Time: 76.718s
score: 0.60556107096	C: 0.300	Time: 116.000s
score: 0.607652222914	C: 1.000	Time: 155.443s
C:0.030


In [12]:
def LR_blend(est, train_x, train_y, test_x, fold):
    """Produce out-of-fold and test-set class probabilities for stacking.

    Each estimator in ``est`` is refitted on every K-fold training split.
    Its predicted probabilities for the held-out rows fill that estimator's
    block of columns in the training blend matrix, and its predictions for
    ``test_x`` are combined across folds with an arithmetic mean and a
    geometric mean.

    Parameters
    ----------
    est : sequence
        Estimators providing ``fit(X, y)`` and ``predict_proba(X)``.
    train_x : pandas.DataFrame
        Training features (rows are selected with ``.iloc``).
    train_y : numpy.ndarray
        Training labels, indexed positionally by the fold indices.
    test_x : pandas.DataFrame or numpy.ndarray
        Test features, passed unchanged to ``predict_proba``.
    fold : int
        Number of K-fold splits.

    Returns
    -------
    tuple
        ``(train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,
        best_rounds)``. With ``N_class`` distinct labels in ``train_y``:

        * ``train_blend_x``: ``(n_train, N_class * len(est))`` out-of-fold
          probabilities; model ``j`` owns columns ``j*N_class:(j+1)*N_class``.
        * ``test_blend_x_mean`` / ``test_blend_x_gmean``: same column layout
          for the test rows, averaged over the folds. The geometric means
          are not renormalised, so a row block need not sum to one.
        * ``scores``: ``(fold, len(est))`` validation log loss per fold.
        * ``best_rounds``: ``(fold, len(est))`` zeros. Nothing in this
          function writes to it; it is kept so the return shape is unchanged.

    Notes
    -----
    Every training split must contain all ``N_class`` labels, otherwise
    ``predict_proba`` returns fewer columns than the blend matrices expect.
    """
    N_params = len(est)
    print("Blend %d estimators for %d folds" % (N_params, fold))

    # Plain, unshuffled K-fold (not stratified). random_state is deliberately
    # not passed: it has no effect without shuffling, and recent scikit-learn
    # versions raise a ValueError for that combination.
    kf = KFold(n_splits=fold)
    N_class = len(set(train_y))
    n_train = train_x.shape[0]
    n_test = test_x.shape[0]

    train_blend_x = np.zeros((n_train, N_class * N_params))
    test_blend_x_mean = np.zeros((n_test, N_class * N_params))
    test_blend_x_gmean = np.zeros((n_test, N_class * N_params))
    scores = np.zeros((fold, N_params))
    best_rounds = np.zeros((fold, N_params))

    for j, ester in enumerate(est):
        print("Model %d:" % (j + 1))
        # Column block owned by model j in all three blend matrices.
        cols = slice(j * N_class, (j + 1) * N_class)
        # Test-set probabilities of this model, one (n_test, N_class) slab per fold.
        test_fold_proba = np.zeros((n_test, fold, N_class))

        for i, (train_index, val_index) in enumerate(kf.split(train_x)):
            print("Model %d fold %d" % (j + 1, i + 1))
            fold_start = time.time()
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]

            ester.fit(train_x_fold, train_y_fold)

            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            # Single formatted string so the line prints as text, not as a tuple.
            print("Score: %f" % score)
            scores[i, j] = score

            train_blend_x[val_index, cols] = val_y_predict_fold
            test_fold_proba[:, i, :] = ester.predict_proba(test_x)

            print("Model %d fold %d fitting finished in %0.3fs" % (j + 1, i + 1, time.time() - fold_start))

        # Averaging over the fold axis works for any number of classes.
        test_blend_x_mean[:, cols] = test_fold_proba.mean(axis=1)
        test_blend_x_gmean[:, cols] = np.asarray(gmean(test_fold_proba, axis=1))

        print("Score for model %d is %f" % (j + 1, np.mean(scores[:, j])))
    # Mean of the individual fold scores over all models, not the score of a combined prediction.
    print("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores, best_rounds)

In [13]:
# One row per base model, in blend-column order: (C, solver, multi_class, tol).
# C comes from the tuning cells above; the last model uses a tighter tolerance.
lr_specs = [
    (C_ovr_lbfgs,          'lbfgs',     'ovr',         1e-5),
    (C_ovr_sag,            'sag',       'ovr',         1e-5),
    (C_ovr_newton,         'newton-cg', 'ovr',         1e-5),
    (C_ovr_liblinear,      'liblinear', 'ovr',         1e-5),
    (C_multinomial_lbfgs,  'lbfgs',     'multinomial', 1e-5),
    (C_multinomial_sag,    'sag',       'multinomial', 1e-5),
    (C_multinomial_newton, 'newton-cg', 'multinomial', 1e-7),
]

est = [
    LogisticRegression(C=c_value,
                       solver=solver_name,
                       multi_class=scheme,
                       n_jobs=-1, max_iter=10000000, tol=tolerance,
                       random_state=seed)
    for c_value, solver_name, scheme, tolerance in lr_specs
]

# Run every model through a 10-fold blend on the full training set.
(train_blend_x_LR,
 test_blend_x_LR_mean,
 test_blend_x_LR_gmean,
 blend_scores_LR,
 best_rounds_LR) = LR_blend(est, train_X, train_y, test_X, 10)



Blend 7 estimators for 10 folds
Model 1:
Model 1 fold 1
('Score: ', 0.58716933314206787)
Model 1 fold 1 fitting finished in 48.204s
Model 1 fold 2
('Score: ', 0.57548652721090132)
Model 1 fold 2 fitting finished in 60.988s
Model 1 fold 3
('Score: ', 0.6009083935438827)
Model 1 fold 3 fitting finished in 59.097s
Model 1 fold 4
('Score: ', 0.58383693096066502)
Model 1 fold 4 fitting finished in 55.266s
Model 1 fold 5
('Score: ', 0.61293538607663278)
Model 1 fold 5 fitting finished in 60.075s
Model 1 fold 6
('Score: ', 0.59124403454253738)
Model 1 fold 6 fitting finished in 57.650s
Model 1 fold 7
('Score: ', 0.59654576854108199)
Model 1 fold 7 fitting finished in 54.433s
Model 1 fold 8
('Score: ', 0.62344515463159278)
Model 1 fold 8 fitting finished in 57.143s
Model 1 fold 9
('Score: ', 0.61162546325132261)
Model 1 fold 9 fitting finished in 58.078s
Model 1 fold 10
('Score: ', 0.6149339841947159)
Model 1 fold 10 fitting finished in 55.708s
Score for model 1 is 0.599813
Model 2:
Model 2 fo

In [14]:

# A single timestamp is shared by the three file names built below.
now = datetime.now()
stamp = str(now.strftime("%Y-%m-%d-%H-%M"))

name_train_blend = '../output/train_blend_LR_BM_' + stamp + '.csv'
name_test_blend_mean = '../output/test_blend_LR_mean_BM_' + stamp + '.csv'
name_test_blend_gmean = '../output/test_blend_LR_gmean_BM_' + stamp + '.csv'

# Per-model means over the folds of the two arrays returned by LR_blend.
print(np.mean(blend_scores_LR, axis=0))
print(np.mean(best_rounds_LR, axis=0))

# Write each blend matrix as a comma-separated text file.
for out_path, matrix in [(name_train_blend, train_blend_x_LR),
                         (name_test_blend_mean, test_blend_x_LR_mean),
                         (name_test_blend_gmean, test_blend_x_LR_gmean)]:
    np.savetxt(out_path, matrix, delimiter=",")

[ 0.5998131   0.59981265  0.59981346  0.59993284  0.59603874  0.59603833
  0.59603854]
[ 0.  0.  0.  0.  0.  0.  0.]


In [19]:
sub_name = '../output/sub_LR_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# The submission uses the last three columns of the fold-averaged test matrix,
# i.e. the class probabilities of the last model in the blend.
# NOTE(review): the low/medium/high labelling assumes predict_proba columns
# follow that class order - confirm against the label encoding.
out_df = pd.DataFrame(test_blend_x_LR_mean[:, -3:], columns=["low", "medium", "high"])

# Read the ids from the source file instead of test_X. The preprocessing cell
# overwrites every train_X column inside test_X with Box-Cox / standard-scaled
# values, so test_X.listing_id no longer holds the raw ids if listing_id is one
# of those columns. Row order matches test_X, which was read from the same file.
raw_listing_id = pd.read_csv(data_path + 'test_BrandenMurray.csv',
                             usecols=['listing_id'])['listing_id'].values
assert len(raw_listing_id) == len(out_df), "listing_id / prediction row count mismatch"
out_df["listing_id"] = raw_listing_id
out_df.to_csv(sub_name, index=False)

In [16]:
# Fold-averaged test probabilities of the last model (final three columns).
test_blend_x_LR_mean[..., -3:]

array([[  4.99825948e-01,   4.14264109e-01,   8.59099439e-02],
       [  9.61754449e-01,   3.54476789e-02,   2.79787173e-03],
       [  8.95388169e-01,   8.92753393e-02,   1.53364921e-02],
       ..., 
       [  8.98833551e-01,   8.57164167e-02,   1.54500320e-02],
       [  9.87695048e-01,   1.21304643e-02,   1.74487682e-04],
       [  6.25250605e-01,   3.37126019e-01,   3.76233760e-02]])

In [17]:
# All blended columns (every model's class probabilities) for the first test row.
test_blend_x_LR_mean[0, :]

array([ 0.52045174,  0.39842677,  0.08112149,  0.52045357,  0.39843122,
        0.0811152 ,  0.52045527,  0.39843378,  0.08111095,  0.51907442,
        0.3980157 ,  0.08290988,  0.49980675,  0.41427613,  0.08591712,
        0.49982491,  0.41426329,  0.08591179,  0.49982595,  0.41426411,
        0.08590994])