In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
import random
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import gc
from scipy.stats import skew, boxcox
from scipy.stats.mstats import gmean
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
# Read the pre-built feature matrices and the label vector from data_path.
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
# The label file is flattened to a 1-D array so it can be indexed by row position.
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
# Row count of the training frame; later cells use it to split a stacked
# train+test frame back into its two parts.
ntrain = train_X.shape[0]
# Keep the test identifiers now, before any later cell transforms the columns.
sub_id = test_X.listing_id.values
# A single pre-formatted string prints identically on Python 2 and Python 3
# (the bare `print a, b, c` statement is a SyntaxError on Python 3).
print('{0} {1} {2}'.format(train_X.shape, test_X.shape, train_y.shape))

(49352, 322) (74659, 322) (49352L,)


In [3]:
# Attach the per-listing time stamp to both frames, keyed on listing_id.
time_feature = pd.read_csv(data_path + 'listing_image_time.csv')
time_feature.columns = ['listing_id','time_stamp']

rows_before_merge = (train_X.shape[0], test_X.shape[0])
train_X = train_X.merge(time_feature,on='listing_id',how='left')
test_X = test_X.merge(time_feature,on='listing_id',how='left')

# A left merge adds rows whenever listing_id repeats in time_feature. That
# would silently break the positional alignment with train_y and sub_id,
# which were taken before the merge, so fail loudly instead.
assert (train_X.shape[0], test_X.shape[0]) == rows_before_merge, \
    "merge changed the number of rows; listing_id is not unique in time_feature"

# print(...) with one argument behaves the same on Python 2 and Python 3.
print(train_X.shape)
print(test_X.shape)

(49352, 323)
(74659, 323)


In [4]:
# Stack train and test so every transform below is fitted on both together.
full_data = pd.concat([train_X, test_X])
# NOTE: this is *every* column of train_X, so identifier-like columns such as
# listing_id are transformed along with the real features.
features_to_use = train_X.columns.values

# Skewness of each column, computed on its non-missing values only.
skewed_cols = full_data[features_to_use].apply(lambda column: skew(column.dropna()))

SSL = preprocessing.StandardScaler()
# Keep only the names of the columns whose skewness exceeds 0.25.
skewed_cols = skewed_cols[skewed_cols > 0.25].index.values

for skewed_col in skewed_cols:
    # Shift the column so its minimum becomes 1: Box-Cox needs positive input.
    shifted = full_data[skewed_col] - full_data[skewed_col].min() + 1
    # lam is the fitted Box-Cox lambda; it is not used afterwards.
    full_data[skewed_col], lam = boxcox(shifted)

for col in features_to_use:
    # The scaler expects a 2-D array, hence the single-column reshape.
    column_2d = full_data[col].values.reshape(-1, 1)
    full_data[col] = SSL.fit_transform(column_2d)
    # Write the transformed values back: the first ntrain rows belong to
    # train_X, the remainder to test_X.
    train_X[col] = full_data[col].iloc[:ntrain]
    test_X[col] = full_data[col].iloc[ntrain:]

# The stacked copy is no longer needed.
del full_data



In [4]:
# Hold out 20% of the training rows for scoring the C searches below;
# the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_X, train_y, train_size=.80, random_state=1234)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(X_train.shape)
print(X_val.shape)

(39481, 322)
(9871, 322)


In [5]:
# Search the inverse regularisation strength C for one-vs-rest logistic
# regression with the lbfgs solver, scored by log-loss on the hold-out split.
logreg = LogisticRegression(multi_class = 'ovr',solver = 'lbfgs',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_ovr_lbfgs is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1,3]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_ovr_lbfgs = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_ovr_lbfgs))

score: 0.607441650321	C: 0.001	Time: 0.3min
score: 0.602299176254	C: 0.003	Time: 0.5min
score: 0.600219508974	C: 0.010	Time: 0.8min
score: 0.599425613119	C: 0.030	Time: 1.2min
score: 0.599538246159	C: 0.100	Time: 1.9min
score: 0.600854537522	C: 0.300	Time: 2.8min
score: 0.60265340933	C: 1.000	Time: 4.4min
score: 0.604083830029	C: 3.000	Time: 5.4min
C:0.030


In [6]:
# Search the inverse regularisation strength C for one-vs-rest logistic
# regression with the sag solver, scored by log-loss on the hold-out split.
# NOTE(review): the output recorded under this cell contains a C=10 row that
# the list below does not produce, so that output comes from an earlier run.
logreg = LogisticRegression(multi_class = 'ovr',solver = 'sag',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_ovr_sag is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1,3]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_ovr_sag = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_ovr_sag))

score: 0.607440212881	C: 0.001	Time: 0.1min
score: 0.60229957472	C: 0.003	Time: 0.1min
score: 0.600224624787	C: 0.010	Time: 0.3min
score: 0.599436395677	C: 0.030	Time: 0.9min
score: 0.599522410698	C: 0.100	Time: 2.1min
score: 0.600747665864	C: 0.300	Time: 4.3min
score: 0.602525576646	C: 1.000	Time: 8.4min
score: 0.603449824267	C: 3.000	Time: 13.6min
score: 0.604114104	C: 10.000	Time: 14.0min
C:0.030


In [7]:
# Search the inverse regularisation strength C for one-vs-rest logistic
# regression with the newton-cg solver, scored by log-loss on the hold-out split.
logreg = LogisticRegression(multi_class = 'ovr',solver = 'newton-cg',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_ovr_newton is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_ovr_newton = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_ovr_newton))

score: 0.607439473809	C: 0.001	Time: 0.4min
score: 0.602297376877	C: 0.003	Time: 0.5min
score: 0.600221208788	C: 0.010	Time: 0.6min
score: 0.599431156636	C: 0.030	Time: 0.9min
score: 0.599543550764	C: 0.100	Time: 1.2min
score: 0.600847368746	C: 0.300	Time: 1.7min
score: 0.602621046919	C: 1.000	Time: 2.8min
C:0.030


In [8]:
# Search the inverse regularisation strength C for one-vs-rest logistic
# regression with the liblinear solver, scored by log-loss on the hold-out split.
# NOTE(review): scikit-learn documents n_jobs as ignored for the liblinear
# solver, so n_jobs = 6 presumably has no effect here -- confirm against the
# installed version.
logreg = LogisticRegression(multi_class = 'ovr',solver = 'liblinear',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_ovr_liblinear is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_ovr_liblinear = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_ovr_liblinear))

score: 0.637789283435	C: 0.001	Time: 0.1min
score: 0.61187146867	C: 0.003	Time: 0.2min
score: 0.60219040783	C: 0.010	Time: 0.3min
score: 0.599778339024	C: 0.030	Time: 0.4min
score: 0.599588403551	C: 0.100	Time: 0.6min
score: 0.600837392602	C: 0.300	Time: 0.9min
score: 0.602611227563	C: 1.000	Time: 1.2min
C:0.100


In [9]:
# Search the inverse regularisation strength C for multinomial logistic
# regression with the lbfgs solver, scored by log-loss on the hold-out split.
logreg = LogisticRegression(multi_class = 'multinomial',solver = 'lbfgs',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_multinomial_lbfgs is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_multinomial_lbfgs = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_multinomial_lbfgs))

score: 0.599532004941	C: 0.001	Time: 0.3min
score: 0.596628363578	C: 0.003	Time: 0.4min
score: 0.595804094488	C: 0.010	Time: 0.7min
score: 0.595786124507	C: 0.030	Time: 1.1min
score: 0.596885316922	C: 0.100	Time: 1.6min
score: 0.598492150375	C: 0.300	Time: 2.6min
score: 0.600523460812	C: 1.000	Time: 4.0min
C:0.030


In [10]:
# Search the inverse regularisation strength C for multinomial logistic
# regression with the sag solver, scored by log-loss on the hold-out split.
logreg = LogisticRegression(multi_class = 'multinomial',solver = 'sag',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_multinomial_sag is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_multinomial_sag = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_multinomial_sag))

score: 0.599530446452	C: 0.001	Time: 0.1min
score: 0.596635438637	C: 0.003	Time: 0.2min
score: 0.59580380439	C: 0.010	Time: 0.5min
score: 0.595785644181	C: 0.030	Time: 1.2min
score: 0.596844285732	C: 0.100	Time: 2.8min
score: 0.598410461283	C: 0.300	Time: 5.0min
score: 0.599961322512	C: 1.000	Time: 7.3min
C:0.030


In [11]:
# Search the inverse regularisation strength C for multinomial logistic
# regression with the newton-cg solver, scored by log-loss on the hold-out split.
logreg = LogisticRegression(multi_class = 'multinomial',solver = 'newton-cg',
                            n_jobs = 6, max_iter=10000000,tol = 1e-4,
                            random_state = seed)
# Start from +inf rather than a magic number so that any finite score
# replaces it and C_multinomial_newton is bound before the final print.
best_score = float('inf')
for C in [1e-3,3e-3,1e-2,3e-2,1e-1,3e-1,1]:
    start = time.time()
    logreg.set_params(C=C)
    logreg.fit(X_train,y_train)
    pred_y = logreg.predict_proba(X_val)
    score = log_loss(y_val, pred_y)
    if score < best_score:
        C_multinomial_newton = C
        best_score = score
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('score: {0}\tC: {1:.3f}\tTime: {2:.1f}min'.format(score, C, (time.time()-start)/60))
print('C:{0:.3f}'.format(C_multinomial_newton))

score: 0.599531042992	C: 0.001	Time: 0.4min
score: 0.596633476024	C: 0.003	Time: 0.5min
score: 0.595802096672	C: 0.010	Time: 0.6min
score: 0.595787123687	C: 0.030	Time: 0.9min
score: 0.596880941226	C: 0.100	Time: 1.5min
score: 0.598460730286	C: 0.300	Time: 2.3min
score: 0.6004649384	C: 1.000	Time: 3.6min
C:0.030


In [5]:
def LR_blend(est, train_x, train_y, test_x, fold):
    """Produce out-of-fold and test-set class probabilities for stacking.

    Every estimator in ``est`` is refitted once per fold. Its predictions on
    the held-out rows fill that estimator's columns of the training blend
    matrix, and its predictions on ``test_x`` are averaged over the folds
    (arithmetic and geometric mean).

    Parameters
    ----------
    est : sequence of estimators
        Objects exposing ``fit(X, y)`` and ``predict_proba(X)``. They are
        fitted in place.
    train_x : DataFrame
        Training features; rows are selected with ``.iloc``.
    train_y : array-like
        Training labels, one per row of ``train_x``.
    test_x : DataFrame or array
        Test features, passed straight to ``predict_proba``.
    fold : int
        Number of K-fold splits.

    Returns
    -------
    tuple
        ``(train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,
        best_rounds)``. The three blend matrices have
        ``N_class * len(est)`` columns, grouped per estimator. ``scores`` is
        ``(fold, len(est))`` and holds the per-fold validation log-loss.
        ``best_rounds`` has the same shape and is returned as all zeros; this
        function never writes to it.
    """
    N_params = len(est)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    # shuffle is left at its default, so the folds are consecutive row blocks.
    # random_state is deliberately not passed: it has no effect without
    # shuffling, and current scikit-learn raises a ValueError for that
    # combination.
    skf = KFold(n_splits=fold)
    # Positional indexing below needs a plain array, not a labelled Series.
    train_y = np.asarray(train_y)
    N_class = len(set(train_y))
    n_test = test_x.shape[0]

    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((n_test, N_class*N_params))
    test_blend_x_gmean = np.zeros((n_test, N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))

    for j, ester in enumerate(est):
        print ("Model %d:" %(j+1))
        # Test predictions of every fold, laid out fold after fold with
        # N_class columns each.
        test_blend_x_j = np.zeros((n_test, N_class*fold))
        # Column slice owned by estimator j in the three blend matrices.
        model_cols = slice(j*N_class, (j+1)*N_class)

        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time()
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]

            ester.fit(train_x_fold,train_y_fold)

            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            # Formatted as one string: a two-item print(...) shows up as a
            # tuple on Python 2.
            print ("Score: %s" % score)
            scores[i,j]=score

            train_blend_x[val_index, model_cols] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = ester.predict_proba(test_x)

            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))

        # View the fold-major columns as (row, fold, class) and reduce over
        # the fold axis. This works for any number of classes instead of the
        # previously hard-coded three.
        per_fold = test_blend_x_j.reshape(n_test, fold, N_class)
        test_blend_x_mean[:, model_cols] = per_fold.mean(axis=1)
        test_blend_x_gmean[:, model_cols] = gmean(per_fold, axis=1)

        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    # This is the mean of all per-fold, per-model scores, not the score of an
    # actual blended prediction.
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [7]:
# One multinomial lbfgs model at C = 0.030, with a tighter tolerance (1e-7)
# than the 1e-4 used in the C searches above.
final_logreg = LogisticRegression(
    C=0.030,
    solver='lbfgs',
    multi_class='multinomial',
    n_jobs=6,
    max_iter=10000000,
    tol=1e-7,
    random_state=seed,
)
est = [final_logreg]

# 5-fold out-of-fold predictions on train_X plus fold-averaged predictions
# on test_X.
blend_result = LR_blend(est, train_X, train_y, test_X, 5)
(train_blend_x_LR,
 test_blend_x_LR_mean,
 test_blend_x_LR_gmean,
 blend_scores_LR,
 best_rounds_LR) = blend_result



Blend 1 estimators for 5 folds
Model 1:
Model 1 fold 1
('Score: ', 0.55651923438442019)
Model 1 fold 1 fitting finished in 61.522s
Model 1 fold 2
('Score: ', 0.56003855948810499)
Model 1 fold 2 fitting finished in 62.697s
Model 1 fold 3
('Score: ', 0.57988217922476992)
Model 1 fold 3 fitting finished in 64.531s
Model 1 fold 4
('Score: ', 0.59871670602243976)
Model 1 fold 4 fitting finished in 61.646s
Model 1 fold 5
('Score: ', 0.60288089749911122)
Model 1 fold 5 fitting finished in 66.374s
Score for model 1 is 0.579608
Score for blended models is 0.579608


In [8]:

now = datetime.now()
# Minute-resolution tag shared by the three output file names.
run_stamp = str(now.strftime("%Y-%m-%d-%H-%M"))

name_train_blend = '../output/train_blend_LR_last_' + run_stamp + '.csv'
name_test_blend_mean = '../output/test_blend_LR_mean_last_' + run_stamp + '.csv'
name_test_blend_gmean = '../output/test_blend_LR_gmean_last_' + run_stamp + '.csv'

# Mean validation log-loss per model, averaged over the folds.
print (np.mean(blend_scores_LR,axis=0))
# print (np.mean(best_rounds_LR,axis=0))

# Write each blend matrix as comma-separated text.
for blend_path, blend_matrix in [(name_train_blend, train_blend_x_LR),
                                 (name_test_blend_mean, test_blend_x_LR_mean),
                                 (name_test_blend_gmean, test_blend_x_LR_gmean)]:
    np.savetxt(blend_path, blend_matrix, delimiter=",")

[ 0.57960752]


In [15]:
# Submission file name; reuses the `now` timestamp set when the blend files
# were saved.
sub_name = '../output/sub_LR_BM_0322_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv'

# The last three columns are the fold-averaged probabilities of the last model.
# NOTE(review): the labels assume predict_proba returns the classes in the
# order low, medium, high -- confirm against the label encoding.
out_df = pd.DataFrame(test_blend_x_LR_mean[:,-3:],
                      columns=["low", "medium", "high"])
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [16]:
# Display the last three columns of the fold-averaged test predictions
# (the same slice that is written to the submission file).
test_blend_x_LR_mean[:,-3:]

array([[  4.99825948e-01,   4.14264109e-01,   8.59099439e-02],
       [  9.61754449e-01,   3.54476789e-02,   2.79787173e-03],
       [  8.95388169e-01,   8.92753393e-02,   1.53364921e-02],
       ..., 
       [  8.98833551e-01,   8.57164167e-02,   1.54500320e-02],
       [  9.87695048e-01,   1.21304643e-02,   1.74487682e-04],
       [  6.25250605e-01,   3.37126019e-01,   3.76233760e-02]])

In [17]:
# Display every blended column for the first test row.
test_blend_x_LR_mean[0]

array([ 0.52045174,  0.39842677,  0.08112149,  0.52045357,  0.39843122,
        0.0811152 ,  0.52045527,  0.39843378,  0.08111095,  0.51907442,
        0.3980157 ,  0.08290988,  0.49980675,  0.41427613,  0.08591712,
        0.49982491,  0.41426329,  0.08591179,  0.49982595,  0.41426411,
        0.08590994])

In [None]:
# [ 0.5998131   0.59981265  0.59981346  0.59993284  0.59603874  0.59603833
#   0.59603854]