In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV,StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Notebook-wide random seed.
seed = 2017

# Load Data

In [2]:
# Feature matrices and labels are read from pre-built CSV files.
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
# Flatten the label frame into a 1-D array.
train_y = pd.read_csv(data_path + 'labels_BrandenMurray.csv').values.ravel()

# Number of training rows; used later to split the stacked train+test frame.
ntrain = train_X.shape[0]
# Keep the test listing ids as loaded, before any column of test_X is transformed.
sub_id = test_X['listing_id'].values

print(" ".join(map(str, (train_X.shape, test_X.shape, train_y.shape))))

(49352, 322) (74659, 322) (49352L,)


In [3]:
# Stack train and test so the transforms below are fitted on all rows together.
full_data = pd.concat([train_X, test_X])
features_to_use = train_X.columns.values

# Per-column skewness, computed with missing values dropped.
skewed_cols = full_data[features_to_use].apply(lambda x: skew(x.dropna()))
# Only columns whose skewness exceeds 0.25 receive a Box-Cox transform.
skewed_cols = skewed_cols[skewed_cols > 0.25].index.values

SSL = preprocessing.StandardScaler()

for skewed_col in skewed_cols:
    # Shift the column so its minimum becomes 1 before applying Box-Cox.
    shifted = full_data[skewed_col] - full_data[skewed_col].min() + 1
    full_data[skewed_col], lam = boxcox(shifted)

for col in features_to_use:
    # The scaler is refitted for each column on a single-column 2-D view.
    column_2d = full_data[col].values.reshape(-1, 1)
    full_data[col] = SSL.fit_transform(column_2d)
    # The first ntrain stacked rows are the training rows, the remainder the test rows.
    train_X[col] = full_data[col].iloc[:ntrain]
    test_X[col] = full_data[col].iloc[ntrain:]

# The stacked frame is no longer needed once both halves are written back.
del full_data



In [4]:
def KNN_cv(n_neighbors=5, leaf_size=30, min_samples_leaf  =1):
    """Return the mean 3-fold cross-validated neg-log-loss of a uniform-weighted KNN.

    The classifier is evaluated on the module-level ``train_X`` / ``train_y``.

    Parameters
    ----------
    n_neighbors : number of neighbours; coerced to ``int`` so float-valued
        inputs (e.g. from a continuous optimiser) are accepted.
    leaf_size : leaf size forwarded to the classifier; coerced to ``int``.
    min_samples_leaf : accepted for signature compatibility but not used.

    Returns
    -------
    Mean of the per-fold ``neg_log_loss`` scores (higher is better).
    """
    est=KNeighborsClassifier(n_neighbors=int(n_neighbors),
                             weights='uniform',
                             algorithm = 'auto',
                             # forward the caller's value instead of a fixed 30
                             leaf_size=int(leaf_size),
                             p=2,
                             metric='minkowski',
                             metric_params=None,
                             n_jobs = 6
                            )
    return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 3).mean()

In [5]:
# Coarse search over neighbourhood sizes: keep the k with the highest mean
# neg-log-loss. Starting from -inf (rather than a fixed -1) guarantees that the
# best candidate is always recorded, even if every score is below -1.
cv_score = -float('inf')
for x in [2,4,8,16,32,64,128,256,512,1024]:
    score = KNN_cv(n_neighbors = x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

2 	-7.09210073073
4 	-3.73297381782
8 	-1.83860387785
16 	-1.04193767543
32 	-0.751189283361
64 	-0.660328991117
128 	-0.639124405776
256 	-0.642574262549
512 	-0.647533513654
1024 	-0.658912240531


In [6]:
# Show the neighbour count selected by the search above.
print(n_neighbors)

128


In [4]:
def KNN_blend(est, train_x, train_y, test_x, fold):
    """Build out-of-fold and test-set class-probability features for blending.

    Every estimator in ``est`` is refitted on each K-fold training split. Its
    ``predict_proba`` output on the held-out rows fills that estimator's
    columns of the train blend matrix; its ``predict_proba`` output on
    ``test_x`` is collected for every fold and then combined across folds with
    an arithmetic mean and a geometric mean.

    Parameters
    ----------
    est : sequence of estimators exposing ``fit`` and ``predict_proba``.
    train_x : training rows; must support ``.shape`` and ``.iloc[index_array]``.
    train_y : labels for ``train_x``; must support indexing by an index array.
    test_x : rows to predict for every fold; must support ``.shape``.
    fold : number of K-fold splits.

    Returns
    -------
    tuple ``(train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores, best_rounds)``
        * ``train_blend_x``      - (n_train, N_class * n_estimators) out-of-fold probabilities
        * ``test_blend_x_mean``  - (n_test, N_class * n_estimators) fold-averaged probabilities
        * ``test_blend_x_gmean`` - same shape, geometric mean across folds
        * ``scores``             - (fold, n_estimators) log-loss on each held-out fold
        * ``best_rounds``        - (fold, n_estimators) zeros; never filled in here

    Notes
    -----
    Assumes every estimator's ``predict_proba`` returns exactly
    ``len(set(train_y))`` columns for every fold - TODO confirm for folds that
    might miss a class. The geometric-mean rows are not renormalised.
    """
    N_params = len(est)
    print("Blend %d estimators for %d folds" % (N_params, fold))
    # Contiguous, unshuffled folds. random_state is deliberately not passed:
    # it only has an effect together with shuffle=True, and recent
    # scikit-learn versions raise when it is set without shuffling.
    skf = KFold(n_splits=fold)
    N_class = len(set(train_y))

    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))

    for j, ester in enumerate(est):
        print("Model %d:" % (j+1))
        # Per-fold test predictions, laid out fold after fold (N_class columns each).
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))

        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print("Model %d fold %d" % (j+1,i+1))
            fold_start = time.time()
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]

            ester.fit(train_x_fold,train_y_fold)

            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print("Score:  %s" % score)
            scores[i,j]=score

            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = ester.predict_proba(test_x)

            print("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))

        # Combine the folds class by class, for however many classes there are.
        for c in range(N_class):
            # Columns c, c + N_class, ... hold class c's probability from each fold.
            fold_cols = np.arange(c, N_class*fold, N_class)
            class_probs = test_blend_x_j[:, fold_cols]
            test_blend_x_mean[:, j*N_class + c] = class_probs.mean(1)
            test_blend_x_gmean[:, j*N_class + c] = gmean(class_probs, axis=1)

        print("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [8]:
# One uniform-weighted KNN per neighbourhood size; every other setting is shared.
est = [KNeighborsClassifier(n_neighbors=k,
                            weights='uniform',
                            algorithm='auto',
                            leaf_size=30,
                            p=2,
                            metric='minkowski',
                            metric_params=None,
                            n_jobs=6)
       for k in (64, 128, 256, 512, 1024)]

# 10-fold blend across all five estimators.
train_blend_x_KNN, test_blend_x_KNN_mean, test_blend_x_KNN_gmean, \
    blend_scores_KNN, best_rounds_KNN = KNN_blend(est, train_X, train_y, test_X, 10)

Blend 5 estimators for 10 folds
Model 1:
Model 1 fold 1
Score:  0.654439692684
Model 1 fold 1 fitting finished in 718.582s
Model 1 fold 2
Score:  0.612574835236
Model 1 fold 2 fitting finished in 746.882s
Model 1 fold 3
Score:  0.660102400402
Model 1 fold 3 fitting finished in 725.755s
Model 1 fold 4
Score:  0.647354421878
Model 1 fold 4 fitting finished in 742.519s
Model 1 fold 5
Score:  0.699816061001
Model 1 fold 5 fitting finished in 749.233s
Model 1 fold 6
Score:  0.626427517339
Model 1 fold 6 fitting finished in 745.505s
Model 1 fold 7
Score:  0.63862112577
Model 1 fold 7 fitting finished in 750.755s
Model 1 fold 8
Score:  0.671911824548
Model 1 fold 8 fitting finished in 751.606s
Model 1 fold 9
Score:  0.654509098676
Model 1 fold 9 fitting finished in 741.490s
Model 1 fold 10
Score:  0.632495372862
Model 1 fold 10 fitting finished in 722.145s
Score for model 1 is 0.649825
Model 2:
Model 2 fold 1
Score:  0.631857869705
Model 2 fold 1 fitting finished in 753.191s
Model 2 fold 2
Sc

In [9]:
# Tag the output files with the current time (minute resolution).
now = datetime.now()
stamp = now.strftime("%Y-%m-%d-%H-%M")

name_train_blend = '../blend/train_blend_KNN_uniform_BM_0322_' + stamp + '.csv'
name_test_blend_mean = '../blend/test_blend_KNN_uniform_mean_BM_0322_' + stamp + '.csv'
name_test_blend_gmean = '../blend/test_blend_KNN_uniform_gmean_BM_0322_' + stamp + '.csv'

# Mean held-out log-loss per estimator (averaged over folds).
print(np.mean(blend_scores_KNN, axis=0))

# Write each blend matrix as a comma-separated text file.
for out_path, blend_matrix in ((name_train_blend, train_blend_x_KNN),
                               (name_test_blend_mean, test_blend_x_KNN_mean),
                               (name_test_blend_gmean, test_blend_x_KNN_gmean)):
    np.savetxt(out_path, blend_matrix, delimiter=",")

[ 0.64982524  0.63397235  0.6376017   0.6420375   0.65271384]


In [12]:
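# Disabled: optional submission file built from the first three columns of the gmean test blend.
# NOTE(review): the preprocessing cell rewrites every column of test_X, so the ids are taken
# from sub_id (captured right after loading) rather than from test_X.listing_id.
# sub_name = '../output/sub_KNN_uniform_gmean_BM_MB_add03052240_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(test_blend_x_KNN_gmean[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)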

In [4]:
def KNN_cv(n_neighbors=5, leaf_size=30, min_samples_leaf  =1):
    """Return the mean 3-fold cross-validated neg-log-loss of a distance-weighted KNN.

    The classifier is evaluated on the module-level ``train_X`` / ``train_y``.

    Parameters
    ----------
    n_neighbors : number of neighbours; coerced to ``int`` so float-valued
        inputs (e.g. from a continuous optimiser) are accepted.
    leaf_size : leaf size forwarded to the classifier; coerced to ``int``.
    min_samples_leaf : accepted for signature compatibility but not used.

    Returns
    -------
    Mean of the per-fold ``neg_log_loss`` scores (higher is better).
    """
    est=KNeighborsClassifier(n_neighbors=int(n_neighbors),
                             weights='distance',
                             algorithm = 'auto',
                             # forward the caller's value instead of a fixed 30
                             leaf_size=int(leaf_size),
                             p=2,
                             metric='minkowski',
                             metric_params=None,
                             n_jobs = -1
                            )
    return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 3).mean()

In [16]:
# First pass of the distance-weighted search: keep the k with the highest mean
# neg-log-loss. Starting from -inf (rather than a fixed -1) guarantees that the
# best candidate is always recorded, even if every score is below -1.
cv_score = -float('inf')
for x in [2,5,10,15,20,25,30]:
    score = KNN_cv(n_neighbors = x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

2 	-6.74028696187
5 	-2.84410674478
10 	-1.42473827356
15 	-1.02514569526
20 	-0.852458642738
25 	-0.795439441562
30 	-0.748694567391


In [17]:
# Continue the search with larger k; cv_score / n_neighbors carry over from
# the previous search cell and are only replaced by a strictly better score.
for x in (40, 50, 60, 80, 100, 125, 150):
    score = KNN_cv(n_neighbors=x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

40 	-0.687346768991
50 	-0.664688373783
60 	-0.6557283919
80 	-0.638479123695
100 	-0.63483641848
125 	-0.634912690654
150 	-0.634423405086


In [18]:
# Probe k from 200 to 400, still updating the running best (cv_score, n_neighbors).
for x in (200, 250, 300, 350, 400):
    score = KNN_cv(n_neighbors=x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

200 	-0.634442797659
250 	-0.636678241098
300 	-0.638700703964
350 	-0.637935333453
400 	-0.636769025963


In [19]:
# Fill in two intermediate values of k against the running best.
for x in (175, 225):
    score = KNN_cv(n_neighbors=x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

175 	-0.636009959328
225 	-0.635686166346


In [20]:
# Final refinement around the current best k.
for x in (130, 140, 160):
    score = KNN_cv(n_neighbors=x)
    if score > cv_score:
        n_neighbors, cv_score = x, score
    print("%s \t%s" % (x, score))

130 	-0.635504409
140 	-0.636607688128
160 	-0.634973522827


In [21]:
# Show the neighbour count that scored best across the search cells above.
print(n_neighbors)

150


In [5]:
# One distance-weighted KNN per neighbourhood size; every other setting is shared.
est = [KNeighborsClassifier(n_neighbors=k,
                            weights='distance',
                            algorithm='auto',
                            leaf_size=30,
                            p=2,
                            metric='minkowski',
                            metric_params=None,
                            n_jobs=6)
       for k in (64, 128, 256, 512, 1024)]

# 10-fold blend across all five estimators.
train_blend_x_KNN, test_blend_x_KNN_mean, test_blend_x_KNN_gmean, \
    blend_scores_KNN, best_rounds_KNN = KNN_blend(est, train_X, train_y, test_X, 10)

Blend 5 estimators for 10 folds
Model 1:
Model 1 fold 1
Score:  0.650497165873
Model 1 fold 1 fitting finished in 718.635s
Model 1 fold 2
Score:  0.609021997115
Model 1 fold 2 fitting finished in 723.306s
Model 1 fold 3
Score:  0.657031316184
Model 1 fold 3 fitting finished in 705.383s
Model 1 fold 4
Score:  0.643341596082
Model 1 fold 4 fitting finished in 728.448s
Model 1 fold 5
Score:  0.694507298754
Model 1 fold 5 fitting finished in 759.582s
Model 1 fold 6
Score:  0.621773534412
Model 1 fold 6 fitting finished in 784.528s
Model 1 fold 7
Score:  0.633047300826
Model 1 fold 7 fitting finished in 777.207s
Model 1 fold 8
Score:  0.668423050631
Model 1 fold 8 fitting finished in 778.556s
Model 1 fold 9
Score:  0.652535946646
Model 1 fold 9 fitting finished in 781.776s
Model 1 fold 10
Score:  0.628253396408
Model 1 fold 10 fitting finished in 775.940s
Score for model 1 is 0.645843
Model 2:
Model 2 fold 1
Score:  0.628534441397
Model 2 fold 1 fitting finished in 796.388s
Model 2 fold 2
S

In [6]:
# Tag the output files with the current time (minute resolution).
now = datetime.now()
stamp = now.strftime("%Y-%m-%d-%H-%M")

name_train_blend = '../blend/train_blend_KNN_distance_BM_0322_' + stamp + '.csv'
name_test_blend_mean = '../blend/test_blend_KNN_distance_mean_BM_0322_' + stamp + '.csv'
name_test_blend_gmean = '../blend/test_blend_KNN_distance_gmean_BM_0322_' + stamp + '.csv'

# Mean held-out log-loss per estimator (averaged over folds).
print(np.mean(blend_scores_KNN, axis=0))

# Write each blend matrix as a comma-separated text file.
for out_path, blend_matrix in ((name_train_blend, train_blend_x_KNN),
                               (name_test_blend_mean, test_blend_x_KNN_mean),
                               (name_test_blend_gmean, test_blend_x_KNN_gmean)):
    np.savetxt(out_path, blend_matrix, delimiter=",")

[ 0.64584326  0.63048202  0.63474597  0.63970193  0.65049482]
