In [2]:
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
from scipy.stats.mstats import gmean
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from datetime import datetime
from sklearn.model_selection import train_test_split,StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
seed = 1234


# Load Data

In [3]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
ntrain = train_X.shape[0]
sub_id = test_X.listing_id.values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 322) (74659, 322) (49352L,)


In [4]:
def ET_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    est=ExtraTreesClassifier(max_features=max_features,
                             n_estimators=int(n_estimators),
                             min_samples_leaf =int(min_samples_leaf), 
                             criterion = 'entropy',
                             random_state=seed,
                             n_jobs = 6
                            )

    return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 3).mean()

In [5]:
cv_score = -1
for x in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    score = ET_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.1 	-0.606145850928
0.2 	-0.598976213009
0.3 	-0.596018278121
0.4 	-0.594810806938
0.5 	-0.591151809197
0.6 	-0.593804569906
0.7 	-0.594340507518
0.8 	-0.594383725642
0.9 	-0.594736579705
1 	-0.642873927547


In [6]:
print max_features

0.5


In [7]:
for x in [1,2,4,8,16,32,64,128]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.591151809197
2 	-0.583280831735
4 	-0.577758058787
8 	-0.578656786087
16 	-0.584061718777
32 	-0.592266411972
64 	-0.602040922509
128 	-0.612803007464


In [8]:
for x in [3,5,6,7]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

3 	-0.578292154103
5 	-0.576917152211
6 	-0.576789848408
7 	-0.577561868746


In [9]:
print min_samples_leaf

6


In [10]:
def ET_blend(est, train_x, train_y, test_x, fold):
    N_params = len(est)
    print "Blend %d estimators for %d folds" % (N_params, fold)
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    
    
    for j, ester in enumerate(est):
        print "Model %d:" %(j+1)
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))

            
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print "Model %d fold %d" %(j+1,i+1)
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]            
            

            ester.fit(train_x_fold,train_y_fold)
            
            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print "Score: ", score
            scores[i,j]=score            
            
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = ester.predict_proba(test_x)
            
            print "Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start)            

        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print "Score for model %d is %f" % (j+1,np.mean(scores[:,j]))
    print "Score for blended models is %f" % (np.mean(scores))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [11]:
est = [ExtraTreesClassifier(max_features=max_features,
                            n_estimators=1000,
                            min_samples_leaf = min_samples_leaf,
                            random_state=seed,
                            criterion = 'entropy',
                            n_jobs = 6
                           )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = ET_blend(est,
                             train_X,train_y,
                             test_X,
                             10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
Score:  0.556161701043
Model 1 fold 1 fitting finished in 325.352s
Model 1 fold 2
Score:  0.542124875793
Model 1 fold 2 fitting finished in 341.401s
Model 1 fold 3
Score:  0.564869260906
Model 1 fold 3 fitting finished in 347.870s
Model 1 fold 4
Score:  0.546396369297
Model 1 fold 4 fitting finished in 358.871s
Model 1 fold 5
Score:  0.568791604466
Model 1 fold 5 fitting finished in 360.665s
Model 1 fold 6
Score:  0.559385475422
Model 1 fold 6 fitting finished in 366.037s
Model 1 fold 7
Score:  0.572073103473
Model 1 fold 7 fitting finished in 401.628s
Model 1 fold 8
Score:  0.584279045041
Model 1 fold 8 fitting finished in 399.332s
Model 1 fold 9
Score:  0.58056300491
Model 1 fold 9 fitting finished in 374.735s
Model 1 fold 10
Score:  0.566028800812
Model 1 fold 10 fitting finished in 381.840s
Score for model 1 is 0.564067
Score for blended models is 0.564067


In [12]:
now = datetime.now()

name_train_blend = '../blend/train_blend_ET_entropy_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_ET_entropy_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_ET_entropy_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.56406732]


In [13]:
def ET_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    est=ExtraTreesClassifier(max_features=max_features,
                             n_estimators=int(n_estimators),
                             min_samples_leaf =int(min_samples_leaf), 
                             criterion = 'gini',
                             random_state=seed,
                             n_jobs = 6
                            )
    return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 3).mean()

In [14]:
cv_score = -1
for x in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    score = ET_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.1 	-0.612052632788
0.2 	-0.604165834253
0.3 	-0.603461005474
0.4 	-0.602285482056
0.5 	-0.601341062993
0.6 	-0.602959221223
0.7 	-0.607538019436
0.8 	-0.605593609964
0.9 	-0.60469466488


In [15]:
print max_features

0.5


In [16]:
for x in [1,2,4,8,12,16,20,24]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.601341062993
2 	-0.586324223833
4 	-0.579769353193
8 	-0.579132526203
12 	-0.582447661746
16 	-0.585585005619
20 	-0.587480969893
24 	-0.590039070743


In [17]:
for x in [5,6,7,9,10,11]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

5 	-0.579156368162
6 	-0.578568367824
7 	-0.578673505398
9 	-0.580514684195
10 	-0.581001651326
11 	-0.581541582001


In [18]:
print min_samples_leaf

6


In [19]:
est = [ExtraTreesClassifier(max_features=max_features,
                              n_estimators=1000,
                              min_samples_leaf = min_samples_leaf,
                              random_state=seed,
                              criterion = 'gini',
                              n_jobs = 6
                             )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = ET_blend(est,
                              train_X,train_y,
                              test_X,
                              10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
Score:  0.558474651054
Model 1 fold 1 fitting finished in 336.085s
Model 1 fold 2
Score:  0.544352585031
Model 1 fold 2 fitting finished in 342.855s
Model 1 fold 3
Score:  0.565533127364
Model 1 fold 3 fitting finished in 335.727s
Model 1 fold 4
Score:  0.553325991806
Model 1 fold 4 fitting finished in 341.101s
Model 1 fold 5
Score:  0.570063770407
Model 1 fold 5 fitting finished in 334.438s
Model 1 fold 6
Score:  0.559893594053
Model 1 fold 6 fitting finished in 340.051s
Model 1 fold 7
Score:  0.574254170202
Model 1 fold 7 fitting finished in 340.110s
Model 1 fold 8
Score:  0.586023329822
Model 1 fold 8 fitting finished in 334.625s
Model 1 fold 9
Score:  0.582437586067
Model 1 fold 9 fitting finished in 342.305s
Model 1 fold 10
Score:  0.568207713545
Model 1 fold 10 fitting finished in 370.251s
Score for model 1 is 0.566257
Score for blended models is 0.566257


In [20]:
now = datetime.now()

name_train_blend = '../blend/train_blend_ET_gini_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_ET_gini_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_ET_gini_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.56625665]


In [40]:
sub_name = '../output/sub_ET_gini_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_RFC_gmean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)