In [4]:
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
from scipy.stats.mstats import gmean
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from datetime import datetime
from sklearn.model_selection import train_test_split,StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
seed = 1234


# Load Data

In [5]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BrandenMurray_MedianBedroom.csv')
test_X = pd.read_csv(data_path + 'test_BrandenMurray_MedianBedroom.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 287) (74659, 287) (49352L,)


In [6]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.20, random_state = seed)

In [7]:
def ET_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    scores=[]
    est=ExtraTreesClassifier(max_features=max_features,
                             n_estimators=int(n_estimators),
                             min_samples_leaf =int(min_samples_leaf), 
                             criterion = 'entropy',
                             random_state=seed,
                             n_jobs = -1
                            )
    est.fit(X_train, y_train)
    y_val_pred = est.predict_proba(X_val)
    return -1*log_loss(y_val, y_val_pred)

In [9]:
cv_score = -1
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    score = ET_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.3 	-0.603937657957
0.4 	-0.622711911023
0.5 	-0.609805158959
0.6 	-0.620139916576
0.7 	-0.608383714823
0.8 	-0.617317898006
0.9 	-0.612761189877
1 	-0.658023328662


In [10]:
for x in [0.1,0.2,'auto','log2']:
    score = ET_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.1 	-0.638429411457
0.2 	-0.623223152226
auto 	-0.631299802673
log2 	-0.650444374764


In [11]:
print max_features

0.3


In [13]:
for x in [1,2,4,8,16,32,64,128]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.603937657957
2 	-0.576007779722
4 	-0.572413065549
8 	-0.576798293478
16 	-0.587354675884
32 	-0.599323294189
64 	-0.611293128098
128 	-0.625450065185


In [14]:
for x in [3,5,6,7,9,10,11,12]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

3 	-0.572104066681
5 	-0.571667766452
6 	-0.573685250577
7 	-0.57509504389
9 	-0.578505105907
10 	-0.580123730665
11 	-0.581393499063
12 	-0.582830807356


In [15]:
print min_samples_leaf

5


In [21]:
def ET_blend(est, train_x, train_y, test_x, fold):
    N_params = len(est)
    print "Blend %d estimators for %d folds" % (N_params, fold)
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    
    
    for j, ester in enumerate(est):
        print "Model %d:" %(j+1)
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))

            
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print "Model %d fold %d" %(j+1,i+1)
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]            
            

            ester.fit(train_x_fold,train_y_fold)
            
            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print "Score: ", score
            scores[i,j]=score            
            
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = ester.predict_proba(test_x)
            
            print "Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start)            

        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print "Score for model %d is %f" % (j+1,np.mean(scores[:,j]))
    print "Score for blended models is %f" % (np.mean(scores))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [17]:
est = [ExtraTreesClassifier(max_features=max_features,
                            n_estimators=1000,
                            min_samples_leaf = min_samples_leaf,
                            random_state=seed,
                            criterion = 'entropy',
                            n_jobs = -1
                           )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = ET_blend(est,
                             train_X,train_y,
                             test_X,
                             10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
('Score: ', 0.55170774316627136)
Model 1 fold 1 fitting finished in 167.387s
Model 1 fold 2
('Score: ', 0.54149776180905385)
Model 1 fold 2 fitting finished in 173.202s
Model 1 fold 3
('Score: ', 0.56723251072059366)
Model 1 fold 3 fitting finished in 172.735s
Model 1 fold 4
('Score: ', 0.54895035176984375)
Model 1 fold 4 fitting finished in 170.088s
Model 1 fold 5
('Score: ', 0.57535589214003824)
Model 1 fold 5 fitting finished in 172.863s
Model 1 fold 6
('Score: ', 0.56204278905622762)
Model 1 fold 6 fitting finished in 164.422s
Model 1 fold 7
('Score: ', 0.57410113258596085)
Model 1 fold 7 fitting finished in 163.573s
Model 1 fold 8
('Score: ', 0.58483274022333875)
Model 1 fold 8 fitting finished in 162.785s
Model 1 fold 9
('Score: ', 0.58288353973543916)
Model 1 fold 9 fitting finished in 159.840s
Model 1 fold 10
('Score: ', 0.56708969088917782)
Model 1 fold 10 fitting finished in 165.490s
Score for model 1 is 0.565569
Score f

In [18]:
test_blend_x_RFC_mean

array([[  5.46850370e-01,   3.78710881e-01,   7.44387485e-02],
       [  9.93331851e-01,   6.53604642e-03,   1.32102677e-04],
       [  9.68425959e-01,   2.73195356e-02,   4.25450540e-03],
       ..., 
       [  9.46063717e-01,   4.39288465e-02,   1.00074361e-02],
       [  9.61860896e-01,   3.76029055e-02,   5.36198801e-04],
       [  4.98460605e-01,   4.25297371e-01,   7.62420243e-02]])

In [19]:
test_blend_x_RFC_gmean

array([[ 0.54661768,  0.37837382,  0.07414224],
       [ 0.99333094,  0.00640681,  0.        ],
       [ 0.96841079,  0.02692635,  0.0041188 ],
       ..., 
       [ 0.9460548 ,  0.04385678,  0.00970935],
       [ 0.961815  ,  0.03632114,  0.        ],
       [ 0.49788434,  0.42486431,  0.07448173]])

In [20]:
now = datetime.now()

name_train_blend = '../blend/train_blend_ET_entropy_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_ET_entropy_mean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_ET_entropy_gmean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.56556942]


In [25]:
def ET_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    scores=[]
    est=ExtraTreesClassifier(max_features=max_features,
                             n_estimators=int(n_estimators),
                             min_samples_leaf =int(min_samples_leaf), 
                             criterion = 'gini',
                             random_state=seed,
                             n_jobs = -1
                            )
    est.fit(X_train, y_train)
    y_val_pred = est.predict_proba(X_val)
    return -1*log_loss(y_val, y_val_pred)

In [26]:
cv_score = -1
for x in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,'auto','log2']:
    score = ET_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.1 	-0.629294760249
0.2 	-0.631772985001
0.3 	-0.628260934991
0.4 	-0.615957812746
0.5 	-0.614557935106
0.6 	-0.61590186524
0.7 	-0.629413614269
0.8 	-0.625482701518
0.9 	-0.617293433006
auto 	-0.646283785577
log2 	-0.644216588865


In [27]:
print max_features

0.5


In [29]:
for x in [1,2,4,8,12,16,20,24]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.614557935106
2 	-0.578045843514
4 	-0.572260661543
8 	-0.573923608629
12 	-0.578935544627
16 	-0.582588098236
20 	-0.585643672577


KeyboardInterrupt: 

In [33]:
for x in [3,5,6,7]:
    score = ET_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

3 	-0.573015767467
5 	-0.56995350617
6 	-0.57049588564
7 	-0.571851081208


In [34]:
print min_samples_leaf

5


In [37]:
est = [ExtraTreesClassifier(max_features=max_features,
                              n_estimators=1000,
                              min_samples_leaf = min_samples_leaf,
                              random_state=seed,
                              criterion = 'gini',
                              n_jobs = 4
                             )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = ET_blend(est,
                              train_X,train_y,
                              test_X,
                              10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
Score:  0.554186618271
Model 1 fold 1 fitting finished in 287.781s
Model 1 fold 2
Score:  0.539666939708
Model 1 fold 2 fitting finished in 310.746s
Model 1 fold 3
Score:  0.564736300085
Model 1 fold 3 fitting finished in 308.157s
Model 1 fold 4
Score:  0.546048003999
Model 1 fold 4 fitting finished in 318.666s
Model 1 fold 5
Score:  0.572164065849
Model 1 fold 5 fitting finished in 325.941s
Model 1 fold 6
Score:  0.560933121861
Model 1 fold 6 fitting finished in 323.094s
Model 1 fold 7
Score:  0.572334917682
Model 1 fold 7 fitting finished in 335.343s
Model 1 fold 8
Score:  0.583351278786
Model 1 fold 8 fitting finished in 341.519s
Model 1 fold 9
Score:  0.580610602016
Model 1 fold 9 fitting finished in 330.280s
Model 1 fold 10
Score:  0.566026340134
Model 1 fold 10 fitting finished in 337.290s
Score for model 1 is 0.564006
Score for blended models is 0.564006


In [38]:
now = datetime.now()

name_train_blend = '../blend/train_blend_ET_gini_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_ET_gini_mean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_ET_gini_gmean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.56400582]


In [40]:
sub_name = '../output/sub_ET_gini_gmean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_RFC_gmean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)