In [1]:
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
from scipy.stats.mstats import gmean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from datetime import datetime
from sklearn.model_selection import train_test_split,StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
seed = 1234


# Load Data

In [2]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BrandenMurray.csv')
test_X = pd.read_csv(data_path + 'test_BrandenMurray.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 285) (74659, 285) (49352L,)


In [5]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.20, random_state = seed)

# clf = RandomForestClassifier(n_jobs = -1,n_estimators=300, criterion = 'entropy', verbose = 0)
# clf.fit(X_train, y_train)
# y_val_pred = clf.predict_proba(X_val)
# print log_loss(y_val, y_val_pred)

In [3]:
def RFC_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    scores=[]
    est=RandomForestClassifier(max_features=max_features,
                               n_estimators=int(n_estimators),
                               min_samples_leaf =int(min_samples_leaf), 
                               criterion = 'entropy',
                               random_state=seed,
                               n_jobs = -1
                              )
    est.fit(X_train, y_train)
    y_val_pred = est.predict_proba(X_val)
#     return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 5)
    return -1*log_loss(y_val, y_val_pred)

In [6]:
cv_score = -1
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    score = RFC_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.3 	-0.578121216465
0.4 	-0.57473153549
0.5 	-0.577376388709
0.6 	-0.57646717464
0.7 	-0.585500824571
0.8 	-0.586133484854
0.9 	-0.580981742551
1 	-0.589632151169


In [10]:
print max_features

0.4


In [11]:
for x in [1,2,4,8,16,32,64,128]:
    score = RFC_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.57473153549
2 	-0.571688759933
4 	-0.567230635952
8 	-0.570357651708
16 	-0.574486830271
32 	-0.582130462665
64 	-0.589836034895
128 	-0.599136595922


In [12]:
for x in [3,5,6,7]:
    score = RFC_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

3 	-0.570366056219
5 	-0.568638643593
6 	-0.568598403645
7 	-0.569294080983


In [16]:
print min_samples_leaf

4


In [22]:
def RFC_blend(est, train_x, train_y, test_x, fold):
    N_params = len(est)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    
    
    for j, ester in enumerate(est):
        print ("Model %d:" %(j+1))
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))

            
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]            
            

            ester.fit(train_x_fold,train_y_fold)
            
            val_y_predict_fold = ester.predict_proba(val_x_fold)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score            
            
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = ester.predict_proba(test_x)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            

        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [23]:
est = [RandomForestClassifier(max_features=max_features,
                              n_estimators=1000,
                              min_samples_leaf = min_samples_leaf,
                              random_state=seed,
                              criterion = 'entropy',
                              n_jobs = -1
                             )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = RFC_blend(est,
                              train_X,train_y,
                              test_X,
                              10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
('Score: ', 0.54855857454942647)
Model 1 fold 1 fitting finished in 222.945s
Model 1 fold 2
('Score: ', 0.53311941147018327)
Model 1 fold 2 fitting finished in 233.004s
Model 1 fold 3
('Score: ', 0.55880344668495285)
Model 1 fold 3 fitting finished in 231.613s
Model 1 fold 4
('Score: ', 0.53736370636075748)
Model 1 fold 4 fitting finished in 231.950s
Model 1 fold 5
('Score: ', 0.56584755133070141)
Model 1 fold 5 fitting finished in 233.167s
Model 1 fold 6
('Score: ', 0.55247709934800382)
Model 1 fold 6 fitting finished in 234.531s
Model 1 fold 7
('Score: ', 0.56669048224804919)
Model 1 fold 7 fitting finished in 233.572s
Model 1 fold 8
('Score: ', 0.58197117118126851)
Model 1 fold 8 fitting finished in 234.054s
Model 1 fold 9
('Score: ', 0.57888163092302869)
Model 1 fold 9 fitting finished in 228.586s
Model 1 fold 10
('Score: ', 0.56302004807477768)
Model 1 fold 10 fitting finished in 230.895s
Score for model 1 is 0.558673
Score f

In [24]:
test_blend_x_RFC_mean

array([[  5.61415829e-01,   3.52132174e-01,   8.64519966e-02],
       [  9.97842893e-01,   2.15710650e-03,   0.00000000e+00],
       [  9.22616056e-01,   6.66289502e-02,   1.07549934e-02],
       ..., 
       [  8.87102096e-01,   9.66515123e-02,   1.62463915e-02],
       [  9.52714681e-01,   4.70038511e-02,   2.81468254e-04],
       [  5.34592883e-01,   4.35178032e-01,   3.02290854e-02]])

In [25]:
test_blend_x_RFC_gmean

array([[ 0.56114519,  0.35149677,  0.08599882],
       [ 0.99784253,  0.00195752,  0.        ],
       [ 0.92250269,  0.06540104,  0.01050758],
       ..., 
       [ 0.88703631,  0.09635377,  0.01455594],
       [ 0.95267666,  0.04600396,  0.        ],
       [ 0.53411808,  0.43464971,  0.02988159]])

In [26]:
now = datetime.now()

name_train_blend = '../blend/train_blend_RFC_entropy_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_RFC_entropy_mean_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_RFC_entropy_gmean_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.55867331]


In [28]:
sub_name = '../output/sub_RFC_entropy_gmean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_RFC_gmean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)

In [29]:
def RFC_cv(max_features = 0.5, n_estimators=300, min_samples_leaf  =1):
    scores=[]
    est=RandomForestClassifier(max_features=max_features,
                               n_estimators=int(n_estimators),
                               min_samples_leaf =int(min_samples_leaf), 
                               criterion = 'gini',
                               random_state=seed,
                               n_jobs = -1
                              )
    est.fit(X_train, y_train)
    y_val_pred = est.predict_proba(X_val)
#     return cross_val_score(est,train_X,train_y, scoring = 'neg_log_loss', cv = 5)
    return -1*log_loss(y_val, y_val_pred)

In [30]:
cv_score = -1
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    score = RFC_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.3 	-0.580311808983
0.4 	-0.581610017782
0.5 	-0.588300355567
0.6 	-0.583919290736
0.7 	-0.596547574466
0.8 	-0.592115941543
0.9 	-0.58970988735
1 	-0.590757943676


In [34]:
for x in [0.1,0.2]:
    score = RFC_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.1 	-0.567717755864
0.2 	-0.573311070816


In [39]:
for x in [0.03,0.06]:
    score = RFC_cv(max_features = x)
    if score > cv_score:
        max_features = x
        cv_score = score
    print x,'\t', score

0.03 	-0.582249595103
0.06 	-0.574039720103


In [40]:
print max_features

0.1


In [36]:
for x in [1,2,4,8]:
    score = RFC_cv(max_features = max_features,min_samples_leaf = x)
    if score > cv_score:
        min_samples_leaf = x
        cv_score = score    
    print x, '\t', score

1 	-0.567717755864
2 	-0.566678139402
4 	-0.569112200834
8 	-0.575282357779
16 	-0.583597632653
32 	-0.592794220796


In [37]:
print min_samples_leaf

2


In [41]:
est = [RandomForestClassifier(max_features=max_features,
                              n_estimators=1000,
                              min_samples_leaf = min_samples_leaf,
                              random_state=seed,
                              criterion = 'gini',
                              n_jobs = -1
                             )]

(train_blend_x_RFC,
 test_blend_x_RFC_mean,
 test_blend_x_RFC_gmean,
 blend_scores_RFC,
 best_rounds_RFC) = RFC_blend(est,
                              train_X,train_y,
                              test_X,
                              10)

Blend 1 estimators for 10 folds
Model 1:
Model 1 fold 1
('Score: ', 0.54927278962183534)
Model 1 fold 1 fitting finished in 62.242s
Model 1 fold 2
('Score: ', 0.53460799445798934)
Model 1 fold 2 fitting finished in 62.976s
Model 1 fold 3
('Score: ', 0.56121626776110223)
Model 1 fold 3 fitting finished in 63.469s
Model 1 fold 4
('Score: ', 0.54123921531424113)
Model 1 fold 4 fitting finished in 64.828s
Model 1 fold 5
('Score: ', 0.56682028268092821)
Model 1 fold 5 fitting finished in 63.438s
Model 1 fold 6
('Score: ', 0.55617906749769885)
Model 1 fold 6 fitting finished in 63.752s
Model 1 fold 7
('Score: ', 0.57014254294779976)
Model 1 fold 7 fitting finished in 65.716s
Model 1 fold 8
('Score: ', 0.58148600136509343)
Model 1 fold 8 fitting finished in 72.534s
Model 1 fold 9
('Score: ', 0.57878197693189148)
Model 1 fold 9 fitting finished in 70.321s
Model 1 fold 10
('Score: ', 0.5626029691003962)
Model 1 fold 10 fitting finished in 65.271s
Score for model 1 is 0.560235
Score for blended 

In [42]:
now = datetime.now()

name_train_blend = '../blend/train_blend_RFC_gini_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_RFC_gini_mean_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_RFC_gini_gmean_BM' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_RFC,axis=0))
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,train_blend_x_RFC, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_RFC_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_RFC_gmean, delimiter=",")

[ 0.56023491]
