# Gradient Boosting  + 'features'

## Train set

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingClassifier
import cPickle
import warnings
warnings.filterwarnings("ignore")
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from time import time

pd.options.display.max_columns = 160

In [2]:
# all_df = pd.read_json('train.json')
all_df = pd.read_csv('train_feats_max_desc.csv')

In [3]:
all_df.shape

(49352, 83)

In [4]:
all_df.head(2)

Unnamed: 0,index,bathrooms,bedrooms,created,interest_level,latitude,listing_id,longitude,price,num_photos,elevator,hardwood floors,doorman,dishwasher,laundry in building,no fee,fitness center,pre-war,roof deck,outdoor space,dining room,high speed internet,balcony,swimming pool,new construction,terrace,exclusive,loft,garden/patio,wheelchair access,fireplace,simplex,lowrise,garage,reduced fee,furnished,multi-level,high ceilings,private outdoor space,parking space,live in super,renovated,green building,storage,stainless steel appliances,light,granite kitchen,bike room,exposed brick,marble bath,pets on approval,walk in closet(s),valet,subway,residents lounge,highrise,short term allowed,childrens playroom,no pets,duplex,actual apt. photos,central a/c,view,live/work,virtual doorman,sauna,microwave,shares ok,post-war,brownstone,business center,sublet,midrise,pet friendly,guarantors accepted,attended lobby,package room,video intercom,community recreation facilities,skylight,flex-2,cable/satellite tv,all utilities included
0,10,1.5,3,1466755000.0,medium,40.7145,7211212,-73.9425,3000,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10000,1.0,2,1465734000.0,low,40.7947,7150865,-73.9667,5465,11,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [5]:
x_train, x_val, y_train, y_val = train_test_split(all_df.drop(['interest_level'], 1),all_df[['interest_level']], test_size=0.2, random_state=42)


In [6]:
cat_feats = cPickle.load(open('cat_feats.p', 'rb'))

for col in ['interest_level']:
    y_train[col] = y_train[col].astype('category')
    y_val[col] = y_val[col].astype('category')
    
for col in cat_feats:
    x_train[col] = x_train[col].astype('category')
    x_val[col] = x_val[col].astype('category')

In [7]:
drop_list = [u'listing_id', 'index']
x_train_small = x_train.drop(drop_list,1)

x_val_small = x_val.drop(drop_list,1)

In [8]:
# low_pvalues_cols = cPickle.load(open('low_pvalues_cols.p', 'rb'))
x_train_best = x_train_small#[low_pvalues_cols]
x_val_best = x_val_small#[low_pvalues_cols]
x_train_best.shape
# x_train_best.head(2)

(39481, 80)

In [9]:
model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=4, max_features=0.3, random_state=0)
# n_estimator: 500; learning_rate: 0.2; max_features: 0.4; max_depth: 3
model = model.fit(x_train_best, y_train)

KeyboardInterrupt: 

In [28]:
predicted_train = pd.DataFrame(model.predict_proba(x_train_best))
# predicted = model.predict_proba(x)
predicted_train.columns = ['high', 'low', 'medium']
predicted_train.head()
# predicted

Unnamed: 0,high,low,medium
0,0.005865,0.918649,0.075486
1,0.001084,0.977797,0.021118
2,0.052934,0.585058,0.362009
3,0.051329,0.711687,0.236984
4,0.014158,0.906652,0.07919


In [29]:
log_loss_train = log_loss(y_train, predicted_train.as_matrix())
log_loss_train

0.49517175136735192

In [30]:
predicted_val = pd.DataFrame(model.predict_proba(x_val_best))
# predicted = model.predict_proba(x)
predicted_val.columns = ['high', 'low', 'medium']
predicted_val.head()
# predicted

Unnamed: 0,high,low,medium
0,0.068354,0.553789,0.377858
1,0.052149,0.663396,0.284455
2,0.0354,0.727591,0.23701
3,0.063397,0.674106,0.262497
4,0.008215,0.908942,0.082843


In [31]:
log_loss_val = log_loss(y_val, predicted_val.as_matrix())
log_loss_val

0.58950330718593591

In [118]:
a = zip(x_train_best.columns, list(model.feature_importances_))
a.sort(key = lambda t: t[1], reverse=True)
# a

In [10]:
# bad_feats_gb = [i[0] for i in a[-20:]]
# cPickle.dump(bad_feats_gb, open('bad_feats_gb.p', 'wb')) 
bad_feats_gb = cPickle.load(open('bad_feats_gb.p', 'rb'))
# bad_feats_gb

In [11]:
x_train_best.drop(bad_feats_gb, axis=1, inplace=True)
x_val_best.drop(bad_feats_gb, axis=1, inplace=True)

In [12]:
x_train_best.shape, x_val_best.shape

((39481, 60), (9871, 60))

## Randomized Search

In [13]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
log_scoring

make_scorer(log_loss, greater_is_better=False, needs_proba=True)

In [23]:
clf = GradientBoostingClassifier()
# specify parameters and distributions to sample from
param_dist = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8],
              "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
              "n_estimators": [300, 400, 500, 600, 700, 800],
              "learning_rate": [round(i,2) for i in np.arange(0.05,0.2,0.01)]
             "min_samples_split": [None, 100, 300, 400, 500, 600, 800, 1000]
             "min_samples_leaf": [None, 20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, n_jobs=-1, scoring=log_scoring, verbose=1)

In [24]:
start = time()
search = random_search.fit(x_train_best, y_train['interest_level'])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 36.8min finished


RandomizedSearchCV took 2278.45 seconds for 50 candidates parameter settings.


In [25]:
search.grid_scores_

[mean: -0.60351, std: 0.00468, params: {'n_estimators': 100, 'max_features': 0.5, 'learning_rate': 0.4, 'max_depth': 3},
 mean: -0.60840, std: 0.00507, params: {'n_estimators': 500, 'max_features': 'sqrt', 'learning_rate': 0.4, 'max_depth': 3},
 mean: -0.60283, std: 0.00531, params: {'n_estimators': 300, 'max_features': 0.5, 'learning_rate': 0.3, 'max_depth': 2},
 mean: -0.66456, std: 0.00318, params: {'n_estimators': 100, 'max_features': 0.4, 'learning_rate': 0.2, 'max_depth': 1},
 mean: -0.62078, std: 0.00342, params: {'n_estimators': 100, 'max_features': 'sqrt', 'learning_rate': 0.2, 'max_depth': 4},
 mean: -0.61199, std: 0.00238, params: {'n_estimators': 100, 'max_features': 0.2, 'learning_rate': 0.3, 'max_depth': 3},
 mean: -0.62395, std: 0.00457, params: {'n_estimators': 100, 'max_features': 0.2, 'learning_rate': 0.2, 'max_depth': 3},
 mean: -0.59802, std: 0.00361, params: {'n_estimators': 300, 'max_features': 0.3, 'learning_rate': 0.3, 'max_depth': 3},
 mean: -0.64229, std: 0.00

In [26]:
search.best_score_, search.best_estimator_, search.best_params_

(-0.5933256136564423,
 GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=4,
               max_features=0.3, max_leaf_nodes=None,
               min_impurity_split=1e-07, min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0,
               n_estimators=500, presort='auto', random_state=None,
               subsample=1.0, verbose=0, warm_start=False),
 {'learning_rate': 0.1,
  'max_depth': 4,
  'max_features': 0.3,
  'n_estimators': 500})

In [149]:
x_train_best.shape, y_train.shape

((39481, 60), (39481, 1))

In [203]:
help(search)

Help on RandomizedSearchCV in module sklearn.grid_search object:

class RandomizedSearchCV(BaseSearchCV)
 |  Randomized search on hyper parameters.
 |  
 |  .. deprecated:: 0.18
 |      This module will be removed in 0.20.
 |      Use :class:`sklearn.model_selection.RandomizedSearchCV` instead.
 |  
 |  RandomizedSearchCV implements a "fit" and a "score" method.
 |  It also implements "predict", "predict_proba", "decision_function",
 |  "transform" and "inverse_transform" if they are implemented in the
 |  estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated search over parameter settings.
 |  
 |  In contrast to GridSearchCV, not all parameter values are tried out, but
 |  rather a fixed number of parameter settings is sampled from the specified
 |  distributions. The number of parameter settings that are tried is
 |  given by n_iter.
 |  
 |  If all parameters are presented as a list,
 |  sampling without replacemen

## Cross Validation

In [152]:
def get_cv_loss(ne, lr, md, mf):
    kf = KFold(x_train_best.shape[0], n_folds=5, random_state=2017)
    loss_tr = []
    loss_ts = []
    for train_index, test_index in kf:
        x_tr = x_train_best.reset_index().loc[train_index].set_index(['index'])
        y_tr =  y_train.reset_index().loc[train_index].set_index(['index'])
        x_ts = x_train_best.reset_index().loc[test_index].set_index(['index'])
        y_ts = y_train.reset_index().loc[test_index].set_index(['index'])

        model = GradientBoostingClassifier(n_estimators=ne, learning_rate=lr, max_depth=md, max_features=mf, random_state=20)

        model = model.fit(x_tr, y_tr)

        predicted_ts = pd.DataFrame(model.predict_proba(x_ts))
        predicted_ts.columns = ['high', 'low', 'medium']
        log_loss_ts = log_loss(y_ts, predicted_ts.as_matrix())
        loss_ts.append(log_loss_ts)
    
    print 'CV loss:', np.mean(loss_ts)
    predicted_val = pd.DataFrame(model.predict_proba(x_val_best))
    predicted_val.columns = ['high', 'low', 'medium']
    log_loss_val = log_loss(y_val, predicted_val.as_matrix())
    print 'Val loss:', log_loss_val
    return np.mean(loss_ts)

In [153]:
n_estimators = [500]
learning_rates = [0.4, 0.6, 0.8, 1]
max_features = ['sqrt',0.2, 0.4, 0.8]
max_depth = [1, 3, 5]#, 12, 14, 16, 18, 20]
best_loss = 100
best_params = ''
count = 50
for ne in n_estimators:
    for lr in learning_rates:
        for md in max_depth:
            for mf in max_features:
                print count
                count += 1
                params = 'n_estimator: ' + str(ne) + '; learning_rate: ' + str(lr) + '; max_features: ' + str(mf) + '; max_depth: ' + str(md)
                print params
    #             model = RandomForestClassifier(n_estimators=500, max_features=mf, max_depth=md, min_samples_split=mss, random_state=0)
    #             model = model.fit(x_train_best, y_train)
    #             predicted_train = pd.DataFrame(model.predict_proba(x_train_best))
    #             predicted_train.columns = ['high', 'low', 'medium']
    #             log_loss_train = log_loss(y_train, predicted_train.as_matrix())
    #             print 'Train loss:', log_loss_train
    #             predicted_val = pd.DataFrame(model.predict_proba(x_val_best))
    #             predicted_val.columns = ['high', 'low', 'medium']
    #             log_loss_val = log_loss(y_val, predicted_val.as_matrix())
    #             print 'Val loss:', log_loss_val
    #             if log_loss_val < best_loss:
    #                 best_loss = log_loss_val
    #                 best_params = params
                cv_loss = get_cv_loss(ne, lr, md, mf)
                if cv_loss < best_loss:
                    best_loss = cv_loss
                    best_params = params   

print 'best_loss:', best_loss
print 'best_params:', best_params
# best_loss: 0.638016960424
# best_params: max_features: 0.3; max_depth: 18; min_samples_split: 20

50
n_estimator: 500; learning_rate: 0.4; max_features: sqrt; max_depth: 1
CV loss: 0.625799776246
Val loss: 0.628988463414
51
n_estimator: 500; learning_rate: 0.4; max_features: 0.2; max_depth: 1
CV loss: 0.624416750049
Val loss: 0.622868663987
52
n_estimator: 500; learning_rate: 0.4; max_features: 0.4; max_depth: 1
CV loss: 0.624925251005
Val loss: 0.620767325559
53
n_estimator: 500; learning_rate: 0.4; max_features: 0.8; max_depth: 1
CV loss: 0.626268963256
Val loss: 0.620891598599
54
n_estimator: 500; learning_rate: 0.4; max_features: sqrt; max_depth: 3


KeyboardInterrupt: 



## Competition Test Set

In [32]:
test_df = pd.read_csv('test_feats_max_desc.csv')

In [33]:
test_df.head(2)

Unnamed: 0,index,bathrooms,bedrooms,created,latitude,listing_id,longitude,price,num_photos,elevator,hardwood floors,doorman,dishwasher,laundry in building,no fee,fitness center,pre-war,roof deck,outdoor space,dining room,high speed internet,balcony,swimming pool,new construction,terrace,exclusive,loft,garden/patio,wheelchair access,fireplace,simplex,lowrise,garage,reduced fee,furnished,multi-level,high ceilings,private outdoor space,parking space,live in super,renovated,green building,storage,stainless steel appliances,light,granite kitchen,bike room,exposed brick,marble bath,pets on approval,walk in closet(s),valet,subway,residents lounge,highrise,short term allowed,childrens playroom,no pets,duplex,actual apt. photos,central a/c,view,live/work,virtual doorman,sauna,microwave,shares ok,post-war,brownstone,business center,sublet,midrise,pet friendly,guarantors accepted,attended lobby,package room,video intercom,community recreation facilities,skylight,flex-2,cable/satellite tv,all utilities included
0,0,1.0,1,1465623000.0,40.7185,7142618,-73.9865,2950,8,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1.0,2,1466750000.0,40.7278,7210040,-74.0,2850,3,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [34]:
x_test = test_df[x_train_best.columns]
x_test.shape

(74659, 60)

In [35]:
pred_x = pd.DataFrame(model.predict_proba(x_test))
pred_x.columns = ['high', 'low', 'medium']
pred_x.head()

Unnamed: 0,high,low,medium
0,0.048373,0.618934,0.332693
1,0.159526,0.427932,0.412542
2,0.02481,0.853985,0.121205
3,0.029124,0.641466,0.32941
4,0.010049,0.890598,0.099354


In [36]:
subm = pd.merge(test_df[['listing_id']].reset_index(), pred_x.reset_index(), left_index=True, right_index=True)

In [37]:
subm = subm[['listing_id', 'high', 'medium', 'low']]

In [38]:
subm.shape
# (74659, 4)

(74659, 4)

In [39]:
subm.to_csv('Submission_GB_auto_tune2.csv', index=None)

In [136]:
model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=3,
              max_features=0.4, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)