In [4]:
import pandas as pd
import numpy as np

import sklearn

In [23]:
test_df = pd.read_csv('../test.csv')
x_test = test_df.values

train_df = pd.read_csv('../train.csv', delimiter=',')

y_train = train_df.values[:,0]
x_train = train_df.values[:,1:len(train_df.columns)].astype(float)

In [25]:
from sklearn.preprocessing import LabelEncoder

def encode(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    y = encoder.transform(y)
    return y, encoder
    
y_train, y_encoder = encode(y_train)
print(y_train.shape)
print(y_encoder.classes_)
y_train.view()

(178,)
['cold' 'dusk' 'flowers' 'impressions' 'oval' 'scene' 'trees' 'water']


array([4, 2, 0, 5, 7, 3, 3, 3, 0, 3, 6, 1, 7, 6, 6, 7, 7, 2, 5, 1, 4, 1,
       2, 7, 2, 6, 2, 2, 7, 0, 0, 6, 6, 1, 2, 6, 5, 5, 0, 1, 1, 2, 0, 5,
       5, 3, 7, 7, 2, 0, 7, 1, 7, 7, 1, 2, 3, 3, 3, 7, 5, 5, 1, 7, 3, 5,
       2, 5, 0, 7, 5, 1, 1, 5, 7, 0, 5, 5, 3, 3, 0, 0, 6, 3, 5, 2, 1, 1,
       2, 0, 4, 7, 0, 4, 0, 7, 5, 5, 1, 6, 1, 2, 7, 1, 3, 3, 0, 0, 1, 2,
       6, 2, 1, 7, 0, 0, 5, 2, 0, 1, 1, 7, 1, 1, 0, 4, 0, 1, 0, 5, 7, 6,
       1, 2, 5, 5, 1, 6, 6, 1, 7, 7, 6, 7, 5, 3, 7, 2, 1, 7, 5, 3, 5, 5,
       6, 6, 0, 5, 7, 1, 7, 1, 2, 7, 6, 7, 5, 7, 7, 5, 7, 3, 2, 2, 7, 1,
       7, 6])

In [43]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

In [44]:
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(x_train.shape[0], NFOLDS)

# Class to extend the Sklearn Regressor
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [45]:
def get_oof(clf, x_train, y_train, x_test):
    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    
    oof_train = np.zeros((num_train,))
    oof_test = np.zeros((num_test,))
    oof_test_skf = np.empty((NFOLDS, num_test))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [46]:
# Put in our parameters for said regressors
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 50,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':50,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 50,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 50,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [47]:
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [48]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [53]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [67]:
rf_ensemble = RandomForestClassifier().fit(x_train, y_train)

In [70]:
id_data = np.r_[1:64]
class_data = y_encoder.inverse_transform(rf_ensemble.predict(x_test))

  if diff:


In [72]:
test_df = pd.DataFrame(data={'id': id_data, 'class': class_data}, columns=['id', 'class'])
test_df.to_csv('rf_ensemble.csv', index=False)