# Ensemble Learning with Iris

Analytics Vidhya threw open a data hackathon for the 8th and 9th August weekend.
Details can be found [here](http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838)
This is my attempt to generate some benchmarks - with no(to minimal) feature engineering

In [1]:
#Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

In [2]:
#Read the train and test datasets
# train = pd.read_csv("Data/Train.csv")
# test = pd.read_csv("Data/Test.csv")
iris = load_iris()
train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=415)

In [3]:
print train_data.shape, test_data.shape

(90, 4) (60, 4)


In [4]:
xgtrain = train_data
xgtest = test_data
label_pv = train_target


# First Approach - Regression


##First model: xgboost

In [5]:
params = {}
params["min_child_weight"] = 3
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 1
params["silent"] = 0
params["max_depth"] = 4
params["nthread"] = 6
params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.005
params["base_score"] = 0.1
params["eval_metric"] = "auc"
params["seed"] = 123

In [6]:
plst = list(params.items())
num_rounds = 600

In [7]:
xgtrain_pv = xgb.DMatrix(xgtrain, label=label_pv)
watchlist = [(xgtrain_pv, 'train')]
model_1_xgboost = xgb.train(plst, xgtrain_pv, num_rounds)
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(xgtest))

In [8]:
model_1_predict

array([ 0.13755664,  1.53150272,  0.94632232,  1.73526251,  1.51423669,
        0.13536169,  1.78053868,  0.98128557,  1.00947142,  1.20258307,
        0.13536169,  0.12651075,  1.56973004,  0.13536169,  1.17833805,
        0.81763804,  0.81763804,  0.12651075,  0.84986347,  0.13536169,
        1.11069334,  1.78938961,  0.84986347,  0.2625584 ,  0.12651075,
        0.84986347,  1.78053868,  0.1413871 ,  0.82508171,  0.12651075,
        1.78053868,  0.95534474,  0.12651075,  0.82459211,  0.13536169,
        0.81763804,  0.84986347,  0.12651075,  1.78938961,  0.12651075,
        1.78938961,  0.13536169,  1.01883912,  0.12651075,  0.13536169,
        1.7523526 ,  0.1347426 ,  1.47047174,  0.2625584 ,  1.73526251,
        0.12651075,  1.78053868,  1.50741839,  0.12589167,  1.78709161,
        0.12651075,  0.13253616,  1.78709161,  1.78938961,  0.93506271], dtype=float32)

In [9]:
model_1_predict = np.array([int(round(item)) for item in model_1_predict])
model_1_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 1, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [10]:
test_target

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 2, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 1, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [11]:
print(np.sum(model_1_predict!=test_target))

1


##Model 2: RandomForest

In [12]:
model_2_rf = RandomForestRegressor(n_estimators=400, max_depth=8, oob_score=True, n_jobs=6, random_state=123)
model_2_rf.fit(xgtrain, label_pv)
model_2_rf.oob_score_
model_2_predict = model_2_rf.predict(xgtest)

In [13]:
model_2_predict = np.array([int(round(item)) for item in model_2_predict])
model_2_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [14]:
print(np.sum(model_2_predict!=test_target))

2


##Model 3: ExtraTrees

In [15]:
model_3_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_3_et.fit(xgtrain, label_pv)
model_3_predict = model_3_et.predict(xgtest)

[Parallel(n_jobs=6)]: Done   1 out of 750 | elapsed:    0.1s remaining:   39.8s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.5s finished
[Parallel(n_jobs=6)]: Done   1 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.3s finished


In [17]:
model_3_predict = np.array([int(round(item)) for item in model_3_predict])
model_3_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [18]:
print(np.sum(model_3_predict!=test_target))

2


##Model 7: Ridge

In [19]:
model_7_ridge = linear_model.Ridge(alpha=0.01)
model_7_ridge.fit(xgtrain, label_pv)
model_7_predict = model_7_ridge.predict(xgtest)

In [20]:
model_7_predict = np.array([int(round(item)) for item in model_7_predict])
model_7_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [21]:
print(np.sum(model_7_predict!=test_target))

2


In [None]:
#rmse of 505.99

##Model 8: Lasso

In [26]:
model_8_lasso = linear_model.Lasso()
model_8_lasso.fit(xgtrain, label_pv)
model_8_predict = model_8_lasso.predict(xgtest)

In [28]:
model_8_predict = np.array([int(round(item)) for item in model_8_predict])
model_8_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [29]:
print(np.sum(model_8_predict!=test_target))

42


In [51]:
#rmse of 498

##Model 9: SGD Regressor

In [30]:
model_9_sgd = linear_model.SGDRegressor()
model_9_sgd.fit(xgtrain, label_pv)
model_9_predict = model_9_sgd.predict(xgtest)

model_9_predict = np.array([int(round(item)) for item in model_9_predict])
model_9_predict

array([0, 2, 1, 2, 1, 0, 2, 1, 1, 2, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [31]:
print(np.sum(model_9_predict!=test_target))

3


In [None]:
#Omit this model. Horrible output

##Model 10: Perceptron

In [32]:
model_10_perceptron = linear_model.Perceptron(penalty="l1", n_iter=250, random_state=123, n_jobs=6)
model_10_perceptron.fit(xgtrain, label_pv)
model_10_predict = model_10_perceptron.predict(xgtest)

model_10_predict = np.array([int(round(item)) for item in model_10_predict])
model_10_predict

array([1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 0, 0, 2, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 2,
       1, 2, 0, 2, 0, 2, 1, 0, 2, 1, 0, 2, 2, 1])

In [33]:
print(np.sum(model_10_predict!=test_target))

14


In [None]:
#Omit this model. Takes a long time to run

##Model 11: Kernel Ridge

In [34]:
model_11_kr = KernelRidge(alpha=0.01, kernel='rbf', gamma=0.1)
model_11_kr.fit(xgtrain, label_pv)
model_11_predict = model_11_kr.predict(xgtest)

model_11_predict = np.array([int(round(item)) for item in model_11_predict])
model_11_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 2, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [35]:
print(np.sum(model_11_predict!=test_target))

1


In [None]:
#rmse: 753

#Model 12: SVR

In [36]:
model_12_svr = KernelRidge(alpha=0.01, kernel='rbf')
model_12_svr.fit(xgtrain, label_pv)
model_12_predict = model_12_svr.predict(xgtest)

model_12_predict = np.array([int(round(item)) for item in model_12_predict])
model_12_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 2, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [37]:
print(np.sum(model_11_predict!=test_target))

1


In [None]:
#Not going to run this - due to time constraints. Kernel ridge itself took a long time to run.

#Stacking

Stacking the above 9 models to see if it produces better output

In [42]:
feature_1 = model_1_xgboost.predict(xgb.DMatrix(xgtrain))
feature_2 = model_2_rf.predict(xgtrain)
feature_3 = model_3_et.predict(xgtrain)
feature_7 = model_7_ridge.predict(xgtrain)
feature_8 = model_8_lasso.predict(xgtrain)
feature_9 = model_9_sgd.predict(xgtrain)
feature_10 = model_10_perceptron.predict(xgtrain)
feature_11 = model_11_kr.predict(xgtrain)
feature_12 = model_12_svr.predict(xgtrain)


[Parallel(n_jobs=6)]: Done   1 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.3s finished


In [43]:
#Train features
train_features = np.vstack((feature_1, feature_2, feature_3, feature_7, feature_8, feature_9, feature_10, feature_11, feature_12))
#Need to transpose features
train_features = train_features.T

In [45]:
#Test features
test_features = np.vstack((model_1_predict, model_2_predict, model_3_predict, model_7_predict, model_8_predict, model_9_predict, model_10_predict, model_11_predict, model_12_predict ))
#Need to transpose features
test_features = test_features.T

In [57]:
np.vstack((1,2,3,4)).T

array([[1, 2, 3, 4]])

##First stack model: Extra Trees

In [46]:
model_13_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_13_et.fit(train_features, label_pv)
model_13_predict = model_13_et.predict(test_features)

[Parallel(n_jobs=6)]: Done   1 out of  42 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.4s finished
[Parallel(n_jobs=6)]: Done   1 out of  19 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.3s finished


In [47]:
model_13_predict = np.array([int(round(item)) for item in model_13_predict])
model_13_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [48]:
print(np.sum(model_13_predict!=test_target))

2


In [None]:
#rmse 696

##Second stack model: Ridge

In [49]:
model_14_ridge = linear_model.Ridge(alpha=0.01)
model_14_ridge.fit(train_features, label_pv)
model_14_predict = model_14_ridge.predict(test_features)
model_14_predict[model_14_predict<0] = 0

model_14_predict = np.array([int(round(item)) for item in model_14_predict])
model_14_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [50]:
print(np.sum(model_14_predict!=test_target))

2


In [None]:
#rmse 740

##Third stack model: Model 3 (Extra Trees) + output features of all the 9 models

In [51]:
#Concatenating features + train/test dataset
xgtrain3 = np.hstack((xgtrain, train_features))
xgtest3  = np.hstack((xgtest, test_features))

print xgtrain3.shape, xgtest3.shape

(90, 13) (60, 13)


In [52]:
model_15_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_15_et.fit(xgtrain3, label_pv)
model_15_predict = model_15_et.predict(xgtest3)

model_15_predict = np.array([int(round(item)) for item in model_15_predict])
model_15_predict

[Parallel(n_jobs=6)]: Done   1 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.5s finished
[Parallel(n_jobs=6)]: Done   1 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done 750 out of 750 | elapsed:    0.4s finished


array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [53]:
print(np.sum(model_15_predict!=test_target))

2


In [None]:
#rmse 693

##Fourth stack model: Ridge

In [54]:
model_16_ridge = linear_model.Ridge(alpha=0.01)
model_16_ridge.fit(xgtrain3, label_pv)
model_16_predict = model_16_ridge.predict(xgtest3)
model_16_predict[model_16_predict<0] = 0

model_16_predict = np.array([int(round(item)) for item in model_16_predict])
model_16_predict

array([0, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1,
       0, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2,
       0, 3, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1])

In [55]:
print(np.sum(model_16_predict!=test_target))

2


In [None]:
#rmse 740