# W207 Forest Cover - KNN v2

## Plan

#### Feature Engineering

- More data cleaning, removing useless data columns

- Feature engineering: using only the most important features

- Feature engineering: combine some of the soil types

#### Error Analysis

- Confusion matrix: find out which types give the most problems

- Any clue from EDA?


In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

# 0. Load Data

In [2]:
data_train = pd.read_csv('train.csv')             # read training data
data_test = pd.read_csv('test.csv')

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data_train, random_state=1)

In [3]:
df_train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

#### Feature vectors

In [4]:
# to keep track of things
target = df_train.columns[-1]
all_features_ALL = df_train.columns[1:-1]

num_features = df_train.columns[1:11]
cat_features_ALL = df_train.columns[11:-1]

wild_features = df_train.columns[11:15]
soil_features_ALL = df_train.columns[15:-1]

#### Filtered features

In [5]:
# remove constant Soil_Type7 and Soil_Type15
all_features = all_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
cat_features = cat_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
soil_features = soil_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])

In [6]:
top17_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7,23,51,16,2,52,10,12]]

top10_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7]]

top5_features = all_features_ALL[[0,5,9,13,3]]

top2_features = all_features_ALL[[0,5]]

In [7]:
top17_features

Index(['Elevation', 'Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area4',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Hillshade_9am', 'Aspect', 'Hillshade_3pm', 'Hillshade_Noon',
       'Soil_Type10', 'Soil_Type38', 'Soil_Type3', 'Slope', 'Soil_Type39',
       'Wilderness_Area1', 'Wilderness_Area3'],
      dtype='object')

# 1. KNN Pipeline

- input a list of features

- separate features into numeric and categorical

- using StandardScaling on the numeric features

- combine numeric and categorical features

- PCA optional

- fit KNN

### Note:

- Adding MinMaxScaler and Normalizer doesn't seem to help

In [8]:
features = top17_features

num_features 

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points'],
      dtype='object')

In [9]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# STEP 1: Define the features  #########################################
def get_num_features(features):
    num_features = [item for item in features if 'Soil' not in item and 'Wilderness' not in item ]
    return num_features

def get_cat_features(features):
    cat_features = [item for item in features if 'Soil' in item or 'Wilderness' in item ]
    return cat_features

features = all_features

num_features = get_num_features(features)
cat_features = get_cat_features(features)

# STEP 2: Define the KNN pipeline ######################################

def select_num_features(X):
    return X[num_features]

def select_cat_features(X):
    return X[cat_features]

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

num_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_num_features, validate=False)),
    ('scale', StandardScaler())
])

cat_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_cat_features, validate=False) )    
])

fu = FeatureUnion([
    ('numeric', num_feature_pipeline),
    ('categorical', cat_feature_pipeline)
])

In [121]:
knn_num = 1

knn_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', KNeighborsClassifier(n_neighbors=knn_num))
])

# knn_pipe.fit(df_train, df_train[target])

In [11]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
knn_pipe.fit(data_train, data_train[target])

pred = knn_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top2_knn_' + str(knn_num) + '.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# Check Scores of KNN Pipeline on Kaggle 

Use all_features, top17_features, top10_features, top5_features, top2_features and test the above pipeline with knn=1,2,5,10 and score on Kaggle, following are the results:

### all_features

- knn = 1: 0.6906

- knn = 2: 0.68584

- knn = 5: 0.64546

- knn = 10: 0.61504


### top17_features

- knn = 1: 0.65998

- knn = 2: 0.66320

- knn = 5: 0.62491

- knn = 10: 0.59760


### top10_features

- knn = 1: 0.63003

- knn = 2: 0.63973

- knn = 5: 0.59013

- knn = 10: 0.55603


### top5_features

- knn = 1: 0.65710

- knn = 2: 0.65603

- knn = 5: 0.61171

- knn = 10: 0.58272


### top2_features

- knn = 1: 0.48068

- knn = 2: 0.52666

- knn = 5: 0.50603

- knn = 10: 0.50100



### Error Analysis

#### ROC curve is not applicable in this case, it is strictly restricted to binary cases

In [12]:
y_test = df_test[target]                # get correct labels
y_pred_knn = knn_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [15]:
from sklearn.metrics import f1_score, classification_report

F1_score_test = f1_score(y_test, y_pred_knn, average='weighted').round(4)
report_knn = classification_report(y_test, y_pred_knn)

# print(F1_score_test)
print(report_knn)

             precision    recall  f1-score   support

          1       1.00      1.00      1.00       552
          2       1.00      1.00      1.00       561
          3       1.00      1.00      1.00       530
          4       1.00      1.00      1.00       563
          5       1.00      1.00      1.00       513
          6       1.00      1.00      1.00       529
          7       1.00      1.00      1.00       532

avg / total       1.00      1.00      1.00      3780



#### cross-validation score

We would like to get a better estimate of the performance without contaminating the test data. Try using cross-validation to estimate the performance.

(The results seem similar)

In [14]:
from sklearn.model_selection import cross_val_score
scores_knn = cross_val_score(knn_pipe, df_train, df_train[target], cv=5)
print(scores_knn.round(4))
print(scores_knn.mean().round(4))

[0.7878 0.8075 0.7932 0.7771 0.8009]
0.7933


#### Confusion matrix

In [16]:
from sklearn.metrics import confusion_matrix, roc_curve

confusion_knn = confusion_matrix(y_test, y_pred_knn, labels=[1,2,3,4,5,6,7])
print(confusion_knn)

[[552   0   0   0   0   0   0]
 [  0 561   0   0   0   0   0]
 [  0   0 530   0   0   0   0]
 [  0   0   0 563   0   0   0]
 [  0   0   0   0 513   0   0]
 [  0   0   0   0   0 529   0]
 [  0   0   0   0   0   0 532]]


### Commnets:

- Cover_Types 4 and 7 have the best performance, from EDA, Cover_Type4 is the one live at the lowest elevation, Cover_Type7 is the one live at the highest elevation;

- There is a lot of confusing between 1 and 2: they live in similar elevations just below 7, there is also a lot of confusion between 1 and 7;

- Confusion between 6 and 3, 6 and 4 can also be explained by the closeness in elevation

# 2. Logistic Regression Pipeline

In [17]:
from sklearn.linear_model import LogisticRegression

lr_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', LogisticRegression(penalty='l2', C=2.0))
])

lr_pipe.fit(df_train, df_train[target])

y_pred_lr = lr_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [18]:
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

             precision    recall  f1-score   support

          1       0.63      0.63      0.63       552
          2       0.58      0.48      0.53       561
          3       0.61      0.56      0.59       530
          4       0.82      0.89      0.85       563
          5       0.61      0.70      0.65       513
          6       0.58      0.59      0.59       529
          7       0.87      0.86      0.87       532

avg / total       0.67      0.68      0.67      3780



#### cross-validation score

In [19]:
scores_lr = cross_val_score(lr_pipe, df_train, df_train[target], cv=5)
print(scores_lr.round(4))
print(scores_lr.mean().round(4))

[0.6693 0.6789 0.6755 0.6708 0.6614]
0.6712


#### confusion matrix

In [20]:
confusion_lr = confusion_matrix(y_test, y_pred_lr, labels=[1,2,3,4,5,6,7])
print(confusion_lr)

[[349  96   1   0  36   6  64]
 [128 269   9   0 126  24   5]
 [  0   2 297  71  15 145   0]
 [  0   0  30 503   0  30   0]
 [ 11  65  51   0 361  25   0]
 [  0  25  97  42  51 314   0]
 [ 66   5   0   0   2   0 459]]


# Check LR score on Kaggle

- all_features: 0.67 on test, 0.5603 on Kaggle

- top17_features: 0.62 on test, 0.4916 on Kaggle

- top10_features: 0.58 on test, 0.45328 on Kaggle

- top5_features: 0.53 on test, 0.42169 on Kaggle

- top2_features: 0.51 on test, 0.39383 on Kaggle

In [75]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
lr_pipe.fit(data_train, data_train[target])

pred = lr_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top5_lr_pipe.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 3. Random Forest Pipeline

### Note:

- increase the RF n_estimators from 100 to 500 improves F1 score by 0.01

In [122]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', RandomForestClassifier(n_estimators=500, n_jobs=-1, oob_score=True, random_state=121))
])

rf_pipe.fit(df_train, df_train[target])

y_pred_rf = rf_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [22]:
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

             precision    recall  f1-score   support

          1       0.78      0.74      0.76       552
          2       0.79      0.68      0.73       561
          3       0.86      0.83      0.85       530
          4       0.94      0.97      0.96       563
          5       0.88      0.94      0.91       513
          6       0.85      0.90      0.87       529
          7       0.91      0.96      0.93       532

avg / total       0.86      0.86      0.86      3780



#### cross-validation score

In [23]:
scores_rf = cross_val_score(rf_pipe, df_train, df_train[target], cv=5)
print(scores_rf.round(4))
print(scores_rf.mean().round(4))

[0.8432 0.8533 0.862  0.8522 0.8455]
0.8512


#### confusion matrix

In [24]:
confusion_rf = confusion_matrix(y_test, y_pred_rf, labels=[1,2,3,4,5,6,7])
print(confusion_rf)

[[410  79   0   0  10   3  50]
 [101 379  12   0  49  17   3]
 [  0   3 442  26   3  56   0]
 [  0   0  12 547   0   4   0]
 [  0  14   9   0 483   7   0]
 [  0   4  37   7   3 478   0]
 [ 18   2   0   0   0   0 512]]


# Check RF score on Kaggle

- all_features: 0.85 on test, **0.7549** on Kaggle

- top17_features: 0.84 on test, 0.73545 on Kaggle

- top10_features: 0.826 on test, 0.70326 on Kaggle

- top5_features: 0.7937 on test,  0.67985 on Kaggle

- top2_features: 0.6045 on test, 0.49393 on Kaggle

In [84]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
rf_pipe.fit(data_train, data_train[target])

pred = rf_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top2_rf_pipe.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 4. SVM Classifier

- simple LinearSVC(C=1, loss='hinge'): test score = 0.6325


- Polynomial Kernel SVC(kernel="poly", degree=3, coef0=1, C=5): 

- degree = 4: test score = 0.7829, Kaggle Score 0.66912
- degree = 3: test score = 0.7688 
- degree = 2: test score = 0.7481


- Gaussian RBF Kernel

- gamma=5, C=0.001: test score = 0.1452

- gamma=5, C=1000: test score = 0.6663

- gamma=0.1, C=0.001: test score = 0.1452

- gamma=0.1, C=1000: test score = 0.8116, Kaggle Score **0.72226**

In [123]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures

svm_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', LinearSVC(C=1, loss="hinge"))
])

svm_pipe2 = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', SVC(kernel="poly", degree=4, coef0=1, C=5))
])

svm_pipe3 = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', SVC(kernel="rbf", gamma=0.1, C=1000))
])


svm_pipe2.fit(df_train, df_train[target])

y_pred_svm = svm_pipe2.predict(df_test)  # get predicted labels

In [80]:
report_svm = classification_report(y_test, y_pred_svm)
print(report_svm)

             precision    recall  f1-score   support

          1       0.71      0.68      0.70       552
          2       0.72      0.59      0.65       561
          3       0.73      0.69      0.71       530
          4       0.88      0.96      0.92       563
          5       0.79      0.89      0.83       513
          6       0.75      0.76      0.75       529
          7       0.89      0.93      0.91       532

avg / total       0.78      0.78      0.78      3780



In [79]:
scores_svm = cross_val_score(svm_pipe2, df_train, df_train[target], cv=5)
print(scores_svm.round(4))
print(scores_svm.mean().round(4))

[0.7763 0.7859 0.7945 0.7798 0.7779]
0.7829


In [74]:
confusion_svm = confusion_matrix(y_test, y_pred_svm, labels=[1,2,3,4,5,6,7])
print(confusion_svm)

[[404 103   0   0   6   1  38]
 [140 346   7   0  49  14   5]
 [  0  12 421  21   6  70   0]
 [  0   1  20 526   0  16   0]
 [  8  17  11   0 473   4   0]
 [  0  11  58   6   6 448   0]
 [ 21   7   0   0   0   0 504]]


In [81]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
svm_pipe2.fit(data_train, data_train[target])

pred = svm_pipe2.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_polynomial_svm_deg4.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 5. AdaBoost Classifier

- max_depth=6, n_estimators=200, learning_rate=0.5, test_score 0.74
- max_depth=6, n_estimators=500, learning_rate=0.5, test_score 0.76
- max_depth=10, n_estimators=500, learning_rate=0.5, test_score 0.8406, Kaggle score 0.75461
- max_depth=15, n_estimators=500, learning_rate=0.5, test_score 0.854,  Kaggle score **0.77099**
- max_depth=20, n_estimators=500, learning_rate=0.5, test_score 0.8549, Kaggle score 0.76533

#### max=15, n_estimators=500, learning rate =0.854 seems to be the best

In [124]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', AdaBoostClassifier(DecisionTreeClassifier(max_depth=15), 
                                   n_estimators=500, algorithm="SAMME.R", learning_rate=0.5))
])
    
ada_pipe.fit(df_train, df_train[target])

y_pred_ada = ada_pipe.predict(df_test)  # get predicted labels   

In [116]:
report_ada = classification_report(y_test, y_pred_ada)
print(report_ada)

             precision    recall  f1-score   support

          1       0.75      0.77      0.76       552
          2       0.77      0.68      0.72       561
          3       0.88      0.87      0.87       530
          4       0.96      0.97      0.96       563
          5       0.89      0.93      0.91       513
          6       0.88      0.92      0.90       529
          7       0.93      0.95      0.94       532

avg / total       0.86      0.87      0.87      3780



In [117]:
scores_ada = cross_val_score(ada_pipe, df_train, df_train[target], cv=5)

print(scores_ada.round(4))
print(scores_ada.mean().round(4))

[0.8507 0.8604 0.8629 0.8526 0.8481]
0.8549


In [118]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
ada_pipe.fit(data_train, data_train[target])

pred = ada_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_ada_maxdepth20_n500.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 4. Gradient Boosting Classifier

## somehow not working

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(subsample=.7, n_estimators=100, max_depth=3, 
                                learning_rate=0.01,  random_state=3)

gb.fit(df_train, df_train[target])

y_pred_gb = gb.predict(df_test)  # get predicted labels

#### F1 score

In [26]:
report_gb = classification_report(y_test, y_pred_gb)
print(report_gb)

             precision    recall  f1-score   support

          1       1.00      1.00      1.00       552
          2       1.00      1.00      1.00       561
          3       1.00      1.00      1.00       530
          4       1.00      1.00      1.00       563
          5       1.00      1.00      1.00       513
          6       1.00      1.00      1.00       529
          7       1.00      1.00      1.00       532

avg / total       1.00      1.00      1.00      3780



#### cross-validation score

In [27]:
scores_gb = cross_val_score(gb, df_train, df_train[target], cv=5)
print(scores_gb.round(4))
print(scores_gb.mean().round(4))

[1. 1. 1. 1. 1.]
1.0


#### confusion matrix

In [28]:
confusion_gb = confusion_matrix(y_test, y_pred_gb, labels=[1,2,3,4,5,6,7])
print(confusion_gb)

[[552   0   0   0   0   0   0]
 [  0 561   0   0   0   0   0]
 [  0   0 530   0   0   0   0]
 [  0   0   0 563   0   0   0]
 [  0   0   0   0 513   0   0]
 [  0   0   0   0   0 529   0]
 [  0   0   0   0   0   0 532]]


In [None]:
parameter_grid = {
    'max_depth': range(1,6),
    'learning_rate': [.01, .05, .1],
    'max_features': [2, 5, 'auto']
}

param_searcher = GridSearchCV(gb, parameter_grid, cv=5)
param_searcher.fit(df_train, df_train[target])

In [None]:
param_searcher.best_params_

In [None]:
param_searcher.best_score_

In [None]:
GradientBoostingClassifier?

# 5. Voting Classifier

In [130]:
from sklearn.ensemble import VotingClassifier

vote_pipe = VotingClassifier(estimators =[('svm', svm_pipe), ('rf', rf_pipe), ('ada', ada_pipe)], 
                            voting = 'hard')

vote_pipe.fit(df_train, df_train[target])

y_pred_vote = vote_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [131]:
report_vote = classification_report(y_test, y_pred_vote)
print(report_vote)

             precision    recall  f1-score   support

          1       0.74      0.76      0.75       552
          2       0.79      0.64      0.70       561
          3       0.88      0.83      0.85       530
          4       0.95      0.98      0.96       563
          5       0.88      0.93      0.91       513
          6       0.84      0.91      0.87       529
          7       0.91      0.96      0.94       532

avg / total       0.85      0.86      0.85      3780



#### cross-validation score

In [132]:
scores_vote = cross_val_score(vote_pipe, df_train, df_train[target], cv=5)
print(scores_vote.round(4))
print(scores_vote.mean().round(4))

[0.8454 0.8493 0.8567 0.8473 0.8442]
0.8486


#### confusion matrix

In [36]:
confusion_vote = confusion_matrix(y_test, y_pred_vote, labels=[1,2,3,4,5,6,7])
print(confusion_vote)

[[419  75   0   0  10   2  46]
 [113 362   9   0  62  12   3]
 [  0   4 411  36   4  75   0]
 [  0   0  14 542   0   7   0]
 [  2  15  10   0 481   5   0]
 [  0   4  44  13   7 461   0]
 [ 19   2   0   0   0   0 511]]


# Check Voting CLF score on Kaggle

### all_features, (knn_pipe, lr_pipe, rf_pipe) for the vote_pipe

- voting = "hard": test score is 0.831, Kaggle Score is 0.72975

- voting = "soft": test sore is 0.8049, Kaggle Score is 0.69869 

### all_features, (knn_pipe, svm_pipe, rf_pipe, ada_pipe) for the vote_pipe

- voting = "hard": test score is 0.8482, Kaggle Score is 0.74582

### all_features, (svm_pipe, rf_pipe, ada_pipe) for the vote_pipe

- voting = "hard": test score is 0.8486, Kaggle Score is 0.74911

In [133]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
vote_pipe.fit(data_train, data_train[target])

pred = vote_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_all_vote_pipe_svm_rf_ada_hard.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)