# W207 Forest Cover - KNN v2

## Plan

#### Feature Engineering

- More data cleaning, removing useless data columns

- Feature engineering: using only the most important features

- Feature engineering: combine some of the soil types

#### Error Analysis

- Confusion matrix: find out which types give the most problems

- Any clue from EDA?


In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

# 0. Load Data

In [2]:
data_train = pd.read_csv('train.csv')             # read training data
data_test = pd.read_csv('test.csv')

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data_train, random_state=1)

In [3]:
df_train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

#### Feature vectors

In [4]:
# to keep track of things
target = df_train.columns[-1]
all_features_ALL = df_train.columns[1:-1]

num_features = df_train.columns[1:11]
cat_features_ALL = df_train.columns[11:-1]

wild_features = df_train.columns[11:15]
soil_features_ALL = df_train.columns[15:-1]

#### Filtered features

In [5]:
# remove constant Soil_Type7 and Soil_Type15
all_features = all_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
cat_features = cat_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
soil_features = soil_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])

In [6]:
top17_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7,23,51,16,2,52,10,12]]

top10_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7]]

top5_features = all_features_ALL[[0,5,9,13,3]]

top2_features = all_features_ALL[[0,5]]

In [7]:
top17_features

Index(['Elevation', 'Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area4',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Hillshade_9am', 'Aspect', 'Hillshade_3pm', 'Hillshade_Noon',
       'Soil_Type10', 'Soil_Type38', 'Soil_Type3', 'Slope', 'Soil_Type39',
       'Wilderness_Area1', 'Wilderness_Area3'],
      dtype='object')

# 1. KNN Pipeline

- input a list of features

- separate features into numeric and categorical

- using StandardScaling on the numeric features

- combine numeric and categorical features

- PCA optional

- fit KNN

### Note:

- Adding MinMaxScaler and Normalizer doesn't seem to help

In [15]:
features = top17_features

num_features 

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points']

In [80]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# STEP 1: Define the features  #########################################
def get_num_features(features):
    num_features = [item for item in features if 'Soil' not in item and 'Wilderness' not in item ]
    return num_features

def get_cat_features(features):
    cat_features = [item for item in features if 'Soil' in item or 'Wilderness' in item ]
    return cat_features

features = top2_features

num_features = get_num_features(features)
cat_features = get_cat_features(features)

# STEP 2: Define the KNN pipeline ######################################

def select_num_features(X):
    return X[num_features]

def select_cat_features(X):
    return X[cat_features]

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

num_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_num_features, validate=False)),
    ('scale', StandardScaler())
])

cat_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_cat_features, validate=False) )    
])

fu = FeatureUnion([
    ('numeric', num_feature_pipeline),
    ('categorical', cat_feature_pipeline)
])

In [17]:
knn_num = 10

knn_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', KNeighborsClassifier(n_neighbors=knn_num))
])

# knn_pipe.fit(df_train, df_train[target])

In [None]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
knn_pipe.fit(data_train, data_train[target])

pred = knn_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top2_knn_' + str(knn_num) + '.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# Check Scores of KNN Pipeline on Kaggle 

Use all_features, top17_features, top10_features, top5_features, top2_features and test the above pipeline with knn=1,2,5,10 and score on Kaggle, following are the results:

### all_features

- knn = 1: 0.6906

- knn = 2: 0.68584

- knn = 5: 0.64546

- knn = 10: 0.61504


### top17_features

- knn = 1: 0.65998

- knn = 2: 0.66320

- knn = 5: 0.62491

- knn = 10: 0.59760


### top10_features

- knn = 1: 0.63003

- knn = 2: 0.63973

- knn = 5: 0.59013

- knn = 10: 0.55603


### top5_features

- knn = 1: 0.65710

- knn = 2: 0.65603

- knn = 5: 0.61171

- knn = 10: 0.58272


### top2_features

- knn = 1: 0.48068

- knn = 2: 0.52666

- knn = 5: 0.50603

- knn = 10: 0.50100



### Error Analysis

#### ROC curve is not applicable in this case, it is strictly restricted to binary cases

In [19]:
y_test = df_test[target]                # get correct labels
y_pred_knn = knn_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [20]:
from sklearn.metrics import f1_score, classification_report

F1_score_test = f1_score(y_test, y_pred_knn, average='weighted').round(4)
report_knn = classification_report(y_test, y_pred_knn)

# print(F1_score_test)
print(report_knn)

             precision    recall  f1-score   support

          1       0.76      0.71      0.73       552
          2       0.81      0.62      0.70       561
          3       0.80      0.75      0.78       530
          4       0.91      0.96      0.94       563
          5       0.80      0.94      0.87       513
          6       0.79      0.84      0.81       529
          7       0.89      0.96      0.92       532

avg / total       0.82      0.82      0.82      3780



#### cross-validation score

We would like to get a better estimate of the performance without contaminating the test data. Try using cross-validation to estimate the performance.

(The results seem similar)

In [21]:
from sklearn.model_selection import cross_val_score
scores_knn = cross_val_score(knn_pipe, df_train, df_train[target], cv=5)
print(scores_knn.round(4))
print(scores_knn.mean().round(4))

[0.7389 0.7617 0.7615 0.7449 0.743 ]
0.75


#### Confusion matrix

In [22]:
from sklearn.metrics import confusion_matrix, roc_curve

confusion_knn = confusion_matrix(y_test, y_pred_knn, labels=[1,2,3,4,5,6,7])
print(confusion_knn)

[[391  68   2   0  27   7  57]
 [102 346  10   1  77  16   9]
 [  0   3 400  37   9  81   0]
 [  0   0  14 543   0   6   0]
 [  2   6  13   0 483   9   0]
 [  0   3  58  17   6 445   0]
 [ 21   1   0   0   1   0 509]]


### Commnets:

- Cover_Types 4 and 7 have the best performance, from EDA, Cover_Type4 is the one live at the lowest elevation, Cover_Type7 is the one live at the highest elevation;

- There is a lot of confusing between 1 and 2: they live in similar elevations just below 7, there is also a lot of confusion between 1 and 7;

- Confusion between 6 and 3, 6 and 4 can also be explained by the closeness in elevation

# 2. Logistic Regression Pipeline

In [72]:
from sklearn.linear_model import LogisticRegression

lr_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', LogisticRegression(penalty='l2', C=2.0))
])

lr_pipe.fit(df_train, df_train[target])

y_pred_lr = lr_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [73]:
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

             precision    recall  f1-score   support

          1       0.68      0.39      0.50       552
          2       0.44      0.41      0.43       561
          3       0.49      0.33      0.39       530
          4       0.56      0.85      0.68       563
          5       0.39      0.79      0.53       513
          6       0.29      0.00      0.01       529
          7       0.74      0.94      0.83       532

avg / total       0.51      0.53      0.48      3780



#### cross-validation score

In [74]:
scores_lr = cross_val_score(lr_pipe, df_train, df_train[target], cv=5)
print(scores_lr.round(4))
print(scores_lr.mean().round(4))

[0.5262 0.5322 0.5388 0.5344 0.5325]
0.5328


#### confusion matrix

In [50]:
confusion_lr = confusion_matrix(y_test, y_pred_lr, labels=[1,2,3,4,5,6,7])
print(confusion_lr)

[[144 158   0   0  97   1 152]
 [ 49 236   7   1 219  18  31]
 [  0  38 163 191  53  85   0]
 [  0   0 111 450   2   0   0]
 [  3 168  10   0 319  13   0]
 [  0  53 160 165  54  97   0]
 [ 27   0   0   0   5   0 500]]


# Check LR score on Kaggle

- all_features: 0.67 on test, 0.5603 on Kaggle

- top17_features: 0.62 on test, 0.4916 on Kaggle

- top10_features: 0.58 on test, 0.45328 on Kaggle

- top5_features: 0.53 on test, 0.42169 on Kaggle

- top2_features: 0.51 on test, 0.39383 on Kaggle

In [75]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
lr_pipe.fit(data_train, data_train[target])

pred = lr_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top5_lr_pipe.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 3. Random Forest Pipeline

### Note:

- increase the RF n_estimators from 100 to 500 improves F1 score by 0.01

In [81]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', RandomForestClassifier(n_estimators=500, n_jobs=-1, oob_score=True, random_state=121))
])

rf_pipe.fit(df_train, df_train[target])

y_pred_rf = rf_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [82]:
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

             precision    recall  f1-score   support

          1       0.58      0.55      0.56       552
          2       0.54      0.45      0.49       561
          3       0.40      0.37      0.38       530
          4       0.65      0.72      0.68       563
          5       0.68      0.79      0.73       513
          6       0.44      0.42      0.43       529
          7       0.79      0.84      0.82       532

avg / total       0.58      0.59      0.58      3780



#### cross-validation score

In [83]:
scores_rf = cross_val_score(rf_pipe, df_train, df_train[target], cv=5)
print(scores_rf.round(4))
print(scores_rf.mean().round(4))

[0.5883 0.6035 0.6049 0.6068 0.619 ]
0.6045


#### confusion matrix

In [56]:
confusion_rf = confusion_matrix(y_test, y_pred_rf, labels=[1,2,3,4,5,6,7])
print(confusion_rf)

[[410  79   0   0  10   3  50]
 [101 379  12   0  49  17   3]
 [  0   3 442  26   3  56   0]
 [  0   0  12 547   0   4   0]
 [  0  14   9   0 483   7   0]
 [  0   4  37   7   3 478   0]
 [ 18   2   0   0   0   0 512]]


# Check RF score on Kaggle

- all_features: 0.85 on test, 0.7549 on Kaggle

- top17_features: 0.84 on test, 0.73545 on Kaggle

- top10_features: 0.826 on test, 0.70326 on Kaggle

- top5_features: 0.7937 on test,  0.67985 on Kaggle

- top2_features: 0.6045 on test, 0.49393 on Kaggle

In [84]:
# STEP 3: Train the KNN_pipe and Make predictions ########################
rf_pipe.fit(data_train, data_train[target])

pred = rf_pipe.predict(data_test)
ID = data_test.Id

# STEP 4: Write to file for Kaggle Submission #############################
data_out = []
for i in range(len(ID)):
    data_out.append({'ID':ID.iloc[i], 'Cover_Type':pred[i]})

import csv
with open('test_submission_top2_rf_pipe.csv', 'wt') as fout:
    cout = csv.DictWriter(fout, ['ID','Cover_Type'])
    cout.writeheader()
    cout.writerows(data_out)

# 4. Gradient Boosting Classifier

## somehow not working

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(subsample=.7, n_estimators=100, max_depth=3, 
                                learning_rate=0.01, min_samples_leaf=1, random_state=3)

gb.fit(df_train, df_train[target])

y_pred_gb = gb.predict(df_test)  # get predicted labels

#### F1 score

In [None]:
report_gb = classification_report(y_test, y_pred_gb)
print(report_gb)

#### cross-validation score

In [None]:
scores_gb = cross_val_score(gb, df_train, df_train[target], cv=5)
print(scores_gb.round(4))
print(scores_gb.mean().round(4))

#### confusion matrix

In [None]:
confusion_gb = confusion_matrix(y_test, y_pred_gb, labels=[1,2,3,4,5,6,7])
print(confusion_gb)

In [None]:
parameter_grid = {
    'max_depth': range(1,6),
    'learning_rate': [.01, .05, .1],
    'max_features': [2, 5, 'auto']
}

param_searcher = GridSearchCV(gb, parameter_grid, cv=5)
param_searcher.fit(df_train, df_train[target])

In [None]:
param_searcher.best_params_

In [None]:
param_searcher.best_score_

In [None]:
GradientBoostingClassifier?

# 5. Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(estimators =[('knn', knn_pipe), ('lr', lr_pipe), ('rf', rf_pipe)])

vote_clf.fit(df_train, df_train[target])

y_pred_vote = vote_clf.predict(df_test)  # get predicted labels

#### F1 score

In [None]:
report_vote = classification_report(y_test, y_pred_rf)
print(report_vote)

#### cross-validation score

In [None]:
scores_vote = cross_val_score(vote_clf, df_train, df_train[target], cv=5)
print(scores_vote.round(4))
print(scores_vote.mean().round(4))

#### confusion matrix

In [None]:
confusion_vote = confusion_matrix(y_test, y_pred_vote, labels=[1,2,3,4,5,6,7])
print(confusion_vote)