# W207 Forest Cover - KNN v2

## Plan

#### Feature Engineering

- More data cleaning, removing useless data columns

- Feature engineering: using only the most important features

- Feature engineering: combine some of the soil types

#### Error Analysis

- Confusion matrix: find out which types give the most problems

- Any clue from EDA?


In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

# 0. Load Data

In [2]:
data_train = pd.read_csv('train.csv')             # read training data
data_test = pd.read_csv('test.csv')

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data_train, random_state=1)

In [3]:
df_train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

#### Feature vectors

In [4]:
# to keep track of things
target = df_train.columns[-1]
all_features_ALL = df_train.columns[1:-1]

num_features = df_train.columns[1:11]
cat_features_ALL = df_train.columns[11:-1]

wild_features = df_train.columns[11:15]
soil_features_ALL = df_train.columns[15:-1]

#### Filtered features

In [5]:
# remove constant Soil_Type7 and Soil_Type15
all_features = all_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
cat_features = cat_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])
soil_features = soil_features_ALL.drop(['Soil_Type7', 'Soil_Type15'])

In [6]:
top17_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7,23,51,16,2,52,10,12]]

top10_features = all_features_ALL[[0,5,9,13,3,4,6,1,8,7]]

top5_features = all_features_ALL[[0,5,9,13,3]]

top2_features = all_features_ALL[[0,5]]

In [7]:
top17_features

Index(['Elevation', 'Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area4',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Hillshade_9am', 'Aspect', 'Hillshade_3pm', 'Hillshade_Noon',
       'Soil_Type10', 'Soil_Type38', 'Soil_Type3', 'Slope', 'Soil_Type39',
       'Wilderness_Area1', 'Wilderness_Area3'],
      dtype='object')

# 1. KNN Pipeline

- input a list of features

- separate features into numeric and categorical

- using StandardScaling on the numeric features

- combine numeric and categorical features

- PCA optional

- fit KNN

### Note:

- Adding MinMaxScaler and Normalizer doesn't seem to help

In [8]:
features = top17_features

num_features 

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points'],
      dtype='object')

In [91]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

def get_num_features(features):
    num_features = [item for item in features if 'Soil' not in item and 'Wilderness' not in item ]
    return num_features

def get_cat_features(features):
    cat_features = [item for item in features if 'Soil' in item or 'Wilderness' in item ]
    return cat_features

features = top17_features

num_features = get_num_features(features)
cat_features = get_cat_features(features)


def select_num_features(X):
    return X[num_features]

def select_cat_features(X):
    return X[cat_features]

In [92]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

num_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_num_features, validate=False)),
    ('scale', StandardScaler())
])

cat_feature_pipeline = Pipeline(steps=[
    ('select', FunctionTransformer(select_cat_features, validate=False) )    
])

fu = FeatureUnion([
    ('numeric', num_feature_pipeline),
    ('categorical', cat_feature_pipeline)
])

knn_num = 2

knn_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', KNeighborsClassifier(n_neighbors=knn_num))
])

In [93]:
knn_pipe.fit(df_train, df_train[target])

Pipeline(steps=[('preprocess', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(steps=[('select', FunctionTransformer(accept_sparse=False,
          func=<function select_num_features at 0x113f3a400>,
          inv_kw_args=None, inverse_func=None, kw_args=None, pass_y=False,
          val...owski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform'))])

### Error Analysis

#### ROC curve is not applicable in this case, it is strictly restricted to binary cases

In [94]:
y_test = df_test[target]                # get correct labels
y_pred_knn = knn_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [95]:
from sklearn.metrics import f1_score, classification_report

F1_score_test = f1_score(y_test, y_pred, average='weighted').round(4)
report_knn = classification_report(y_test, y_pred_knn)

# print(F1_score_test)
print(report_knn)

             precision    recall  f1-score   support

          1       0.61      0.78      0.68       552
          2       0.65      0.51      0.57       561
          3       0.65      0.79      0.71       530
          4       0.89      0.89      0.89       563
          5       0.80      0.84      0.82       513
          6       0.83      0.61      0.71       529
          7       0.93      0.87      0.90       532

avg / total       0.76      0.75      0.75      3780



#### cross-validation score

We would like to get a better estimate of the performance without contaminating the test data. Try using cross-validation to estimate the performance.

(The results seem similar)

In [96]:
from sklearn.model_selection import cross_val_score
scores_knn = cross_val_score(knn_pipe, df_train, df_train[target], cv=5)
print(scores_knn.round(4))
print(scores_knn.mean().round(4))

[0.7367 0.7533 0.7451 0.7237 0.7466]
0.7411


#### Confusion matrix

In [97]:
from sklearn.metrics import confusion_matrix, roc_curve

confusion_knn = confusion_matrix(y_test, y_pred_knn, labels=[1,2,3,4,5,6,7])
print(confusion_knn)

[[428  76   1   0  11   0  36]
 [184 286  18   1  64   8   0]
 [  4  18 421  29   9  49   0]
 [  0   0  57 500   0   6   0]
 [ 25  31  23   0 431   3   0]
 [  5  17 129  30  23 325   0]
 [ 55  14   0   0   1   0 462]]


### Commnets:

- Cover_Types 4 and 7 have the best performance, from EDA, Cover_Type4 is the one live at the lowest elevation, Cover_Type7 is the one live at the highest elevation;

- There is a lot of confusing between 1 and 2: they live in similar elevations just below 7, there is also a lot of confusion between 1 and 7;

- Confusion between 6 and 3, 6 and 4 can also be explained by the closeness in elevation

# 2. Logistic Regression Pipeline

In [40]:
from sklearn.linear_model import LogisticRegression

lr_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', LogisticRegression(penalty='l2', C=2.0))
])

lr_pipe.fit(df_train, df_train[target])

y_pred_lr = lr_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [41]:
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

             precision    recall  f1-score   support

          1       0.63      0.63      0.63       552
          2       0.58      0.48      0.53       561
          3       0.61      0.56      0.59       530
          4       0.82      0.89      0.85       563
          5       0.61      0.70      0.65       513
          6       0.58      0.59      0.59       529
          7       0.87      0.86      0.87       532

avg / total       0.67      0.68      0.67      3780



#### cross-validation score

In [46]:
scores_lr = cross_val_score(lr_pipe, df_train, df_train[target], cv=5)
print(scores_lr.round(4))
print(scores_lr.mean().round(4))

[0.6693 0.6789 0.6755 0.6708 0.6614]
0.6712


#### confusion matrix

In [45]:
confusion_lr = confusion_matrix(y_test, y_pred_lr, labels=[1,2,3,4,5,6,7])
print(confusion_lr)

[[349  96   1   0  36   6  64]
 [128 269   9   0 126  24   5]
 [  0   2 297  71  15 145   0]
 [  0   0  30 503   0  30   0]
 [ 11  65  51   0 361  25   0]
 [  0  25  97  42  51 314   0]
 [ 66   5   0   0   2   0 459]]


# 3. Random Forest Pipeline

### Note:

- increase the RF n_estimators from 100 to 500 improves F1 score by 0.01

In [61]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline(steps=[
    ('preprocess', fu),
    ('predict', RandomForestClassifier(n_estimators=500, n_jobs=-1, oob_score=True, random_state=121))
])

rf_pipe.fit(df_train, df_train[target])

y_pred_rf = rf_pipe.predict(df_test)  # get predicted labels

#### F1 score

In [62]:
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

             precision    recall  f1-score   support

          1       0.78      0.74      0.76       552
          2       0.79      0.68      0.73       561
          3       0.86      0.83      0.85       530
          4       0.94      0.97      0.96       563
          5       0.88      0.94      0.91       513
          6       0.85      0.90      0.87       529
          7       0.91      0.96      0.93       532

avg / total       0.86      0.86      0.86      3780



#### cross-validation score

In [63]:
scores_rf = cross_val_score(rf_pipe, df_train, df_train[target], cv=5)
print(scores_rf.round(4))
print(scores_rf.mean().round(4))

[0.8432 0.8533 0.862  0.8522 0.8455]
0.8512


#### confusion matrix

In [64]:
confusion_rf = confusion_matrix(y_test, y_pred_rf, labels=[1,2,3,4,5,6,7])
print(confusion_rf)

[[410  79   0   0  10   3  50]
 [101 379  12   0  49  17   3]
 [  0   3 442  26   3  56   0]
 [  0   0  12 547   0   4   0]
 [  0  14   9   0 483   7   0]
 [  0   4  37   7   3 478   0]
 [ 18   2   0   0   0   0 512]]


# 4. Gradient Boosting Classifier

In [112]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(subsample=.7, n_estimators=100, max_depth=3, 
                                learning_rate=0.01, min_samples_leaf=1, random_state=3)

gb.fit(df_train, df_train[target])

y_pred_gb = gb.predict(df_test)  # get predicted labels

#### F1 score

In [113]:
report_gb = classification_report(y_test, y_pred_gb)
print(report_gb)

             precision    recall  f1-score   support

          1       1.00      1.00      1.00       552
          2       1.00      1.00      1.00       561
          3       1.00      1.00      1.00       530
          4       1.00      1.00      1.00       563
          5       1.00      1.00      1.00       513
          6       1.00      1.00      1.00       529
          7       1.00      1.00      1.00       532

avg / total       1.00      1.00      1.00      3780



#### cross-validation score

In [114]:
scores_gb = cross_val_score(gb, df_train, df_train[target], cv=5)
print(scores_gb.round(4))
print(scores_gb.mean().round(4))

[1. 1. 1. 1. 1.]
1.0


#### confusion matrix

In [115]:
confusion_gb = confusion_matrix(y_test, y_pred_gb, labels=[1,2,3,4,5,6,7])
print(confusion_gb)

[[552   0   0   0   0   0   0]
 [  0 561   0   0   0   0   0]
 [  0   0 530   0   0   0   0]
 [  0   0   0 563   0   0   0]
 [  0   0   0   0 513   0   0]
 [  0   0   0   0   0 529   0]
 [  0   0   0   0   0   0 532]]


In [None]:
parameter_grid = {
    'max_depth': range(1,6),
    'learning_rate': [.01, .05, .1],
    'max_features': [2, 5, 'auto']
}

param_searcher = GridSearchCV(gb, parameter_grid, cv=5)
param_searcher.fit(df_train, df_train[target])

In [None]:
param_searcher.best_params_

In [None]:
param_searcher.best_score_

In [105]:
GradientBoostingClassifier?

# 5. Voting Classifier

In [57]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(estimators =[('knn', knn_pipe), ('lr', lr_pipe), ('rf', rf_pipe)])

vote_clf.fit(df_train, df_train[target])

y_pred_vote = vote_clf.predict(df_test)  # get predicted labels

#### F1 score

In [58]:
report_vote = classification_report(y_test, y_pred_rf)
print(report_vote)

             precision    recall  f1-score   support

          1       0.77      0.74      0.75       552
          2       0.77      0.67      0.72       561
          3       0.85      0.83      0.84       530
          4       0.94      0.97      0.96       563
          5       0.89      0.94      0.91       513
          6       0.84      0.90      0.87       529
          7       0.91      0.96      0.94       532

avg / total       0.85      0.86      0.85      3780



#### cross-validation score

In [59]:
scores_vote = cross_val_score(vote_clf, df_train, df_train[target], cv=5)
print(scores_vote.round(4))
print(scores_vote.mean().round(4))

[0.7935 0.8189 0.8157 0.8058 0.7943]
0.8056


#### confusion matrix

In [60]:
confusion_vote = confusion_matrix(y_test, y_pred_vote, labels=[1,2,3,4,5,6,7])
print(confusion_vote)

[[406  79   1   0  13   4  49]
 [130 316  11   0  81  20   3]
 [  0   3 393  42   6  86   0]
 [  0   0  11 543   0   9   0]
 [  2  22  13   0 465  11   0]
 [  0   8  65  19   5 432   0]
 [ 31   2   0   0   2   0 497]]
