In [20]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report
from sklearn.model_selection import cross_val_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

## Load Data

In [8]:
df = pd.read_csv('processed/covtype_categorical_small.csv')
labels = df['Cover_Type']
df = df.drop(['Cover_Type'],axis = 1)
data = df.drop(df.columns[[10,11]],axis = 1)
scaler = MinMaxScaler()
scaled = scaler.fit(data).transform(data)
X = pd.DataFrame(scaled)
X.columns = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
X['Wilderness_Area'] = df['Wilderness_Area']
X['Soil_Type'] = df['Soil_Type']

# Drop Features
X = X.drop(['Aspect', 'Slope', 'Wilderness_Area','Soil_Type'],axis = 1)

## Data Split

In [4]:
X_train,X_test, Y_train,Y_test = train_test_split(X,labels,test_size = 0.2)

## Classification
>Gradient Boosting

In [25]:
clf = GradientBoostingClassifier().fit(X_train,Y_train)

In [26]:
predictions = clf.predict(X_test)
acc = accuracy_score(Y_test,predictions)
print ('Accuracy: ',acc)
report = classification_report(Y_test,predictions)
print ('\nReport: \n',report)

Accuracy:  0.750600242679

Report: 
              precision    recall  f1-score   support

          1       0.73      0.74      0.74     42575
          2       0.76      0.82      0.79     56536
          3       0.74      0.76      0.75      7103
          4       0.81      0.65      0.72       514
          5       0.77      0.16      0.26      1895
          6       0.64      0.35      0.46      3457
          7       0.84      0.52      0.64      4123

avg / total       0.75      0.75      0.74    116203



> AdaBoost

In [17]:
clf = AdaBoostClassifier(n_estimators = 100).fit(X_train,Y_train)

In [18]:
predictions = clf.predict(X_test)
acc = accuracy_score(Y_test,predictions)
print ('Accuracy: ',acc)
report = classification_report(Y_test,predictions)
print ('\nReport: \n',report)

Accuracy:  0.592024302299

Report: 
              precision    recall  f1-score   support

          1       0.65      0.62      0.64     42575
          2       0.72      0.62      0.66     56536
          3       0.41      0.92      0.56      7103
          4       0.00      0.00      0.00       514
          5       0.00      0.00      0.00      1895
          6       0.00      0.00      0.00      3457
          7       0.08      0.21      0.12      4123

avg / total       0.61      0.59      0.59    116203



  'precision', 'predicted', average, warn_for)


In [21]:
clf = BaggingClassifier().fit(X_train,Y_train)

In [22]:
predictions = clf.predict(X_test)
acc = accuracy_score(Y_test,predictions)
print ('Accuracy: ',acc)
report = classification_report(Y_test,predictions)
print ('\nReport: \n',report)

Accuracy:  0.954329922635

Report: 
              precision    recall  f1-score   support

          1       0.95      0.96      0.96     42575
          2       0.96      0.96      0.96     56536
          3       0.93      0.95      0.94      7103
          4       0.90      0.82      0.85       514
          5       0.93      0.79      0.86      1895
          6       0.94      0.86      0.90      3457
          7       0.98      0.94      0.96      4123

avg / total       0.95      0.95      0.95    116203



###### Conclusions

> Random Forest works best on this. 

> Accuracy = 96.01 %

> - Model fitted after dropping less important features such as Wilderness Area, Slope, Aspect. But this does not affect the overall accuracy. The effect is less than 1 per cent

> - Model fitted after dropping the two categorical variables i.e wilderness area and soil type. This reduces the accuracy by 2-3 % at most since soil_type is important 
