In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report
from sklearn.model_selection import cross_val_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

## Load Data

In [10]:
# Read the processed cover-type table and separate the target column from the
# feature columns.
df = pd.read_csv('processed/covtype_categorical_small.csv')
labels = df['Cover_Type']
df = df.drop(['Cover_Type'], axis=1)

# The columns at positions 10 and 11 are kept out of the scaler. Presumably
# these are the categorical Wilderness_Area / Soil_Type columns that are
# re-attached unscaled further down -- TODO confirm against the CSV column order.
data = df.drop(df.columns[[10, 11]], axis=1)

# Min-max scale the remaining columns and wrap the resulting array back into a
# labelled DataFrame.
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split performed later in the notebook.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)
X = pd.DataFrame(
    scaled,
    columns=[
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ],
)

# Append the two categorical columns, unscaled, to the right of the scaled ones.
X = pd.concat([X, df[['Wilderness_Area', 'Soil_Type']]], axis=1)

# Drop Features: columns excluded from the model for this run.
X = X.drop(['Aspect', 'Slope', 'Wilderness_Area'], axis=1)

## Data Split

In [11]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so that the
# same rows land in the test set on every Restart & Run All; without a fixed
# random_state the reported metrics change from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

## Classification
> Random Forest

In [12]:
# Fit a random forest with the library's default hyperparameters. The forest
# samples rows and candidate features at random while growing its trees, so the
# seed is pinned to make the fitted model (and its scores) reproducible.
clf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

In [14]:
# Predict on the held-out rows, then compute the summary metrics before
# printing them.
predictions = clf.predict(X_test)

acc = accuracy_score(Y_test, predictions)
report = classification_report(Y_test, predictions)  # per-class precision / recall / f1 / support

print('Accuracy: ', acc)
print('\nReport: \n', report)

Accuracy:  0.960181750901

Report: 
              precision    recall  f1-score   support

          1       0.96      0.97      0.96     42193
          2       0.97      0.97      0.97     56863
          3       0.95      0.96      0.95      7203
          4       0.90      0.89      0.89       548
          5       0.94      0.81      0.87      1828
          6       0.94      0.89      0.92      3478
          7       0.98      0.95      0.96      4090

avg / total       0.96      0.96      0.96    116203



###### Conclusions

> Random Forest works best on this dataset.

> Accuracy = 96.02 %

> - Model fitted after dropping less important features such as Wilderness Area, Slope, and Aspect. This has almost no effect on the overall accuracy: the change is less than 1%.

> - Model fitted after dropping the two categorical variables, i.e., Wilderness Area and Soil Type. This reduces the accuracy by at most 2-3%, since Soil Type is an important feature.
