In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report
from sklearn.model_selection import cross_val_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

## Load Data

In [2]:
# Read the pre-processed table and separate the target column from the features.
df = pd.read_csv('processed/covtype_categorical_small.csv')
labels = df['Cover_Type']
df = df.drop(columns=['Cover_Type'])

# Scale everything except the columns at positions 10 and 11 (presumably the two
# categorical columns, Wilderness_Area and Soil_Type -- confirm against the CSV).
# MinMaxScaler with default settings maps each column onto [0, 1].
data = df.drop(columns=df.columns[[10, 11]])
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)

# The scaler hands back a bare array, so the column names are restored by hand.
X = pd.DataFrame(
    scaled,
    columns=[
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ],
)

# Re-attach the categorical columns unscaled (aligned on the row index).
X = X.assign(
    Wilderness_Area=df['Wilderness_Area'],
    Soil_Type=df['Soil_Type'],
)

# Drop Features
# Note: Wilderness_Area is attached just above and removed again here, so of the
# two categorical columns only Soil_Type reaches the model.
X = X.drop(columns=['Aspect', 'Slope', 'Wilderness_Area'])

## Data Split

In [3]:
# Hold out 20% of the rows for evaluation.
# The shuffle is seeded so that the split -- and therefore every metric printed
# further down -- is the same on each Restart & Run All. Without a fixed seed,
# accuracy differences between separate fits cannot be told apart from
# split-to-split noise.
RANDOM_STATE = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, labels, test_size=0.2, random_state=RANDOM_STATE
)

## Classification
> Gaussian Naive Bayes

In [4]:
# Gaussian Naive Bayes with default settings, trained on the training split.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)

In [5]:
# Score the fitted classifier on the held-out split.
predictions = gnb.predict(X_test)
acc = accuracy_score(Y_test, predictions)
report = classification_report(Y_test, predictions)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print('Accuracy: ', acc)
print('\nReport: \n', report)

Accuracy:  0.625087132002

Report: 
              precision    recall  f1-score   support

          1       0.60      0.70      0.65     42732
          2       0.74      0.59      0.65     56396
          3       0.56      0.83      0.67      7055
          4       0.49      0.32      0.39       541
          5       0.21      0.22      0.22      1886
          6       0.31      0.25      0.28      3475
          7       0.37      0.58      0.45      4118

avg / total       0.64      0.63      0.63    116203



###### Conclusions

> Accuracy = 62.51 %

> - Model fitted after dropping the less important features (Wilderness Area, Slope, Aspect). This does not materially affect the overall accuracy: the effect is less than 1 %.

> - Model fitted after dropping the two categorical variables, i.e. Wilderness Area and Soil Type. This reduces the accuracy by 2-3 % at most, since Soil Type is an important feature.
