In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report
from sklearn.model_selection import cross_val_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

## Load Data

In [45]:
# Read the preprocessed cover-type table and split the target column off.
df = pd.read_csv('processed/covtype_categorical_small.csv')
labels = df['Cover_Type']
df = df.drop(['Cover_Type'], axis=1)

# The columns at positions 10 and 11 are kept out of the scaler.
# NOTE(review): presumably these are Wilderness_Area and Soil_Type, which are
# re-attached unscaled below -- confirm against the CSV column order.
data = df.drop(df.columns[[10, 11]], axis=1)

# Min-max scale the remaining columns. The scaler returns a bare array, so the
# column names are supplied again when rebuilding the frame.
# NOTE(review): the scaler is fit on every row before the train/test split, so
# the held-out rows contribute to the min/max used for scaling.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)
X = pd.DataFrame(
    scaled,
    columns=[
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ],
)

# Re-attach the two unscaled columns, in this order, after the scaled ones.
for unscaled_column in ('Wilderness_Area', 'Soil_Type'):
    X[unscaled_column] = df[unscaled_column]

# Drop Features
X = X.drop(['Aspect', 'Slope', 'Wilderness_Area'], axis=1)

## Data Split

In [46]:
# Hold out 20% of the rows for evaluation. The split is seeded so that the
# same train/test partition is produced on every run; without a fixed
# random_state each execution shuffles differently and the reported accuracy
# cannot be reproduced or compared across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

## Classification
> Decision Tree

In [61]:
# Fit an entropy-criterion decision tree capped at depth 30. random_state is
# pinned because scikit-learn randomly permutes the candidate features at each
# split, so an unseeded tree can differ between runs on identical data.
tree_model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=30,
    random_state=42,
)
tree_model.fit(X_train, Y_train)

# Export the fitted tree as DOT source text (out_file=None returns the string
# instead of writing a file). Feature names are taken from the frame the model
# was actually fitted on so the labels always match the trained columns.
tree_data = tree.export_graphviz(
    tree_model,
    out_file=None,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(tree_data)
# Rendering is left disabled: evaluate `graph` as the last expression of a
# cell to draw the tree inline.

In [58]:
# Score the fitted tree on the held-out split: overall accuracy plus the
# per-class precision / recall / F1 table.
predictions = tree_model.predict(X_test)
acc = accuracy_score(y_true=Y_test, y_pred=predictions)
report = classification_report(y_true=Y_test, y_pred=predictions)

print('Accuracy: ', acc)
print('\nReport: \n', report)

Accuracy:  0.941404266671

Report: 
              precision    recall  f1-score   support

          1       0.94      0.94      0.94     42415
          2       0.95      0.95      0.95     56726
          3       0.93      0.93      0.93      7090
          4       0.84      0.82      0.83       523
          5       0.85      0.85      0.85      1894
          6       0.89      0.87      0.88      3442
          7       0.96      0.94      0.95      4113

avg / total       0.94      0.94      0.94    116203



###### Conclusions

> Accuracy = 69 % at max_depth = 5

> Accuracy = 76 % at max_depth = 10

> Accuracy = 85 % at max_depth = 15

> Accuracy = 91.7 % at max_depth = 20

> Accuracy = 94.1 % at max_depth = 30


> - Increasing max_depth increases test accuracy over the range tried (5 to 30).

> - Refitting the model after dropping the less important features (Wilderness Area, Slope, Aspect) does not materially affect the overall accuracy: the effect is less than 1 per cent.

> - Refitting the model after dropping the two categorical variables (i.e. wilderness area and soil type) reduces the accuracy by 2-3 % at most; the drop occurs because soil_type is an important feature.
