# Implementation of all decision tree models and optimizations (using the already split datasets)

# Library requirements

In [None]:
!pip install disarray

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics # Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from matplotlib.projections.polar import np
import disarray
import json
import csv

# Reading train and test data

In [None]:
# Load the training features from the pre-split CSV file.
# The bare name on the last line makes the notebook display the frame.
X_train = pd.read_csv("./Data/X_train.csv")
X_train

In [None]:
# Load the training labels from the pre-split CSV file
# (a 'class' column is read from this frame further down).
y_train = pd.read_csv("./Data/y_train.csv")
y_train

In [None]:
# Load the test features from the pre-split CSV file.
# The bare name on the last line makes the notebook display the frame.
X_test = pd.read_csv("./Data/X_test.csv")
X_test

In [None]:
# Load the test labels from the pre-split CSV file.
# NOTE(review): presumably has the same single 'class' column as y_train — confirm.
y_test = pd.read_csv("./Data/y_test.csv")
y_test

In [None]:
# Feature names are the training-frame columns; label names are the
# distinct values of the 'class' column in ascending order.
feature_names = X_train.columns
label_names = np.sort(y_train['class'].unique())

print("label_names: ", label_names)
print("feature_names: ", feature_names)

# Training and testing non-optimized decision tree

In [None]:
# Build a decision tree classifier with scikit-learn's default
# hyperparameters and fit it on the training split (fit returns the
# fitted estimator itself).
clf_non_optimized = DecisionTreeClassifier().fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred_non_optimized = clf_non_optimized.predict(X_test)

In [None]:
# Share of test samples whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_non_optimized)
print("Accuracy:", accuracy)

In [None]:
# Start a fresh results file ('w' truncates any previous run):
# a caption line followed by the accuracy value on its own line.
text = 'This is the accuracy for the non optimized decision tree'

with open('non_optimized_results.csv', mode='w') as f:
    f.write(f"{text}\n{accuracy!s}\n")


In [None]:
# Confusion matrix of the non-optimized tree's test predictions;
# the trailing bare name displays it in the notebook.
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_non_optimized)
c_matrix

In [None]:
# Wrap the raw confusion matrix in a DataFrame so it can be written as CSV
df_c_matrix = pd.DataFrame(data=c_matrix)
df_c_matrix

In [None]:
# Append a caption (surrounded by newlines) and then the confusion-matrix
# table to the results file created earlier.
text = 'These are the confusion matrix values. True labels along y-axis and predicted labels along x-axis'

with open('non_optimized_results.csv', mode='a') as f:
    f.write(f"\n{text}\n")

df_c_matrix.to_csv("non_optimized_results.csv", mode='a')

In [None]:
# calculating precision, recall, and f1
# Row/column labels are '0', '1', ... derived from the matrix size, so the
# frame is built correctly for any number of classes (for three classes
# this yields exactly '0', '1', '2').
_cm_labels = [str(i) for i in range(c_matrix.shape[0])]
df = pd.DataFrame(c_matrix, index=_cm_labels, columns=_cm_labels)

# `.da` is the DataFrame accessor registered by importing disarray
imp_metrics = df.da.export_metrics(metrics_to_include=['precision', 'recall', 'f1'])
imp_metrics

In [None]:
# Append a caption and then the precision / recall / f1 table to the
# non-optimized results file.
text = 'These are the 1. precision 2. recall and 3. f1 scores'

with open('non_optimized_results.csv', mode='a') as f:
    f.write(f"\n{text}\n")

imp_metrics.to_csv('non_optimized_results.csv', mode='a')

In [None]:
# Draw the confusion matrix for the non-optimized tree and save the figure
confusion_matrix_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred_non_optimized,
)
confusion_matrix_display.figure_.savefig('confusion_matrix_non_optimized.png')

# Tree visualization and feature analysis for non-optimized decision tree

In [None]:
# Print the fitted non-optimized tree as indented text rules
text_representation = tree.export_text(decision_tree=clf_non_optimized)
print(text_representation)

In [None]:
# visual representation of the decision tree
# plot_tree is given plain Python lists: some scikit-learn releases validate
# feature_names / class_names strictly and reject a pandas Index.
label_names_str = label_names.astype(str).tolist()
figure_tree = plt.figure(figsize=(100, 100))
_ = tree.plot_tree(clf_non_optimized,
                   feature_names=list(feature_names),
                   class_names=label_names_str,
                   filled=True)
figure_tree.savefig("decision_tree_non_optimized.png")

In [None]:
# Per-feature importance scores of the fitted non-optimized tree
feat_importance = clf_non_optimized.feature_importances_
print(f"Feature Importance = {feat_importance!s}")

In [None]:
# Keep the names of features whose importance score exceeds 0.006
top_feature_names = np.array([
    name
    for name, score in zip(feature_names, feat_importance)
    if score > 0.006
])

print(top_feature_names)

In [None]:
# Bar chart of the non-optimized tree's importance scores, one bar per
# feature, saved as a PNG.
feat_importances = pd.DataFrame(
    data=clf_non_optimized.feature_importances_,
    index=feature_names,
)
plot = feat_importances.plot.bar(figsize=(6, 4))
fig = plot.get_figure()
fig.savefig("feature_importances_non_optimized.png")

# Training and testing optimized decision tree (after hyperparameter tuning)

In [None]:
# Hyperparameter grid: every combination of these values is evaluated
tree_para = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 10),
    'max_leaf_nodes': np.arange(2, 10),
}

# Grid search with 5-fold cross-validation over decision tree classifiers,
# fitted on the training split (fit returns the fitted search object)
clf_optimized = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=tree_para,
    cv=5,
).fit(X_train, y_train)

# Best hyperparameter combination found by the search
print(clf_optimized.best_params_)

# Predicted labels for the test split
y_pred_optimized = clf_optimized.predict(X_test)

In [None]:
# Share of test samples the tuned model labels correctly
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_optimized)
print("Accuracy:", accuracy)

In [None]:
# Start a fresh results file for the tuned model ('w' truncates any
# previous run): caption line, then the accuracy value.
text = 'This is the accuracy for the optimized decision tree'

with open('optimized_results.csv', mode='w') as f:
    f.write(f"{text}\n{accuracy!s}\n")


In [None]:
# making the confusion matrix for the results of the optimized decision tree
# (the trailing bare name displays the matrix in the notebook)

c_matrix = confusion_matrix(y_test, y_pred_optimized)
c_matrix

In [None]:
# Wrap the tuned model's confusion matrix in a DataFrame for CSV export
df_c_matrix = pd.DataFrame(data=c_matrix)
df_c_matrix

In [None]:
# Append a caption (surrounded by newlines) and then the confusion-matrix
# table to the tuned model's results file.
text = 'These are the confusion matrix values. True labels along y-axis and predicted labels along x-axis'

with open('optimized_results.csv', mode='a') as f:
    f.write(f"\n{text}\n")

df_c_matrix.to_csv('optimized_results.csv', mode='a')

In [None]:
# calculating precision, recall, and f1
# Row/column labels are '0', '1', ... derived from the matrix size, so the
# frame is built correctly for any number of classes (for three classes
# this yields exactly '0', '1', '2').
_cm_labels = [str(i) for i in range(c_matrix.shape[0])]
df = pd.DataFrame(c_matrix, index=_cm_labels, columns=_cm_labels)

# `.da` is the DataFrame accessor registered by importing disarray
imp_metrics = df.da.export_metrics(metrics_to_include=['precision', 'recall', 'f1'])
imp_metrics

In [None]:
# Append a caption and then the precision / recall / f1 table to the
# tuned model's results file.
text = 'These are the 1. precision 2. recall and 3. f1 scores'

with open('optimized_results.csv', mode='a') as f:
    f.write(f"\n{text}\n")

imp_metrics.to_csv('optimized_results.csv', mode='a')

In [None]:
# Draw the confusion matrix for the tuned model and save the figure
confusion_matrix_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred_optimized,
)
confusion_matrix_display.figure_.savefig('confusion_matrix_optimized.png')

# Tree visualization and feature analysis for optimized decision tree

In [None]:
# Print the best tree found by the grid search as indented text rules
text_representation = tree.export_text(decision_tree=clf_optimized.best_estimator_)
print(text_representation)

In [None]:
# visual representation of the optimized decision tree
# plot_tree is given plain Python lists: some scikit-learn releases validate
# feature_names / class_names strictly and reject a pandas Index.
figure_tree = plt.figure(figsize=(50, 50))
label_names_str = label_names.astype(str).tolist()
_ = tree.plot_tree(clf_optimized.best_estimator_,
                   feature_names=list(feature_names),
                   class_names=label_names_str,
                   filled=True)
# NOTE(review): the output filename is spelled "optimizied"; left unchanged
# in case other tooling expects this exact name — confirm before renaming.
figure_tree.savefig("decision_tree_optimizied.png")

In [None]:
# Per-feature importance scores of the best tree from the grid search
feat_importance = clf_optimized.best_estimator_.feature_importances_
print(f"Feature Importance = {feat_importance!s}")

In [None]:
# Bar chart of the tuned tree's importance scores, one bar per feature,
# saved as a PNG.
feat_importances = pd.DataFrame(
    data=clf_optimized.best_estimator_.feature_importances_,
    index=feature_names,
)
plot = feat_importances.plot.bar(figsize=(6, 4))
fig = plot.get_figure()
fig.savefig("feature_importances_optimized.png")

In [None]:
# getting the top features
# These names are removed from the data in the "top three features"
# experiment, so keep at most the three highest-importance features and only
# those the tree actually used (importance > 0). Filtering on importance > 0
# alone could select any number of features depending on the fitted tree.
n_top_features = 3
ranked_idx = np.argsort(feat_importance, kind="stable")[::-1][:n_top_features]

# Sort the chosen positions so names are listed in original column order
selected_idx = sorted(int(i) for i in ranked_idx if feat_importance[i] > 0)
top_feature_names = np.array([feature_names[i] for i in selected_idx])

print(top_feature_names)

# Training and testing optimized and non-optimized models after removing top three features

In [None]:
# Training features with the tuned tree's top-feature columns removed
X_train_delete_features = X_train.drop(columns=top_feature_names)
X_train_delete_features

In [None]:
# Test features with the same top-feature columns removed
X_test_delete_features = X_test.drop(columns=top_feature_names)
X_test_delete_features

In [None]:
# getting the remaining features (column labels left after the drop);
# used below as feature names when plotting the reduced-feature trees
delete_feature_names = X_test_delete_features.columns

In [None]:
# display the remaining feature names in the notebook
delete_feature_names

In [None]:
# Build a default-hyperparameter decision tree classifier and fit it on the
# reduced training features (fit returns the fitted estimator itself).
clf_non_optimized_delete_features = DecisionTreeClassifier().fit(
    X_train_delete_features, y_train
)

# Predicted labels for the reduced test features
y_pred_non_optimized_delete_features = clf_non_optimized_delete_features.predict(
    X_test_delete_features
)

In [None]:
# Test accuracy of the default tree trained without the removed features
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_non_optimized_delete_features),
)

In [None]:
# Textual representation of the default tree trained after deleting the
# top features
text_representation = tree.export_text(decision_tree=clf_non_optimized_delete_features)
print(text_representation)

In [None]:
# visual representation of the decision tree for the model trained after
# deleting top features
# plot_tree is given plain Python lists: some scikit-learn releases validate
# feature_names / class_names strictly and reject a pandas Index.
figure_tree = plt.figure(figsize=(100, 100))
label_names_str = label_names.astype(str).tolist()
_ = tree.plot_tree(clf_non_optimized_delete_features,
                   feature_names=list(delete_feature_names),
                   class_names=label_names_str,
                   filled=True)
figure_tree.savefig("decision_tree_non_optimizied_without3bestfeat.png")

In [None]:
# Hyperparameter tuning for the model trained after deleting top features.
# Every combination of the values below is evaluated.
tree_para = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 10),
    'max_leaf_nodes': np.arange(2, 10),
}

# Grid search with 5-fold cross-validation, fitted on the reduced training
# features (fit returns the fitted search object)
clf_optimized_delete_features = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=tree_para,
    cv=5,
).fit(X_train_delete_features, y_train)
print(clf_optimized_delete_features.best_params_)

# Predicted labels for the reduced test features
y_pred_optimized_delete_features = clf_optimized_delete_features.predict(X_test_delete_features)

# Test accuracy of the tuned reduced-feature model
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_optimized_delete_features),
)

In [None]:
# Textual representation of the tuned tree trained after deleting the
# top features
text_representation = tree.export_text(
    decision_tree=clf_optimized_delete_features.best_estimator_
)
print(text_representation)

In [None]:
# visual representation of the tuned decision tree for the model trained
# after deleting top features
# plot_tree is given plain Python lists: some scikit-learn releases validate
# feature_names / class_names strictly and reject a pandas Index.
figure_tree = plt.figure(figsize=(50, 50))
label_names_str = label_names.astype(str).tolist()
_ = tree.plot_tree(clf_optimized_delete_features.best_estimator_,
                   feature_names=list(delete_feature_names),
                   class_names=label_names_str,
                   filled=True)
figure_tree.savefig("decision_tree_optimizied_without3bestfeat.png")