# **2. Implement Decision Tree with Hyperparameter Tuning**

In [12]:
# Importing necessary libraries.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
# Load the anemia dataset from CSV into the DataFrame `DF`, then preview the
# first five rows to sanity-check the columns.
# NOTE(review): "/content/..." is an absolute path (looks like a Colab upload
# location — confirm); this cell fails wherever the file is not at that path.
DF = pd.read_csv("/content/anemia.csv")
DF.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [6]:
# Summarize the frame: number of rows, and for each column its non-null count
# and dtype. Used here to check for missing values and unexpected datatypes.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


In [7]:
# Descriptive statistics for each column: count, mean, standard deviation,
# min, quartiles (25% = Q1, 50% = median, 75% = Q3) and max.
DF.describe()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
count,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0
mean,0.52076,13.412738,22.90563,30.251232,85.523786,0.436312
std,0.499745,1.974546,3.969375,1.400898,9.636701,0.496102
min,0.0,6.6,16.0,27.8,69.4,0.0
25%,0.0,11.7,19.4,29.0,77.3,0.0
50%,1.0,13.2,22.7,30.4,85.3,0.0
75%,1.0,15.0,26.2,31.4,94.2,1.0
max,1.0,16.9,30.0,32.5,101.6,1.0


In [8]:
# Splitting the data into training and testing sets.
# Features and target are selected by column label rather than by position, so
# the split does not silently change if columns are added or reordered in the CSV.
X = DF.drop(columns="Result")
y = DF["Result"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Initializing the Decision Tree Classifier that is later handed to
# GridSearchCV as the estimator to tune.
# random_state is fixed so that tree construction is reproducible between runs.
dtc = DecisionTreeClassifier(random_state=42)

In [13]:
# Hyper-parameter search space for the decision tree. Every combination is
# tried: 2 * 2 * 5 * 3 * 3 = 180 candidates.
param_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[10, 20, 30, 40, 50],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# Exhaustive grid search with 5-fold cross-validation, running on all
# available cores (n_jobs=-1) and printing a progress summary (verbose=1).
grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [14]:
# Getting the hyper-parameter combination selected by the grid search
# (the candidate with the best cross-validation score) and printing it.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [15]:
# Evaluating the tuned Decision Tree on the held-out test set.
# GridSearchCV was created with its default refit=True, so best_estimator_ is
# already fitted on the full training split with the best parameters; calling
# .fit() on it again would only repeat that work.
best_dtc = grid_search.best_estimator_

y_pred = best_dtc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)

Accuracy: 1.0


array([[157,   0],
       [  0, 128]])