In [1]:
import pandas as pd

data = pd.read_csv("../Datasets/drug200.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [2]:
data.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [3]:
data.duplicated().sum()

np.int64(0)

In [4]:
data.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [5]:
DRUGS = data["Drug"]
DRUGS.unique()

array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

In [6]:
# Label encoding for each column in ls
ls = ['Sex', 'BP', 'Cholesterol', 'Drug']
label_encoding = data

for col in ls:
    label_encoding[col] = label_encoding[col].astype("category").cat.codes

label_encoding.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,0
1,47,1,1,0,13.093,3
2,47,1,1,0,10.114,3
3,28,0,2,0,7.798,4
4,61,0,1,0,18.043,0


In [7]:
from sklearn.model_selection import train_test_split

X = label_encoding.drop("Drug", axis = 1)
y = label_encoding["Drug"]

# Split data
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

for i in range (2, 6):

    clff = DecisionTreeClassifier(criterion="gini", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Decision Tree ('gini') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")
    
print('-' *50)

for i in range (2, 6):

    clff = DecisionTreeClassifier(criterion="entropy", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Decision Tree ('entropy') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")

print('-' *50)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy of Decision Tree ('gini') at depth=2: 82.0 %
Accuracy of Decision Tree ('gini') at depth=3: 90.0 %
Accuracy of Decision Tree ('gini') at depth=4: 97.0 %
Accuracy of Decision Tree ('gini') at depth=5: 97.0 %
--------------------------------------------------
Accuracy of Decision Tree ('entropy') at depth=2: 82.0 %
Accuracy of Decision Tree ('entropy') at depth=3: 90.0 %
Accuracy of Decision Tree ('entropy') at depth=4: 97.0 %
Accuracy of Decision Tree ('entropy') at depth=5: 97.0 %
--------------------------------------------------

Confusion Matrix:
 [[18  0  0  0  0]
 [ 0  5  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  3  0]
 [ 1  0  0  0 10]]


In [9]:
for i in range (2, 6):

    clff = RandomForestClassifier(criterion="gini", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Random Forest ('gini') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100 , "%")
    
print('-' *50)

for i in range (2, 6):

    clff = RandomForestClassifier(criterion="entropy", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Random Forest ('entropy') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")

print('-' *50)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy of Random Forest ('gini') at depth=2: 90.0 %
Accuracy of Random Forest ('gini') at depth=3: 90.0 %
Accuracy of Random Forest ('gini') at depth=4: 97.0 %
Accuracy of Random Forest ('gini') at depth=5: 97.0 %
--------------------------------------------------
Accuracy of Random Forest ('entropy') at depth=2: 90.0 %
Accuracy of Random Forest ('entropy') at depth=3: 90.0 %
Accuracy of Random Forest ('entropy') at depth=4: 97.0 %
Accuracy of Random Forest ('entropy') at depth=5: 97.0 %
--------------------------------------------------

Confusion Matrix:
 [[18  0  0  0  0]
 [ 0  5  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  3  0]
 [ 1  0  0  0 10]]
