In [1]:
import pandas as pd

data = pd.read_csv("../Datasets/healthcare_dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [2]:
data.isnull().sum()

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

In [3]:
data.duplicated().sum()

np.int64(534)

In [4]:
ls = data.columns

ls

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

In [5]:
data = data.drop_duplicates(ls, keep = 'last')
data.duplicated().sum()

np.int64(0)

In [6]:
data = data.drop(['Name', 'Date of Admission',
                'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date' ], axis = 1)

data.columns

Index(['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Doctor',
       'Hospital', 'Insurance Provider', 'Medication', 'Test Results'],
      dtype='object')

In [7]:
RESULTS = data['Test Results']
RESULTS.unique()

array(['Normal', 'Inconclusive', 'Abnormal'], dtype=object)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54966 entries, 0 to 55499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 54966 non-null  int64 
 1   Gender              54966 non-null  object
 2   Blood Type          54966 non-null  object
 3   Medical Condition   54966 non-null  object
 4   Doctor              54966 non-null  object
 5   Hospital            54966 non-null  object
 6   Insurance Provider  54966 non-null  object
 7   Medication          54966 non-null  object
 8   Test Results        54966 non-null  object
dtypes: int64(1), object(8)
memory usage: 4.2+ MB


In [9]:
# Label encoding for each column in ls
ls = ['Doctor', 'Hospital', 'Insurance Provider', 'Gender', 'Blood Type', 'Medical Condition', 'Medication', 'Test Results']
label_encoding = data

for col in ls:
    label_encoding[col] = label_encoding[col].astype("category").cat.codes

label_encoding.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Doctor,Hospital,Insurance Provider,Medication,Test Results
0,30,1,5,2,26612,29933,1,3,2
1,62,1,0,5,33648,16012,3,1,1
2,76,0,1,5,37828,5473,0,0,2
3,28,0,6,3,22511,12317,3,1,0
4,43,0,2,2,21259,33598,0,4,0


In [10]:
from sklearn.model_selection import train_test_split

X = data.drop("Test Results", axis = 1)
y = data["Test Results"]

# Split data
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

for i in range (2, 16):

    clff = DecisionTreeClassifier(criterion="gini", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Decision Tree ('gini') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")
    
print('-' *50)

for i in range (2, 16):

    clff = DecisionTreeClassifier(criterion="entropy", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Decision Tree ('entropy') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")

print('-' *50)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy of Decision Tree ('gini') at depth=2: 33.0 %
Accuracy of Decision Tree ('gini') at depth=3: 33.0 %
Accuracy of Decision Tree ('gini') at depth=4: 33.0 %
Accuracy of Decision Tree ('gini') at depth=5: 33.0 %
Accuracy of Decision Tree ('gini') at depth=6: 34.0 %
Accuracy of Decision Tree ('gini') at depth=7: 34.0 %
Accuracy of Decision Tree ('gini') at depth=8: 34.0 %
Accuracy of Decision Tree ('gini') at depth=9: 34.0 %
Accuracy of Decision Tree ('gini') at depth=10: 34.0 %
Accuracy of Decision Tree ('gini') at depth=11: 35.0 %
Accuracy of Decision Tree ('gini') at depth=12: 35.0 %
Accuracy of Decision Tree ('gini') at depth=13: 35.0 %
Accuracy of Decision Tree ('gini') at depth=14: 35.0 %
Accuracy of Decision Tree ('gini') at depth=15: 35.0 %
--------------------------------------------------
Accuracy of Decision Tree ('entropy') at depth=2: 33.0 %
Accuracy of Decision Tree ('entropy') at depth=3: 33.0 %
Accuracy of Decision Tree ('entropy') at depth=4: 33.0 %
Accuracy of Deci

In [12]:
for i in range (2, 16):

    clff = RandomForestClassifier(criterion="gini", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Random Forest ('gini') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100 , "%")
    
print('-' *50)

for i in range (2, 16):

    clff = RandomForestClassifier(criterion="entropy", max_depth=i, random_state=42)
    clff.fit(X_train, y_train)

    # Evaluate Model
    y_pred = clff.predict(X_test)

    print(f"Accuracy of Random Forest ('entropy') at depth={i}:", round(accuracy_score(y_test, y_pred), 2)*100, "%")

print('-' *50)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy of Random Forest ('gini') at depth=2: 34.0 %
Accuracy of Random Forest ('gini') at depth=3: 34.0 %
Accuracy of Random Forest ('gini') at depth=4: 34.0 %
Accuracy of Random Forest ('gini') at depth=5: 34.0 %
Accuracy of Random Forest ('gini') at depth=6: 34.0 %
Accuracy of Random Forest ('gini') at depth=7: 35.0 %
Accuracy of Random Forest ('gini') at depth=8: 36.0 %
Accuracy of Random Forest ('gini') at depth=9: 35.0 %
Accuracy of Random Forest ('gini') at depth=10: 37.0 %
Accuracy of Random Forest ('gini') at depth=11: 37.0 %
Accuracy of Random Forest ('gini') at depth=12: 38.0 %
Accuracy of Random Forest ('gini') at depth=13: 39.0 %
Accuracy of Random Forest ('gini') at depth=14: 40.0 %
Accuracy of Random Forest ('gini') at depth=15: 41.0 %
--------------------------------------------------
Accuracy of Random Forest ('entropy') at depth=2: 34.0 %
Accuracy of Random Forest ('entropy') at depth=3: 34.0 %
Accuracy of Random Forest ('entropy') at depth=4: 34.0 %
Accuracy of Rand