# RANDOM FOREST

## Import Python Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## Load the dataset

In [None]:
# Read the CSV into the DataFrame `data`, which every later cell works on.
# Replace with the path to your dataset
file_path = 'diabetic_data_clean.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


## Display First 10 rows

In [None]:

# Show the first ten rows as the cell's rich output.
# NOTE(review): the saved output shows numeric codes for race/gender, while the later
# "Dataset Preview" print shows raw strings -- this cell looks to have been executed
# after the encoding step. Restart the kernel and run all cells to refresh it.
data.head(n=10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,15738,63555939,2,0,10,3,3,4,12,18,...,0,2,1,0,0,0,0,0,1,2
1,40926,85504905,2,0,5,1,3,7,7,11,...,0,1,1,0,0,0,0,0,1,0
2,42570,77586282,2,1,9,1,6,7,10,11,...,0,2,1,0,0,0,0,1,1,2
3,84222,108662661,2,0,6,1,1,7,3,3,...,0,0,1,0,0,0,0,1,1,2
4,236316,40523301,2,1,9,1,3,7,6,3,...,0,0,1,0,0,0,0,0,1,2
5,248916,115196778,2,0,6,1,1,1,2,62,...,0,2,1,0,0,0,0,1,1,1
6,252822,18196434,2,0,9,1,2,7,5,3,...,0,0,1,0,0,0,0,0,1,1
7,260166,80845353,2,0,8,1,1,7,6,11,...,0,2,1,0,0,0,0,1,1,1
8,325848,63023292,2,0,7,1,1,7,2,3,...,0,1,1,0,0,0,0,0,1,1
9,383430,80588529,2,0,8,1,2,7,1,3,...,0,1,1,0,0,0,0,0,1,1


## Step 1: Inspect the dataset

In [None]:
# Step 1: Inspect the dataset
# Prints a short preview, the column/dtype summary, and the per-column null counts.
print("Dataset Preview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()
print("\nMissing Values:")
print(data.isnull().sum())

Dataset Preview:
   encounter_id  patient_nbr       race  gender  age  admission_type_id  \
0         15738     63555939  Caucasian  Female   10                  3   
1         40926     85504905  Caucasian  Female    5                  1   
2         42570     77586282  Caucasian    Male    9                  1   
3         84222    108662661  Caucasian  Female    6                  1   
4        236316     40523301  Caucasian    Male    9                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital  \
0                         3                    4                12   
1                         3                    7                 7   
2                         6                    7                10   
3                         1                    7                 3   
4                         3                    7                 6   

        medical_specialty  ...  citoglipton  insulin  glyburide-metformin  \
0        InternalMedicine  ...    

## Step 2: Handle missing values

In [None]:
 # Step 2: Handle missing values
# Filling missing values with mean for numerical columns and mode for categorical columns
# Impute column by column: most frequent value for text (object) columns,
# arithmetic mean for every other column.
#
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on data[column]: the in-place form acts on an intermediate
# object, which pandas 2.x flags with a FutureWarning and which no longer updates
# `data` once Copy-on-Write is enabled.
for column in data.columns:
    # Columns without nulls are left untouched (also keeps integer dtypes intact).
    if not data[column].isnull().any():
        continue

    if data[column].dtype == 'object':
        most_frequent = data[column].mode()
        # mode() is empty when the column has no non-null values at all;
        # there is nothing sensible to fill with in that case, so skip it
        # instead of failing on the [0] lookup.
        if most_frequent.empty:
            continue
        data[column] = data[column].fillna(most_frequent.iloc[0])
    else:
        data[column] = data[column].fillna(data[column].mean())

print("\nMissing Values After Imputation:")
print(data.isnull().sum())


Missing Values After Imputation:
encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglit


## Step 3: Encode categorical variables

In [None]:

# Step 3: Encode categorical variables
# Every object-dtype column is replaced by integer codes from a LabelEncoder.
categorical_cols = data.select_dtypes(include=['object']).columns

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its classes_ on each iteration, leaving no way to map the integer
# codes of earlier columns back to their original labels, e.g.
# label_encoders['race'].inverse_transform([2]).
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

print("\nDataset After Encoding:")
print(data.head())


Dataset After Encoding:
   encounter_id  patient_nbr  race  gender  age  admission_type_id  \
0         15738     63555939     2       0   10                  3   
1         40926     85504905     2       0    5                  1   
2         42570     77586282     2       1    9                  1   
3         84222    108662661     2       0    6                  1   
4        236316     40523301     2       1    9                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital  \
0                         3                    4                12   
1                         3                    7                 7   
2                         6                    7                10   
3                         1                    7                 3   
4                         3                    7                 6   

   medical_specialty  ...  citoglipton  insulin  glyburide-metformin  \
0                 18  ...            0        2              

## Step 4: Feature-target split

In [None]:

# Separate the prediction target from the feature matrix.
target_column = 'insulin'
if target_column not in data.columns:
    raise ValueError(f"The target column '{target_column}' is not in the dataset.")

# Record identifiers are bookkeeping keys, not patient characteristics; leaving them
# in the feature matrix lets the forest split on arbitrary ID ranges that will not
# generalise to unseen encounters.
identifier_columns = ['encounter_id', 'patient_nbr']

# errors='ignore' keeps the cell working when an identifier column is absent;
# the target's presence has already been verified above.
X = data.drop(columns=[target_column, *identifier_columns], errors='ignore')  # Features
y = data[target_column]  # Target

In [None]:
# Display the feature matrix X as the cell's output (pandas abbreviates long frames).
X

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,examide,citoglipton,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,15738,63555939,2,0,10,3,3,4,12,18,...,0,0,1,0,0,0,0,0,1,2
1,40926,85504905,2,0,5,1,3,7,7,11,...,0,0,1,0,0,0,0,0,1,0
2,42570,77586282,2,1,9,1,6,7,10,11,...,0,0,1,0,0,0,0,1,1,2
3,84222,108662661,2,0,6,1,1,7,3,3,...,0,0,1,0,0,0,0,1,1,2
4,236316,40523301,2,1,9,1,3,7,6,3,...,0,0,1,0,0,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48916,443739044,106595208,2,1,8,2,6,7,6,8,...,0,0,1,0,0,0,0,0,1,2
48917,443793668,47293812,2,1,9,1,13,7,3,8,...,0,0,1,0,0,0,0,0,1,2
48918,443804570,33230016,2,0,8,1,22,7,8,18,...,0,0,1,0,0,0,0,1,1,1
48919,443816024,106392411,2,0,8,3,6,1,3,27,...,0,0,1,0,0,0,0,0,1,2


In [None]:
# Display the target Series y as the cell's output (pandas abbreviates long Series).
y

0        2
1        1
2        2
3        0
4        0
        ..
48916    3
48917    1
48918    2
48919    2
48920    3
Name: insulin, Length: 48921, dtype: int64

## Step 5: Split data into training and testing sets

In [None]:
# Step 5: Split data into training and testing sets
# 80/20 split with a fixed seed for reproducibility. stratify=y keeps the class
# proportions of the (imbalanced, multi-class) target the same in both partitions,
# so minority classes are not under- or over-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the shapes of the four partitions produced by the train/test split.
print(
    "\nTraining and Testing Set Sizes:",
    f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}",
    sep="\n",
)



Training and Testing Set Sizes:
X_train: (39136, 45), X_test: (9785, 45), y_train: (39136,), y_test: (9785,)


## Step 6: Train a Random Forest model

In [None]:
# Step 6: Train a Random Forest model
# random_state fixes the seed so the same trees are built on every run.
# n_jobs=-1 builds the trees in parallel on all available cores; it only affects
# how the work is scheduled, not which trees are grown.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

## Step 7: Evaluate the model

In [None]:
# Step 7: Evaluate the model
# Predict labels for the test features, then compute the per-class report
# and the overall accuracy before printing them.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Classification Report:")
print(report_text)
print(f"Accuracy Score: {test_accuracy}")


Model Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      4471
           1       0.50      0.51      0.50      1251
           2       0.75      0.85      0.80      2977
           3       0.45      0.31      0.36      1086

    accuracy                           0.78      9785
   macro avg       0.66      0.65      0.65      9785
weighted avg       0.77      0.78      0.77      9785

Accuracy Score: 0.7750638732754216


# The Random Forest model achieves an accuracy of 77.5% on the held-out test set

Key Metrics

Precision: For each class, the fraction of the samples predicted as that class that actually belong to it.

High for class 0 (93%) and class 2 (75%).

Lower for class 1 (50%) and class 3 (45%).

Recall: For each class, the fraction of the samples that actually belong to that class that the model correctly predicted.

    
