## **Imports** 

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
diab_df=pd.read_csv('/Users/vladandreichuk/Desktop/Comparative-Analysis-of-ML-Algorithms-Predicting-Hospital-Readmission-of-Diabetes-Patients/CP_Hospital_Readmission/diab_df_filtered.csv')

In [6]:
diab_df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,outliers,outpatient,emergency
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,NO,False,0,0
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,Up,No,Ch,Yes,>30,False,0,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,Yes,NO,False,1,0
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,Up,No,Ch,Yes,NO,False,0,0
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,Steady,No,Ch,Yes,NO,False,0,0


In [27]:
diab_df['race'].value_counts()

race
Caucasian          78372
AfricanAmerican    19210
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

In [22]:
diab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      101766 non-null  object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   admission_type_id         101766 non-null  int64 
 4   discharge_disposition_id  101766 non-null  int64 
 5   admission_source_id       101766 non-null  int64 
 6   time_in_hospital          101766 non-null  int64 
 7   num_lab_procedures        101766 non-null  int64 
 8   num_procedures            101766 non-null  int64 
 9   num_medications           101766 non-null  int64 
 10  number_inpatient          101766 non-null  int64 
 11  diag_1                    101766 non-null  object
 12  diag_2                    101766 non-null  object
 13  diag_3                    101766 non-null  object
 14  numb

## **Logistic Regression** 

In [11]:
# Separate the target variable from the features
X = diab_df.drop(columns=['readmitted'])  # Features
y = diab_df['readmitted']  # Target variable

# Format all fields as string
#X_str = X.astype(str)

# Encode input data
def encode_inputs(X):
    oe = OrdinalEncoder()
    oe.fit(X)
    X_enc = oe.transform(X)
    return X_enc


# Encode target variable
def encode_targets(y):
    le = LabelEncoder()
    le.fit(y)
    y_enc = le.transform(y)
    return y_enc

# Transform X
X_enc = encode_inputs(X)

# Transform y
y_enc = encode_targets(y)

In [47]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_enc)


# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_enc, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Set up a grid of hyperparameters to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': [None, 'l2'],  # Regularization type
    'solver': ['newton-cg', 'sag', 'saga','lbfgs'],
}

# Perform grid search with stratified 5-fold cross-validation
cv = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = LogisticRegression(**best_params, max_iter=1000)


best_model.fit(X_train, y_train)



Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None wil

KeyboardInterrupt: 

In [40]:
best_params 

{'C': 0.001, 'penalty': 'l2', 'solver': 'newton-cg'}

In [43]:
# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Get class names
class_names = best_model.classes_.astype(str)

# Display the classification report
report = classification_report(y_test, y_pred, target_names=class_names)

print(report)

              precision    recall  f1-score   support

           0       0.33      0.01      0.01      2285
           1       0.50      0.25      0.33      7117
           2       0.59      0.90      0.71     10952

    accuracy                           0.57     20354
   macro avg       0.47      0.38      0.35     20354
weighted avg       0.53      0.57      0.50     20354



In [44]:
# Predict on the validation set
#y_val_pred = best_model.predict(X_val)

# Predict probabilities for ROC-AUC
#y_val_prob = best_model.predict_proba(X_val)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


In [None]:
#roc_auc = roc_auc_score(y_val, y_val_prob, multi_class='ovr', average='macro')

# Initialize variables to store fpr, tpr, and roc_auc for each class
n_classes = y_val_prob.shape[1]
fpr = {}
tpr = {}
roc_auc = {}

# Calculate ROC-AUC for each class
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val, y_val_prob[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

In [45]:
# Print the evaluation metrics and best hyperparameters
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
# Print macro average ROC-AUC
#print("Macro Average ROC-AUC:", roc_auc)

Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 0.5698634175100717
Precision: 0.472497924758749
Recall: 0.3834248199628278
F1 Score: 0.35025229116989776
