# Part D cleaning the dataset

## Step 1 Load the dataset

In [9]:
import pandas as pd

In [11]:
df = pd.read_csv('medical_clean.csv')
df


Unnamed: 0,CaseOrder,Customer_id,Interaction,UID,City,State,County,Zip,Lat,Lng,...,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
0,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932,Eva,AL,Morgan,35621,34.34960,-86.72508,...,3726.702860,17939.403420,3,3,2,2,4,3,3,4
1,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195,Marianna,FL,Jackson,32446,30.84513,-85.22907,...,4193.190458,17612.998120,3,4,3,4,4,4,3,3
2,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9,Sioux Falls,SD,Minnehaha,57110,43.54321,-96.63772,...,2434.234222,17505.192460,2,4,4,4,3,4,3,3
3,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07,New Richland,MN,Waseca,56072,43.89744,-93.51479,...,2127.830423,12993.437350,3,5,5,3,4,5,5,5
4,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a,West Point,VA,King William,23181,37.59894,-76.88958,...,2113.073274,3716.525786,2,1,3,3,5,3,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,B863060,a25b594d-0328-486f-a9b9-0567eb0f9723,39184dc28cc038871912ccc4500049e5,Norlina,NC,Warren,27563,36.42886,-78.23716,...,6850.942000,8927.642000,3,2,2,3,4,3,4,2
9996,9997,P712040,70711574-f7b1-4a17-b15f-48c54564b70f,3cd124ccd43147404292e883bf9ec55c,Milmay,NJ,Atlantic,8340,39.43609,-74.87302,...,7741.690000,28507.150000,3,3,4,2,5,3,4,4
9997,9998,R778890,1d79569d-8e0f-4180-a207-d67ee4527d26,41b770aeee97a5b9e7f69c906a8119d7,Southside,TN,Montgomery,37171,36.36655,-87.29988,...,8276.481000,15281.210000,3,3,3,4,4,2,3,2
9998,9999,E344109,f5a68e69-2a60-409b-a92f-ac0847b27db0,2bb491ef5b1beb1fed758cc6885c167a,Quinn,SD,Pennington,57775,44.10354,-102.01590,...,7644.483000,7781.678000,5,5,3,4,4,3,4,3


## Step 2: Keep Only Selected Variables 

In [15]:
df = df[['ReAdmis', 'Age', 'Gender', 'Income', 'HighBlood', 'Diabetes',
         'Initial_days', 'TotalCharge', 'Initial_admin', 'Complication_risk']]

## Step 3: Encode the target variable

In [17]:
df['ReAdmis'] = df['ReAdmis'].map({'Yes': 1, 'No': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ReAdmis'] = df['ReAdmis'].map({'Yes': 1, 'No': 0})


## Step 4: Encode categorical variables

In [19]:
categorical_cols = df.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

## Step 5: Handle missing values

In [21]:
df_encoded = df_encoded.dropna()

## Step 6: Split Features and Target

In [23]:
X = df_encoded.drop('ReAdmis', axis=1)
y = df_encoded['ReAdmis']

## Step 7: Save Cleaned Dataset

In [25]:
df_encoded.to_csv('cleaned_medical_data.csv', index=False)

# Part E Perform the data analysis and report 

## E1: Split the Data

In [29]:
from sklearn.model_selection import train_test_split

# Initial train/test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split temp into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save to CSV files
X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_val.to_csv('y_val.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

## E2: Create an  Initial Model and Evaluate Metrics

In [33]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Train initial model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9773333333333334
Precision: 0.9709618874773139
Recall: 0.9674502712477396
F1 Score: 0.9692028985507246
AUC-ROC: 0.9983520816664789
Confusion Matrix:
 [[931  16]
 [ 18 535]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## E3. Hyperparameter Tuning with k-Fold Cross Validation

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1]
}

grid = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    verbose=1,
    n_jobs=-1
)

grid.fit(X_val, y_val)

print("Best Parameters:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}


## E4. Evaluate Optimized Model on Test Set

In [37]:
y_test_pred = best_model.predict(X_test)

# Metrics
acc_opt = accuracy_score(y_test, y_test_pred)
prec_opt = precision_score(y_test, y_test_pred)
rec_opt = recall_score(y_test, y_test_pred)
f1_opt = f1_score(y_test, y_test_pred)
auc_opt = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
conf_matrix_opt = confusion_matrix(y_test, y_test_pred)

print("Optimized Model Accuracy:", acc_opt)
print("Optimized Model Precision:", prec_opt)
print("Optimized Model Recall:", rec_opt)
print("Optimized Model F1 Score:", f1_opt)
print("Optimized Model AUC-ROC:", auc_opt)
print("Optimized Model Confusion Matrix:\n", conf_matrix_opt)

Optimized Model Accuracy: 0.976
Optimized Model Precision: 0.9886578449905482
Optimized Model Recall: 0.945750452079566
Optimized Model F1 Score: 0.966728280961183
Optimized Model AUC-ROC: 0.9966984347640115
Optimized Model Confusion Matrix:
 [[941   6]
 [ 30 523]]
