In [2]:
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [3]:
import pickle
with open('/content/us_130_dfss.pkl', 'rb') as f:
    df= pickle.load(f)

print("DataFrame loaded:")


DataFrame loaded:


## 1. Load and preprocess dataset

In [4]:
df.columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'total_medications', 'service_utilization',
       'num_medications|time_in_hospital', 'num_medications|num_procedures',
       'time_in_hospital|num_lab_procedures',
       'num_medications|num_lab_procedures',
       'num_medications|number_diagnoses', 'age|number_diagnoses',
       'change|num_medications', 'number_diagnoses|time_in_hospital',
       'gender_1

In [5]:
df.loc[0]

Unnamed: 0,0
age,15
time_in_hospital,3
num_lab_procedures,59
num_procedures,0
num_medications,18
...,...
diag_1_category_Others,1
diag_1_category_Respiratory,0
AfricanAmerican,0
Caucasian,1


In [6]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

scaler = StandardScaler()
numeric_cols = ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses','total_medications',
       'service_utilization','num_medications|time_in_hospital', 'num_medications|num_procedures',
       'time_in_hospital|num_lab_procedures',
       'num_medications|num_lab_procedures',
       'num_medications|number_diagnoses', 'age|number_diagnoses',
       'change|num_medications', 'number_diagnoses|time_in_hospital']

# Fit and transform numeric columns
scaled_data = scaler.fit_transform(df[numeric_cols])

# Compute z-scores (scaled data already has mean=0, std=1)
z_scores = np.abs(scaled_data)

# Keep rows where all features have z-score <= 3
mask = (z_scores <= 3).all(axis=1)

df = df.loc[mask].copy()
df[numeric_cols] = scaled_data[mask]

In [7]:
df.head().dtypes

Unnamed: 0,0
age,float64
time_in_hospital,float64
num_lab_procedures,float64
num_procedures,float64
num_medications,float64
...,...
diag_1_category_Others,int64
diag_1_category_Respiratory,int64
AfricanAmerican,int64
Caucasian,int64


In [8]:
df.head().T

Unnamed: 0,1,2,3,4,5
age,-2.622071,-1.98036,-1.338649,-0.696937,-0.055226
time_in_hospital,-0.784109,-0.784109,-1.123345,-0.444872,-0.105636
num_lab_procedures,-1.60206,0.051835,0.402661,-0.5997,1.354904
num_procedures,2.026788,-0.24807,-0.816784,2.595502,-0.24807
num_medications,-0.335198,0.027969,-0.940477,0.027969,0.633248
...,...,...,...,...,...
diag_1_category_Others,1,1,0,0,0
diag_1_category_Respiratory,0,0,0,0,0
AfricanAmerican,1,0,0,0,0
Caucasian,0,1,1,1,1


In [9]:
class_counts = df['readmitted'].value_counts()
class_counts

Unnamed: 0_level_0,count
readmitted,Unnamed: 1_level_1
0,53082
1,4919


In [10]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [11]:
y = df['readmitted']
X = df.drop('readmitted', axis=1)
print(X.shape)
print(y.shape)

(58001, 73)
(58001,)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)


In [13]:
logit = LogisticRegression(fit_intercept=True, penalty='l1',solver='liblinear')
logit.fit(X_train, y_train)


In [14]:
logit_pred = logit.predict(X_test)
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(logit_pred, name = 'Predict'), margins = True)

Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1448,1448
1,139,139
All,1587,1587


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))

Accuracy is 0.92
Precision is 0.00
Recall is 0.00


In [18]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine back into one balanced DataFrame
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                         pd.Series(y_resampled, name='readmitted')], axis=1)

# Shuffle before splitting
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
df_balanced.shape

(106164, 74)