# Hospital Readmission Risk Prediction Model

Predicting 30-day hospital readmission risk using machine learning (Random Forest + SMOTE).

## 1. Setup & Data Loading

In [None]:
pip install ucimlrepo

## 2. Data Loading & Exploration

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 296 in the UCI repository.
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Input columns and outcome column(s), exposed by the client as pandas dataframes.
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# Dataset-level metadata first, then the per-variable description table.
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
print(diabetes_130_us_hospitals_for_years_1999_2008.variables)

In [None]:
import pandas as pd

# 'features' holds the input columns, 'targets' holds the readmission outcome.
df_features = diabetes_130_us_hospitals_for_years_1999_2008.data.features
df_target = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# Place the two side by side (column-wise) in a single working frame, df.
df = pd.concat(objs=[df_features, df_target], axis="columns")

In [None]:
# Sanity check on the combined frame: confirmation message, dimensions, first rows.
print("✅ DataFrame successfully created!")
print("Shape:", df.shape)
print(df.head(n=5))

In [None]:
# Show the distinct raw values of the 'age' column (at most the first 20).
print(df.loc[:, 'age'].unique()[:20])

In [None]:
import pandas as pd

# Dimensions of the combined frame.
print(df.shape)

# Sample rows. A bare expression in the middle of a cell is evaluated and then
# discarded, so display() is required for this preview to actually render.
display(df.head())

# Data types and non-null counts per column (info() prints its report directly).
df.info()

# Summary statistics of the numeric columns; as the last expression of the
# cell this is rendered automatically.
df.describe()

## 3. Feature Engineering & Preprocessing

In [None]:
# Columns kept for modelling: age, length of stay, procedure/medication counts,
# prior visit counts, and the raw readmission label.
selected_cols = [
    'age', 'time_in_hospital', 'num_lab_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'readmitted'
]

# .copy() makes df an independent frame rather than a slice of the full
# dataset, so the column assignments in the following cells modify df itself
# and do not raise pandas' SettingWithCopyWarning.
df = df[selected_cols].copy()

In [None]:
# Binary target derived from 'readmitted': 1 for the '<30' label, 0 for every other value.
df['readmitted_flag'] = [1 if label == '<30' else 0 for label in df['readmitted']]

In [None]:
import numbers


def convert_age_range(age_str):
    """Convert an age-bracket label such as '[70-80)' to its numeric midpoint.

    Args:
        age_str: A bracket label of the form '[low-high)' (surrounding
            brackets, parentheses and spaces are ignored), or a value that
            is already numeric.

    Returns:
        The midpoint ``(low + high) / 2`` as a float for a bracket string,
        the value itself as a float when it is already numeric, and None for
        anything else (including strings without a '-').

    Raises:
        ValueError: If a string containing '-' does not split into exactly
            two integer parts.
    """
    # Pass already-converted values through. Without this, re-running the cell
    # maps every (now float) age to None, and the later dropna() would then
    # discard every row.
    if isinstance(age_str, numbers.Real) and not isinstance(age_str, bool):
        return float(age_str)

    if isinstance(age_str, str):
        # Remove all brackets and spaces
        clean_str = age_str.strip('[]() ').replace(' ', '')
        if '-' in clean_str:
            low, high = clean_str.split('-')
            return (int(low) + int(high)) / 2
    return None


# Replace the bracket labels with midpoints and store them as floats
# (None results become NaN under the float cast).
df['age'] = df['age'].apply(convert_age_range)
df['age'] = df['age'].astype(float)

In [None]:
# Verify the converted 'age' column: first rows as a one-column frame, then summary statistics.
print(df[['age']].head(n=5))
print(df['age'].describe())

In [None]:
# Discard every row that has a missing value in any of the remaining columns.
df = df.dropna(axis=0, how='any')

In [None]:
import seaborn as snsimport matplotlib.pyplot as plt

In [None]:
# 1️⃣ Check class balance
plt.figure(figsize=(5, 4))
balance_ax = sns.countplot(data=df, x='readmitted_flag', palette='viridis')
balance_ax.set_title('Readmission (<30 days) Distribution')
balance_ax.set_xlabel('Readmitted (1 = Yes, 0 = No)')
balance_ax.set_ylabel('Count')
plt.show()

In [None]:
# 2️⃣ Quick summary stats
# Plain-text summary statistics of the working frame.
print(df.describe())

In [None]:
# 3️⃣ Pairplot (optional – helps visualize feature relationships)
sample_cols = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_medications',
    'readmitted_flag',
]

# Pairwise relationships between the sampled columns, coloured by the binary target.
sns.pairplot(df.loc[:, sample_cols], hue='readmitted_flag', palette='husl')
plt.show()

In [None]:
# Columns left out of the correlation matrix: the raw 'readmitted' label
# always, plus the original 'age' column when an 'age_numeric' column exists.
cols_to_drop = ['readmitted']
if 'age_numeric' in df.columns:
    cols_to_drop.append('age')
df_for_corr = df.drop(columns=cols_to_drop, errors='ignore')

# Fallback: when there is no 'age_numeric' column and 'age' still has object
# dtype, map the bracket labels to their midpoints in a new 'age_numeric' column.
age_is_unconverted = 'age' in df.columns and df['age'].dtype == 'object'
if age_is_unconverted and 'age_numeric' not in df_for_corr.columns:
    age_mapping = {
        '[0-10)': 5,
        '[10-20)': 15,
        '[20-30)': 25,
        '[30-40)': 35,
        '[40-50)': 45,
        '[50-60)': 55,
        '[60-70)': 65,
        '[70-80)': 75,
        '[80-90)': 85,
        '[90-100)': 95,
    }
    df_for_corr['age_numeric'] = df['age'].map(age_mapping)

# Annotated heatmap of the correlations between the numeric columns.
plt.figure(figsize=(8, 6))
sns.heatmap(
    df_for_corr.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Feature Correlations')
plt.show()

In [None]:
# Identify categorical columns (object or category dtype) by name.
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
print("Categorical columns:", categorical_cols)

In [None]:
# One-hot encode the categorical columns (first level of each dropped), then
# remove the dummy columns derived from 'readmitted' if they are present:
# the binary 'readmitted_flag' column is the modelling target instead.
df_encoded = (
    pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    .drop(columns=['readmitted_>30', 'readmitted_NO'], errors='ignore')
)

print("✅ Encoding complete!")
print("New dataset shape:", df_encoded.shape)

# Preview of the encoded frame (rendered as the cell output).
df_encoded.head()

In [None]:
# Print column names, dtypes and non-null counts of the encoded frame.
df_encoded.info()

## 4. Feature/Target Split & Class Balance

In [None]:
# Target vector: the binary readmission flag.
y = df_encoded['readmitted_flag']

# Feature matrix: every encoded column except the target.
X = df_encoded.drop(columns=['readmitted_flag'])

# Display the shapes of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count plot of the target variable.
target_ax = sns.countplot(x='readmitted_flag', data=df)
target_ax.set_title("Readmission (<30 days) Distribution")
target_ax.set_xlabel("Readmitted within 30 days")
target_ax.set_ylabel("Count")
plt.show()

# Share of each class, expressed as a percentage.
class_distribution = df['readmitted_flag'].value_counts(normalize=True) * 100
print(class_distribution)

## 5. Model Training (Logistic Regression baseline)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each split.
for shape_label, split_part in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(shape_label, split_part.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline: logistic regression with class_weight='balanced'.
log_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
log_model.fit(X_train, y_train)

# Predictions on the held-out split.
y_pred_log = log_model.predict(X_test)

# Evaluation: per-class report followed by overall accuracy.
print("🔹 Logistic Regression with Balanced Class Weights")
print(classification_report(y_test, y_pred_log))
print("Accuracy:", accuracy_score(y_test, y_pred_log))

## 6. Model Training (Random Forest with SMOTE)

In [None]:
# SMOTE is provided by imbalanced-learn. The notebook never imported it, so
# this cell raised NameError on a fresh kernel.
from imblearn.over_sampling import SMOTE

# Resample the training split only; X_test / y_test are not passed to SMOTE.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_resampled.value_counts())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)

# Train on the SMOTE-resampled training data.
rf_model.fit(X_resampled, y_resampled)

In [None]:
# Score the random forest on the held-out test split and print the per-class report.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## 7. Model Evaluation

In [None]:
import joblib

# Persist the fitted random forest to disk in the working directory.
joblib.dump(value=rf_model, filename='rf_readmission_smote.pkl')
print("✅ New balanced model saved as rf_readmission_smote.pkl")

In [None]:
# Predicted probabilities for the test rows; column index 1 is kept.
probs = rf_model.predict_proba(X_test)[:, 1]

# Distribution summary (count, mean, quartiles, ...) of those probabilities.
print(pd.Series(data=probs).describe())

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Two side-by-side panels: logistic regression on the left, random forest on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
panel_specs = [
    (log_model, "Logistic Regression (Balanced)"),
    (rf_model, "Random Forest (Balanced)"),
]
for panel_ax, (estimator, panel_title) in zip(axes, panel_specs):
    # Confusion matrix of this estimator on the held-out test split.
    ConfusionMatrixDisplay.from_estimator(estimator, X_test, y_test, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Extract feature importances from the random forest, labelled by training
# column name and ordered from most to least important.
raw_importance = pd.Series(rf_model.feature_importances_, index=X_train.columns)
feature_importance = raw_importance.sort_values(ascending=False)

# Display top 10 important features as a horizontal bar chart.
plt.figure(figsize=(8, 6))
sns.barplot(
    x=feature_importance[:10],
    y=feature_importance.index[:10],
    palette="viridis",
)
plt.title("Top 10 Important Features for Readmission Prediction")
plt.xlabel("Feature Importance Score")
plt.ylabel("Feature")
plt.show()

## 8. Feature Importance Analysis

In [None]:
import numpy as np

# Extract the logistic-regression coefficients (first row of coef_), labelled
# by training column name and sorted from largest to smallest.
coeff = pd.Series(log_model.coef_[0], index=X_train.columns).sort_values(ascending=False)

# Largest coefficients: the first ten entries of the sorted series.
plt.figure(figsize=(8, 6))
sns.barplot(x=coeff[:10], y=coeff.index[:10], color="seagreen")
plt.title("Top 10 Predictors Increasing Readmission Risk")
plt.show()

# Smallest coefficients: the last ten entries of the sorted series.
plt.figure(figsize=(8, 6))
sns.barplot(x=coeff[-10:], y=coeff.index[-10:], color="salmon")
plt.title("Top 10 Predictors Decreasing Readmission Risk")
plt.show()

In [None]:
# Streamlit front end for the readmission model.
# NOTE(review): Streamlit apps are normally launched with `streamlit run <file>.py`;
# consider moving this cell into its own script — confirm the intended deployment.
import streamlit as st
import pandas as pd
import joblib

# Load the trained Random Forest model.
# This points at the artifact written by the earlier joblib.dump cell
# ('rf_readmission_smote.pkl'). The previous path, 'rf_readmission_balanced.pkl',
# is only created by a cell that runs after this one, so a fresh top-to-bottom
# run failed with FileNotFoundError.
# joblib files are pickle-based: only load model files you produced yourself.
rf_model = joblib.load('rf_readmission_smote.pkl')

# App title and description
st.set_page_config(page_title="Patient Readmission Predictor", page_icon="🏥", layout="wide")
st.title("🏥 Patient Readmission Risk Predictor")
st.markdown(
    """
    This app predicts the likelihood of a patient being **readmitted within 30 days** based on hospital data.
    Use the controls in the sidebar to input patient details and see the predicted risk instantly.
    """
)

# Sidebar inputs: slider arguments are (label, min, max, default).
st.sidebar.header("🩺 Patient Information")
time_in_hospital = st.sidebar.slider("⏱️ Time in Hospital (days)", 1, 14, 5)
num_lab_procedures = st.sidebar.slider("🧪 Number of Lab Procedures", 1, 100, 40)
num_medications = st.sidebar.slider("💊 Number of Medications", 1, 50, 10)
num_inpatient = st.sidebar.slider("🏥 Prior Inpatient Visits", 0, 10, 1)
num_emergency = st.sidebar.slider("🚨 Prior Emergency Visits", 0, 10, 0)
age = st.sidebar.slider("👶 Age (years)", 20, 90, 60)

# Create DataFrame for model input (single row; outpatient visits fixed at 0
# because the sidebar has no control for them).
input_data = pd.DataFrame({
    'time_in_hospital': [time_in_hospital],
    'num_lab_procedures': [num_lab_procedures],
    'num_medications': [num_medications],
    'number_inpatient': [num_inpatient],
    'number_emergency': [num_emergency],
    'number_outpatient': [0],
    'age': [age]
})

# Add any column the model was trained on but the form does not provide,
# then order the columns exactly as the model expects them.
for col in rf_model.feature_names_in_:
    if col not in input_data.columns:
        input_data[col] = 0  # default fill
input_data = input_data[rf_model.feature_names_in_]

# Predict button
if st.sidebar.button("🔍 Predict Readmission Risk"):
    prediction = rf_model.predict(input_data)[0]
    probability = rf_model.predict_proba(input_data)[0][1]

    st.subheader("📊 Prediction Result")
    st.markdown("---")

    # Confidence bar (st.progress receives the probability as an integer percentage)
    st.write("### Risk Probability")
    st.progress(int(probability * 100))

    # Display results
    if prediction == 1:
        st.error(f"🚨 **High Risk of Readmission!** ({probability:.1%} probability)")
        st.markdown("Patients in this category may benefit from **closer post-discharge monitoring** and **early follow-up visits.**")
    else:
        st.success(f"✅ **Low Risk of Readmission.** ({probability:.1%} probability)")
        st.markdown("This patient is **unlikely to be readmitted** in the next 30 days under current conditions.")

    st.markdown("---")
    st.caption("Model: Random Forest Classifier | Built by Dr. Uthman Babatunde")
else:
    st.info("👈 Adjust the parameters on the left and click **Predict Readmission Risk** to begin.")

In [None]:
import joblib

# Write the current rf_model object to 'rf_readmission_balanced.pkl'
# (the returned list of written filenames is shown as the cell output).
joblib.dump(value=rf_model, filename='rf_readmission_balanced.pkl')