<a href="https://colab.research.google.com/github/VishalKumar196/Health-Risk-Assessment/blob/main/Health_Risk_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# ------------------------------
# Load Data
# ------------------------------
def load_data(path: str) -> pd.DataFrame:
    """
    Load the healthcare dataset from an Excel or CSV file.

    The reader is chosen from the file extension, matched
    case-insensitively, so any ``.xlsx`` or ``.csv`` path is accepted
    rather than one specific upload location.

    Args:
        path (str): file path (.xlsx or .csv)

    Returns:
        pd.DataFrame: Loaded dataset

    Raises:
        ValueError: If the path ends in neither ``.xlsx`` nor ``.csv``.
    """
    # Compare on a lower-cased copy so "DATA.XLSX" and "data.Csv" also match;
    # the original path is still what gets passed to pandas.
    lowered = str(path).lower()
    if lowered.endswith(".xlsx"):
        df = pd.read_excel(path)
    elif lowered.endswith(".csv"):
        df = pd.read_csv(path)
    else:
        raise ValueError("File format not supported. Use .xlsx or .csv")

    print(f"‚úÖ Data Loaded: {df.shape[0]} rows, {df.shape[1]} columns")
    return df

# Demo: load the uploaded workbook and preview it.
if __name__ == "__main__":
    df = load_data(path="/content/data (1).xlsx")
    print(df.head(n=5))  # first five rows

✅ Data Loaded: 55500 rows, 15 columns
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Da

In [None]:
import pandas as pd

# ------------------------------
# Clean Data
# ------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the healthcare dataset and return the cleaned frame.

    Steps, in order:
    1. Standardize column names: strip, lower-case, spaces -> underscores.
    2. Drop exact duplicate rows.
    3. Fill missing values: median for numeric/datetime columns,
       "Unknown" for every other column.
    4. Normalize text columns: trimmed Title Case, except ``blood_type``,
       which is upper-cased so that "AB+" does not become "Ab+".

    Args:
        df (pd.DataFrame): Raw dataset. NOTE: its column labels are renamed
            in place by step 1; its rows are not modified.

    Returns:
        pd.DataFrame: A new, de-duplicated and cleaned DataFrame. Surviving
        rows keep their original index labels.
    """
    # 1. Standardize column names, e.g. " Blood Type" -> "blood_type".
    df.columns = (
        df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
    )

    # 2. Remove duplicates. The explicit copy gives an independent frame so
    #    the column assignments below do not emit SettingWithCopyWarning.
    df = df.drop_duplicates().copy()

    # 3. Handle missing values (columns without gaps are left untouched).
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue
        if (pd.api.types.is_numeric_dtype(series)
                or pd.api.types.is_datetime64_any_dtype(series)):
            df[col] = series.fillna(series.median())  # median for numbers/dates
        else:
            df[col] = series.fillna("Unknown")  # placeholder for categoricals

    # 4. Normalize text columns (Name, Gender, Medical Condition, ...).
    title_cols = ["name", "gender", "medical_condition",
                  "admission_type", "medication", "test_results"]
    for col in title_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.title()

    # Blood groups are upper-case codes; Title Case would corrupt "AB+".
    if "blood_type" in df.columns:
        df["blood_type"] = df["blood_type"].astype(str).str.strip().str.upper()

    print(f"‚úÖ Data Cleaned: {df.shape[0]} rows, {df.shape[1]} columns")
    return df

# Demo: reload the raw workbook, clean it, and preview the result.
if __name__ == "__main__":
    df = load_data(path="/content/data (1).xlsx")
    df_clean = clean_data(df=df)
    print(df_clean.head(n=5))

✅ Data Loaded: 55500 rows, 15 columns
✅ Data Cleaned: 54966 rows, 15 columns
            name  age  gender blood type medical condition date of admission  \
0  Bobby Jackson   30    Male         B-            Cancer        2024-01-31   
1   Leslie Terry   62    Male         A+           Obesity        2019-08-20   
2    Danny Smith   76  Female         A-           Obesity        2022-09-22   
3   Andrew Watts   28  Female         O+          Diabetes        2020-11-18   
4  Adrienne Bell   43  Female        AB+            Cancer        2022-09-19   

             doctor                    hospital insurance provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   billing amount

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("Unknown")  # Fill categorical with "Unknown"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].median())  # Fill numerical with median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].median())  # Fill numerical 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# ------------------------------
# Define target
# ------------------------------
# Work on a renamed copy so this cell behaves the same whether the loaded
# frame's labels look like "Test Results", "test results" or "test_results".
# `rename` returns a new frame, so `df` itself is not modified here.
prepared = df.rename(
    columns=lambda label: str(label).strip().lower().replace(" ", "_")
)

target_col = "test_results"   # Risk assessment target
excluded_cols = [target_col, "name", "doctor", "hospital", "discharge_date"]
X = prepared.drop(columns=excluded_cols)
y = prepared[target_col]

# ------------------------------
# Encode target (class labels -> integer codes)
# ------------------------------
le_target = LabelEncoder()
y = le_target.fit_transform(y)

# ------------------------------
# Encode categorical features
# ------------------------------
# Each fitted encoder is kept so the integer codes can be mapped back to
# the original category labels later.
categorical_cols = X.select_dtypes(include="object").columns
feature_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    feature_encoders[col] = le

# NOTE(review): "date_of_admission" is neither dropped nor encoded here; if
# it was parsed as a datetime column it is passed through unchanged, which
# most estimators will not accept -- confirm before fitting a model.

# Numeric columns at this point include the label-encoded categoricals.
numeric_cols = X.select_dtypes(include=np.number).columns

# ------------------------------
# Train-Test Split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------
# Scale numeric columns
# ------------------------------
# The scaler is fitted on the training rows only and then applied to the
# test rows, so test-set statistics never leak into preprocessing. `X`
# itself is left unscaled. The copies make the splits independent frames
# before columns are assigned.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("‚úÖ Data prepared:")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


✅ Data prepared:
Train shape: (44400, 10)
Test shape: (11100, 10)


In [None]:
# Tidy the column labels in one pass: trim surrounding whitespace, then
# turn the remaining spaces into underscores.
df.columns = df.columns.str.strip().str.replace(" ", "_")

print(df.columns)  # show the resulting labels


Index(['name', 'age', 'gender', 'blood_type', 'medical_condition',
       'date_of_admission', 'doctor', 'hospital', 'insurance_provider',
       'billing_amount', 'room_number', 'admission_type', 'discharge_date',
       'medication', 'test_results'],
      dtype='object')


In [None]:
# =========================================
# 📌 Health Risk Assessment - ML Pipeline
# =========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
import xgboost as xgb
import joblib
import os

# ------------------------------
# 1. Load dataset
# ------------------------------
# The source file is an .xlsx workbook, so read_excel (not read_csv) is used.
df = pd.read_excel("/content/data (1).xlsx")

print("‚úÖ Data Loaded:", df.shape)
print("üìå Original Columns:", df.columns.tolist())

# ------------------------------
# 2. Standardize column names
# ------------------------------
# "Test Results" -> "test_results", etc.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("‚úÖ Renamed Columns:", df.columns.tolist())

# Fail fast: without the target column nothing below is meaningful.
if "test_results" not in df.columns:
    raise ValueError("‚ùå 'test_results' column not found in dataset!")

# ------------------------------
# 3. Feature Engineering
# ------------------------------

# Convert dates; unparseable values become NaT instead of raising.
df["date_of_admission"] = pd.to_datetime(df["date_of_admission"], errors="coerce")
df["discharge_date"] = pd.to_datetime(df["discharge_date"], errors="coerce")

# Split each date into year / month / day components.
df["admission_year"] = df["date_of_admission"].dt.year
df["admission_month"] = df["date_of_admission"].dt.month
df["admission_day"] = df["date_of_admission"].dt.day
df["discharge_year"] = df["discharge_date"].dt.year
df["discharge_month"] = df["discharge_date"].dt.month
df["discharge_day"] = df["discharge_date"].dt.day

# Length of stay in whole days.
df["stay_length"] = (df["discharge_date"] - df["date_of_admission"]).dt.days

# Drop the name and the raw date columns (the dates are now represented by
# the derived features above).
drop_cols = ["name", "date_of_admission", "discharge_date"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# ------------------------------
# 4. Encode categorical features
# ------------------------------
# The target is excluded here on purpose: it gets its own encoder in step 5.
# Encoding it in this loop as well would leave `target_encoder` fitted on
# integer codes, unable to map predictions back to the class names.
cat_cols = df.select_dtypes(include=["object"]).columns.drop(
    "test_results", errors="ignore"
)

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# ------------------------------
# 5. Encode target (test_results)
# ------------------------------
# Fitted on the original labels (as strings, matching the feature columns),
# so `target_encoder.classes_` holds the real class names.
target_encoder = LabelEncoder()
df["test_results"] = target_encoder.fit_transform(df["test_results"].astype(str))

# Features (X) and Target (y)
X = df.drop("test_results", axis=1)
y = df["test_results"]

print("‚úÖ Features:", X.shape, " Target:", y.shape)

# ------------------------------
# 6. Train-test split
# ------------------------------
# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Train shape:", X_train.shape, " Test shape:", X_test.shape)

# ------------------------------
# 7. Train XGBoost Model
# ------------------------------
model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
    random_state=42
)

model.fit(X_train, y_train)

# ------------------------------
# 8. Evaluate Model
# ------------------------------
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
print("‚úÖ F1 Score:", f1_score(y_test, y_pred, average="weighted"))
print("‚úÖ Classification Report:\n", classification_report(y_test, y_pred))

# ROC-AUC is reported on a best-effort basis: a failure here is printed
# rather than aborting the run before the model is saved.
try:
    roc_score = roc_auc_score(y_test, y_prob, multi_class="ovr")
    print("‚úÖ ROC-AUC Score:", roc_score)
except Exception as e:
    print("‚ö†Ô∏è ROC-AUC not available:", e)

# ------------------------------
# 9. Save Model & Encoders
# ------------------------------
os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/health_risk_model.pkl")
joblib.dump(encoders, "model/feature_encoders.pkl")
joblib.dump(target_encoder, "model/target_encoder.pkl")

print("‚úÖ Model and encoders saved successfully!")


✅ Data Loaded: (55500, 15)
📌 Original Columns: ['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']
✅ Renamed Columns: ['name', 'age', 'gender', 'blood_type', 'medical_condition', 'date_of_admission', 'doctor', 'hospital', 'insurance_provider', 'billing_amount', 'room_number', 'admission_type', 'discharge_date', 'medication', 'test_results']
✅ Features: (55500, 18)  Target: (55500,)
✅ Train shape: (44400, 18)  Test shape: (11100, 18)
✅ Accuracy: 0.4137837837837838
✅ F1 Score: 0.4137075863541208
✅ Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.42      0.42      3726
           1       0.41      0.42      0.42      3671
           2       0.41      0.40      0.41      3703

    accuracy                           0.41     11100
   macro avg 