In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              AdaBoostRegressor, HistGradientBoostingRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

In [4]:
visit = pd.read_csv('visit.csv')
risk = pd.read_csv('risk.csv')
care = pd.read_csv('care.csv')
diagnosis = pd.read_csv('diagnosis.csv')
patient = pd.read_csv('patient.csv')

In [5]:
df = patient.copy()

In [6]:
df['patient_id'].duplicated().sum()

np.int64(0)

CARE TABLE


In [7]:
# Ensure date columns are datetime objects
date_columns = ['visit_start_dt', 'visit_end_dt', 'follow_up_dt']
for col in date_columns:
    if col in visit.columns:
        visit[col] = pd.to_datetime(visit[col], errors='coerce')

# Find the latest date across all relevant columns
latest_date_in_visit = visit[date_columns].max().max()

print(f"The latest date in the visit data is: {latest_date_in_visit}")
# Convert 'hot_spotter_identified_at' to datetime objects
df['hot_spotter_identified_at'] = pd.to_datetime(df['hot_spotter_identified_at'], errors='coerce')

# Calculate the time difference in days since the hotspot was identified
# Use the latest_date_in_visit found previously as the reference
df['time_since_hotspot_identified'] = (latest_date_in_visit - df['hot_spotter_identified_at']).dt.days

df.drop('hot_spotter_identified_at', axis=1 ,inplace=True )

The latest date in the visit data is: 2025-03-06 00:00:00


In [8]:
# Combine msrmnt_type and msrmnt_sub_type in the care table
care['msrmnt_type_subtype'] = care['msrmnt_type'] + '_' + care['msrmnt_sub_type']

# Group by patient_id and aggregate the combined string, handling potential NaNs
care_aggregated = care.groupby('patient_id')['msrmnt_type_subtype'].apply(lambda x: '_'.join(x.dropna().unique())).reset_index()

# Merge the aggregated care data with the df DataFrame
df = pd.merge(df, care_aggregated, on='patient_id', how='left')

# Display the updated df DataFrame
display(df.head())

Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype
0,291,56,f,f,5.0,
1,306,42,f,f,,
2,310,71,f,f,5.0,
3,315,49,f,f,,
4,318,25,f,f,,


In [9]:
# Group care by patient_id and aggregate care_gap_ind
care_gap_aggregated = care.groupby('patient_id')['care_gap_ind'].apply(lambda x: '_'.join(x.dropna().unique())).reset_index()

# Merge with df
df = pd.merge(df, care_gap_aggregated, on='patient_id', how='left')

# Display the updated df DataFrame
display(df.head())

Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind
0,291,56,f,f,5.0,,
1,306,42,f,f,,,
2,310,71,f,f,5.0,,
3,315,49,f,f,,,
4,318,25,f,f,,,


Diagnosis Table

In [10]:
# Get unique condition names from the diagnosis table
unique_conditions = diagnosis['condition_name'].unique()

# Create a new DataFrame with patient_id and a column of ones
diagnosis_binary = diagnosis[['patient_id', 'condition_name']].copy()
diagnosis_binary['has_condition'] = 1

# Pivot the table to get unique conditions as columns
diagnosis_pivot = diagnosis_binary.pivot_table(
    index='patient_id',
    columns='condition_name',
    values='has_condition',
    fill_value=0
).reset_index()

# Merge the new binary columns with the df DataFrame
df = pd.merge(df, diagnosis_pivot, on='patient_id', how='left')

# Fill NaN values (for patients not in diagnosis) with 0
for condition in unique_conditions:
    df[condition] = df[condition].fillna(0)

# ---- NEW CODE: Calculate chronic_ratio ----
df['chronic_ratio'] = df[unique_conditions].sum(axis=1) / len(unique_conditions)

# Display updated DataFrame
display(df.head())


Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,chronic_ratio
0,291,56,f,f,5.0,,,0.0,1.0,1.0,0.666667
1,306,42,f,f,,,,0.0,0.0,0.0,0.0
2,310,71,f,f,5.0,,,0.0,1.0,1.0,0.666667
3,315,49,f,f,,,,0.0,0.0,0.0,0.0
4,318,25,f,f,,,,0.0,0.0,0.0,0.0


VISIT TABLE

In [11]:
# --- Unique diagnosis values (drop NaN early)
unique_visit_diag = visit['prncpl_diag_nm'].dropna().unique()

# --- Build patient × diagnosis multi-hot (presence = 1)
visit_binary = (
    visit[['patient_id', 'prncpl_diag_nm']]
      .dropna(subset=['prncpl_diag_nm'])
      .assign(has_diag=1)
)

visit_pivot = (
    visit_binary
      .pivot_table(
          index='patient_id',
          columns='prncpl_diag_nm',
          values='has_diag',
          aggfunc='max',        # presence if ever seen
          fill_value=0
      )
      .rename_axis(None, axis=1)
)

# Optional: add a prefix to avoid collisions
visit_pivot = visit_pivot.add_prefix('dx_')

# Make it small: store as uint8 (or bool)
visit_pivot = visit_pivot.astype('uint8')

# --- Merge into your main df on patient_id
df = df.merge(visit_pivot.reset_index(), on='patient_id', how='left')

# Columns that were absent for some patients will be NaN after merge; fill them:
dx_cols = [c for c in df.columns if c.startswith('dx_')]
df[dx_cols] = df[dx_cols].fillna(0).astype('uint8')

# --- Ratios / totals based on distinct diagnoses present
df['visit_diag_ratio'] = df[dx_cols].sum(axis=1) / len(dx_cols)
df['total_visit_diagnoses'] = df[dx_cols].sum(axis=1)

# peek
display(df.head())


Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,"dx_Wedge compression fracture of T11-T12 vertebra, initial encounter for closed fracture","dx_Wedge compression fracture of first lumbar vertebra, initial encounter for closed fracture","dx_Wedge compression fracture of third lumbar vertebra, initial encounter for closed fracture",dx_Wheezing,dx_Xerosis cutis,dx_Zoster with other complications,dx_Zoster without complications,"dx_Zygomatic fracture, right side, initial encounter for closed fracture",visit_diag_ratio,total_visit_diagnoses
0,291,56,f,f,5.0,,,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0.0,0
1,306,42,f,f,,,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0
2,310,71,f,f,5.0,,,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0.000699,1
3,315,49,f,f,,,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0
4,318,25,f,f,,,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.002098,3


In [12]:
import pandas as pd
import re

# -----------------------------
# 1) BINNING RULES
# -----------------------------
rules = [
    ('Respiratory Infection', r'infection|pneumonia|bronchitis|pharyngitis|laryngitis|tracheitis|tonsillitis|sinusitis|nasopharyngitis|flu|influenza|covid'),
    ('Lower Respiratory', r'pneumonia|bronchitis|bronchiolitis|wheezing|asthma|copd'),
    ('Upper Respiratory', r'cough|cold|throat|upper respiratory|pharyngitis|laryngitis'),
    ('ENT', r'otitis|ear|sinus|sinusitis|epistaxis|rhinitis|cerumen|tonsil|throat|pharynx|larynx'),
    ('Musculoskeletal', r'strain|sprain|fracture|contusion|myalgia|arthritis|back pain|shoulder|wrist|knee|hip|ligament|muscle|joint|tendon|osteo|sciatica'),
    ('Injury/Wound', r'injury|wound|laceration|open wound|foreign body|bite|burn|abrasion|crush|trauma|contusion|dislocation|fracture|amputation'),
    ('Skin/Soft Tissue', r'cellulitis|abscess|furuncle|erythematous|dermatitis|rash|swelling|urticaria|cyst|ulcer|bite|laceration|wound|pruritus'),
    ('GU', r'cystitis|urinary|bladder|hematuria|pyelonephritis|prostatitis|incontinence|urethritis'),
    ('GI', r'abdominal|gastro|nausea|vomiting|epigastric|colitis|diarrhea|constipation|appendicitis|gastritis|pancreatitis|hepatitis|hernia|peritonitis|cholecystitis|gallbladder|bleeding|hemorrhoids|gerd|reflux|diverticulitis'),
    ('Neurology/Psych', r'headache|migraine|dizz|giddiness|vertigo|syncope|collapse|seizure|epilepsy|paralysis|stroke|tremor|neuro|disorder|depression|anxiety|mood|insomnia|sleep'),
    ('Cardiovascular', r'chest pain|palpitations|hypertension|tachy|arrhythmia|heart failure|infarction|angina|embolism|thrombosis|atherosclerosis|hypotension|stemi|nstemi'),
    ('Obstetric/Gyne', r'pregnancy|labor|childbirth|preterm|vaginitis|menstruation|miscarriage|abortion|perineal|postpartum|uterovaginal|ovarian|endometriosis|fetal|maternal care|gestational'),
    ('Endocrine/Metabolic', r'diabetes|thyroid|metabolic|nutritional|obesity|hypoglycemia|ketoacidosis|electrolyte|hypokalemia|hyperglycemia'),
    ('Eye', r'conjunctivitis|hordeolum|chalazion|stye|blepharitis|cataract|glaucoma|corneal|iridocyclitis|retinal'),
    ('Other Infection', r'viral|bacterial|abscess|sepsis|tuberculosis|mononucleosis|infection'),
    ('Allergy/Immune', r'allergy|urticaria|anaphylaxis|angioedema|immune|contact dermatitis'),
    ('Pain', r'pain'),
    ('Other', r'fever|malaise|fatigue|unspecified|other|abnormal|screening|observation|follow-up'),
]

def bin_diagnosis(text):
    if pd.isnull(text):
        return 'Other'
    t = text.lower()
    for cat, pat in rules:
        if re.search(pat, t):
            return cat
    return 'Other'

# -----------------------------
# 2) APPLY BINNING
# -----------------------------
visit['diag_bin'] = visit['prncpl_diag_nm'].apply(bin_diagnosis)

# -----------------------------
# 3) One-hot diag bins + visit types
# -----------------------------
diag_dummies = pd.get_dummies(visit['diag_bin'], prefix='diag', dtype=int)
visit_type_dummies = pd.get_dummies(visit['visit_type'], prefix='visit_type', dtype=int)

# -----------------------------
# 4) readmission mapping
# -----------------------------
def encode_flag(value):
    if pd.isna(value):
        return 0  # or -1 if you prefer
    return int(bool(value))

visit['readmsn_ind'] = visit['readmsn_ind'].apply(encode_flag)

# -----------------------------
# 5) Combine + aggregate
# -----------------------------
df_for_agg = pd.concat([visit[['patient_id', 'readmsn_ind']], visit_type_dummies, diag_dummies], axis=1)

# dynamic aggregations
aggregations = {
    **{c: (c, 'sum') for c in visit_type_dummies.columns},
    **{c: (c, 'sum') for c in diag_dummies.columns},
    'visit_count': ('readmsn_ind', 'count'),
    'total_readmissions': ('readmsn_ind', 'sum')
}

patient_summary = df_for_agg.groupby('patient_id').agg(**aggregations).reset_index()

# convert visit type to indicator
for c in visit_type_dummies.columns:
    patient_summary[c] = (patient_summary[c] > 0).astype(int)


In [13]:
df = df.merge(patient_summary, on='patient_id', how='left')

# -----------------------------
# ✅ FIX: fill NaNs with 0
# -----------------------------
fill_cols = [c for c in patient_summary.columns if c != 'patient_id']
df[fill_cols] = df[fill_cols].fillna(0).astype(int)


In [14]:
df.head()

Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,diag_Neurology/Psych,diag_Obstetric/Gyne,diag_Other,diag_Other Infection,diag_Pain,diag_Respiratory Infection,diag_Skin/Soft Tissue,diag_Upper Respiratory,visit_count,total_readmissions
0,291,56,f,f,5.0,,,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,306,42,f,f,,,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,310,71,f,f,5.0,,,0.0,1.0,1.0,...,0,0,0,0,0,1,0,0,1,1
3,315,49,f,f,,,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,318,25,f,f,,,,0.0,0.0,0.0,...,0,2,1,0,0,0,0,0,3,3


In [15]:
# build pivot as you already do ...
visit_type_binary = visit[['patient_id', 'visit_type']].copy()
visit_type_binary['has_visit_type'] = 1

visit_type_pivot = visit_type_binary.pivot_table(
    index='patient_id',
    columns='visit_type',
    values='has_visit_type',
    fill_value=0
).reset_index()

# prefix pivot columns
visit_type_cols_raw = [c for c in visit_type_pivot.columns if c != 'patient_id']
visit_type_pivot = visit_type_pivot.rename(columns={c: f"visit_type_{c}" for c in visit_type_cols_raw})
visit_type_cols = [f"visit_type_{c}" for c in visit_type_cols_raw]

# --- BEFORE MERGING: drop any old versions to avoid _x/_y suffixes ---
df = df.drop(columns=visit_type_cols + [
    'total_visits', 'min_diff_days', 'max_diff_days', 'mean_diff_days'
], errors='ignore')

# merges
df = df.merge(visit_type_pivot, on='patient_id', how='left')
df = df.merge(visit.groupby('patient_id').size().reset_index(name='total_visits'),
              on='patient_id', how='left')

visit = visit.copy()
for col in ['visit_end_dt', 'follow_up_dt']:
    if col in visit.columns:
        visit[col] = pd.to_datetime(visit[col], errors='coerce')
visit['diff_days'] = (
    visit['follow_up_dt'] - visit['visit_end_dt']
).dt.days if {'visit_end_dt','follow_up_dt'}.issubset(visit.columns) else pd.NA
diff_stats = visit.groupby('patient_id').agg(
    min_diff_days=('diff_days','min'),
    max_diff_days=('diff_days','max'),
    mean_diff_days=('diff_days','mean')
).reset_index()
df = df.merge(diff_stats, on='patient_id', how='left')

# fill/create visit_type_* columns safely
for col in visit_type_cols:
    if col not in df.columns:
        df[col] = 0
    else:
        df[col] = df[col].fillna(0)

df['total_visits'] = df['total_visits'].fillna(0).astype(int)

# ratio
num_visit_types = max(len(visit_type_cols), 1)
df['visit_type_ratio'] = df[visit_type_cols].sum(axis=1) / num_visit_types


In [16]:
# --- Count readmission 't' and 'f' per patient ---
readmission_stats = (
    visit.groupby(['patient_id', 'readmsn_ind'])
    .size()
    .unstack(fill_value=0)  # creates columns 'f' and 't'
    .reset_index()
)

# Ensure both columns exist (some patients may have only 't' or only 'f')
for col in ['t', 'f']:
    if col not in readmission_stats.columns:
        readmission_stats[col] = 0

# --- Merge only counts into df ---
df = df.merge(
    readmission_stats[['patient_id', 't', 'f']],
    on='patient_id',
    how='left'
)

# Rename columns for clarity
df = df.rename(columns={
    't': 'readmission_true_count',
    'f': 'readmission_false_count'
})

# Fill missing values with 0
df[['readmission_true_count', 'readmission_false_count']] = (
    df[['readmission_true_count', 'readmission_false_count']].fillna(0)
)

# --- Display updated DataFrame ---
display(df.head())


Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,visit_type_ER,visit_type_INPATIENT,visit_type_URGENT CARE,total_visits,min_diff_days,max_diff_days,mean_diff_days,visit_type_ratio,readmission_true_count,readmission_false_count
0,291,56,f,f,5.0,,,0.0,1.0,1.0,...,0.0,0.0,0.0,0,,,,0.0,0.0,0.0
1,306,42,f,f,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0,,,,0.0,0.0,0.0
2,310,71,f,f,5.0,,,0.0,1.0,1.0,...,0.0,1.0,0.0,1,6.0,6.0,6.0,0.333333,0.0,0.0
3,315,49,f,f,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0,,,,0.0,0.0,0.0
4,318,25,f,f,,,,0.0,0.0,0.0,...,1.0,1.0,1.0,3,4.0,4.0,4.0,1.0,0.0,0.0


In [17]:
# --- Count total follow_up_date entries per patient ---
follow_up_counts = (
    visit.groupby('patient_id')['follow_up_dt']
    .count()  # counts non-null follow_up_date values
    .reset_index(name='total_followups')
)

# --- Merge this count into df --
df = df.merge(
    follow_up_counts,
    on='patient_id',
    how='left'
)

# --- Fill missing values with 0 (for patients with no follow-ups) ---
df['total_followups'] = df['total_followups'].fillna(0).astype(int)

# --- Display updated DataFrame ---
display(df.head())


Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,visit_type_INPATIENT,visit_type_URGENT CARE,total_visits,min_diff_days,max_diff_days,mean_diff_days,visit_type_ratio,readmission_true_count,readmission_false_count,total_followups
0,291,56,f,f,5.0,,,0.0,1.0,1.0,...,0.0,0.0,0,,,,0.0,0.0,0.0,0
1,306,42,f,f,,,,0.0,0.0,0.0,...,0.0,0.0,0,,,,0.0,0.0,0.0,0
2,310,71,f,f,5.0,,,0.0,1.0,1.0,...,1.0,0.0,1,6.0,6.0,6.0,0.333333,0.0,0.0,1
3,315,49,f,f,,,,0.0,0.0,0.0,...,0.0,0.0,0,,,,0.0,0.0,0.0,0
4,318,25,f,f,,,,0.0,0.0,0.0,...,1.0,1.0,3,4.0,4.0,4.0,1.0,0.0,0.0,1


In [18]:
df['follow_up_ratio'] = df['total_followups']/df['total_visits']

In [19]:
df.columns

Index(['patient_id', 'age', 'hot_spotter_readmission_flag',
       'hot_spotter_chronic_flag', 'time_since_hotspot_identified',
       'msrmnt_type_subtype', 'care_gap_ind', 'CANCER', 'DIABETES',
       'HYPERTENSION',
       ...
       'visit_type_URGENT CARE', 'total_visits', 'min_diff_days',
       'max_diff_days', 'mean_diff_days', 'visit_type_ratio',
       'readmission_true_count', 'readmission_false_count', 'total_followups',
       'follow_up_ratio'],
      dtype='object', length=1475)

In [20]:
df

Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,visit_type_URGENT CARE,total_visits,min_diff_days,max_diff_days,mean_diff_days,visit_type_ratio,readmission_true_count,readmission_false_count,total_followups,follow_up_ratio
0,291,56,f,f,5.0,,,0.0,1.0,1.0,...,0.0,0,,,,0.000000,0.0,0.0,0,
1,306,42,f,f,,,,0.0,0.0,0.0,...,0.0,0,,,,0.000000,0.0,0.0,0,
2,310,71,f,f,5.0,,,0.0,1.0,1.0,...,0.0,1,6.0,6.0,6.0,0.333333,0.0,0.0,1,1.000000
3,315,49,f,f,,,,0.0,0.0,0.0,...,0.0,0,,,,0.000000,0.0,0.0,0,
4,318,25,f,f,,,,0.0,0.0,0.0,...,1.0,3,4.0,4.0,4.0,1.000000,0.0,0.0,1,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,82117,45,f,f,,,,0.0,0.0,0.0,...,1.0,1,,,,0.333333,0.0,0.0,0,0.000000
7996,82120,12,f,f,,,,0.0,0.0,0.0,...,0.0,0,,,,0.000000,0.0,0.0,0,
7997,82130,49,f,f,,,,0.0,0.0,0.0,...,1.0,1,,,,0.333333,0.0,0.0,0,0.000000
7998,82152,54,f,f,,,,0.0,0.0,0.0,...,0.0,0,,,,0.000000,0.0,0.0,0,


In [21]:
# Check for null values in each column and display columns with NaNs
nan_columns = df.columns[df.isnull().any()].tolist()
print("Columns with NaN values:")
print(nan_columns)

Columns with NaN values:
['time_since_hotspot_identified', 'msrmnt_type_subtype', 'care_gap_ind', 'min_diff_days', 'max_diff_days', 'mean_diff_days', 'follow_up_ratio']


In [22]:
# Identify the one-hot encoded visit type columns
visit_type_cols = [col for col in df.columns if col.startswith('visit_type_')]

# Calculate the weighted visits for each visit type
for col in visit_type_cols:
    df[f'{col}_weighted'] = df[col] * df['total_visits']

# Create a single column with the sum of weighted visits for each patient
weighted_visit_cols = [f'{col}_weighted' for col in visit_type_cols]
df['weighted_visits'] = df[weighted_visit_cols].sum(axis=1)

# Display the updated DataFrame with the new weighted columns and the total weighted_visits
display(df[['patient_id', 'total_visits'] + visit_type_cols + weighted_visit_cols + ['weighted_visits']].head())

Unnamed: 0,patient_id,total_visits,visit_type_ER,visit_type_INPATIENT,visit_type_URGENT CARE,visit_type_ratio,visit_type_ER_weighted,visit_type_INPATIENT_weighted,visit_type_URGENT CARE_weighted,visit_type_ratio_weighted,weighted_visits
0,291,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,306,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,310,1,0.0,1.0,0.0,0.333333,0.0,1.0,0.0,0.333333,1.333333
3,315,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,318,3,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,12.0


In [23]:
def handle_missing_values(df):
    """
    Intelligently handle missing values based on feature type and medical context
    """
    # Create flags for missing data (often meaningful in medical context)
    df['has_care_data'] = df['msrmnt_type_subtype'].notna().astype(int)
    df['has_care_gap_data'] = df['care_gap_ind'].notna().astype(int)

    # For numerical features, use median instead of 0
    numerical_cols = ['min_diff_days', 'max_diff_days', 'mean_diff_days',
                      'time_since_hotspot_identified', 'weighted_visits']
    for col in numerical_cols:
        if col in df.columns:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"  ✓ Filled {col} with median: {median_val:.2f}")

    # For follow_up_ratio, 0 is meaningful (no follow-up)
    if 'follow_up_ratio' in df.columns:
        df['follow_up_ratio'] = df['follow_up_ratio'].fillna(0)
        print(f"  ✓ Filled follow_up_ratio with 0")

    # Categorical - keep as is but add meaningful label
    if 'msrmnt_type_subtype' in df.columns:
        df['msrmnt_type_subtype'] = df['msrmnt_type_subtype'].fillna('no_screening')

    if 'care_gap_ind' in df.columns:
        df['care_gap_ind'] = df['care_gap_ind'].fillna('no_data')

    print("✓ Missing values handled")
    return df

def create_medical_features(df):
    """
    Create clinically meaningful features based on medical knowledge
    """
    print("Creating medical features...")

    # 1. Comorbidity score (weighted by severity)
    if all(col in df.columns for col in ['DIABETES', 'HYPERTENSION', 'CANCER']):
        df['comorbidity_score'] = (
            df['DIABETES'] * 1 +      # Moderate severity
            df['HYPERTENSION'] * 1 +  # Moderate severity
            df['CANCER'] * 3         # High severity
        )
        print("  ✓ Comorbidity score created")

    # 2. Healthcare utilization intensity
    if all(col in df.columns for col in ['visit_type_URGENTCARE', 'visit_type_ER', 'visit_type_INPATIENT']):
        df['utilization_intensity'] = (
            df['visit_type_URGENTCARE']*2 +
            df['visit_type__ER'] * 3 +       # ER visits weighted heavily
            df['visit_type__INPATIENT'] * 5  # Inpatient visits significant
        )
        print("  ✓ Utilization intensity created")

    # 3. Patient engagement score
    if all(col in df.columns for col in ['total_followups', 'follow_up_ratio']):
        df['patient_engagement'] = (
            (df['total_followups'] > 0).astype(int) +
            (df['follow_up_ratio'] > 0.5).astype(int) +
            (df['care_gap_ind'] != 'no_data').astype(int)
        )
        print("  ✓ Patient engagement score created")

    # 4. Age risk categories (standard geriatric categories)
    if 'age' in df.columns:
        df['age_risk_category'] = pd.cut(
            df['age'],
            bins=[0, 18, 45, 65, 100],
            labels=['pediatric', 'adult', 'senior', 'geriatric']
        )
        # Convert to dummy variables
        age_dummies = pd.get_dummies(df['age_risk_category'], prefix='age_cat')
        df = pd.concat([df, age_dummies], axis=1)
        df = df.drop('age_risk_category', axis=1)
        print("  ✓ Age risk categories created")

    # 5. Acute vs Chronic care ratio
    if all(col in df.columns for col in ['total_vist_actue', 'chronic_ratio']):
        df['acute_chronic_ratio'] = df['total_vist_actue'] / (df['chronic_ratio'] + 0.01)
        print("  ✓ Acute/chronic ratio created")

    # 6. Care continuity (inverse of days between visits)
    if 'mean_diff_days' in df.columns:
        df['care_continuity_score'] = np.where(
            df['mean_diff_days'] > 0,
            1 / (df['mean_diff_days'] + 1),
            0
        )
        print("  ✓ Care continuity score created")

    # 7. High-risk flags
    if 'age' in df.columns:
        df['is_high_risk_age'] = (df['age'] >= 65).astype(int)
        print("  ✓ High-risk age flag created")

    if 'comorbidity_score' in df.columns:
        df['is_high_comorbidity'] = (df['comorbidity_score'] >= 2).astype(int)
        print("  ✓ High comorbidity flag created")

    print("✓ All medical features created")
    return df


def stratify_risk(predictions, percentiles=[50, 75, 90]):
    """
    Categorize patients into actionable risk tiers
    """
    thresholds = np.percentile(predictions, percentiles)

    def assign_risk(score):
        if score < thresholds[0]:
            return 'Low Risk'
        elif score < thresholds[1]:
            return 'Moderate Risk'
        elif score < thresholds[2]:
            return 'High Risk'
        else:
            return 'Critical Risk'

    return np.array([assign_risk(s) for s in predictions])

print("✓ Utility functions defined")


✓ Utility functions defined


In [24]:
df = handle_missing_values(df)
df = create_medical_features(df)



  ✓ Filled min_diff_days with median: 5.00
  ✓ Filled max_diff_days with median: 6.00
  ✓ Filled mean_diff_days with median: 6.00
  ✓ Filled time_since_hotspot_identified with median: 33.00
  ✓ Filled weighted_visits with median: 0.00
  ✓ Filled follow_up_ratio with 0
✓ Missing values handled
Creating medical features...
  ✓ Comorbidity score created
  ✓ Patient engagement score created
  ✓ Age risk categories created
  ✓ Care continuity score created
  ✓ High-risk age flag created
  ✓ High comorbidity flag created
✓ All medical features created


In [25]:
# Drop rows with any NaN values
df = df.dropna(axis=1)

# Check for null values in each column after dropping NaNs and display columns with NaNs
nan_columns = df.columns[df.isnull().any()].tolist()
print("Columns with NaN values after dropping rows:")
print(nan_columns)

# Display the updated DataFrame head
display(df.head())

Columns with NaN values after dropping rows:
[]


Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,has_care_gap_data,comorbidity_score,patient_engagement,age_cat_pediatric,age_cat_adult,age_cat_senior,age_cat_geriatric,care_continuity_score,is_high_risk_age,is_high_comorbidity
0,291,56,f,f,5.0,no_screening,no_data,0.0,1.0,1.0,...,0,2.0,0,False,False,True,False,0.142857,0,1
1,306,42,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0,0.0,0,False,True,False,False,0.142857,0,0
2,310,71,f,f,5.0,no_screening,no_data,0.0,1.0,1.0,...,0,2.0,2,False,False,False,True,0.142857,1,1
3,315,49,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0,0.0,0,False,False,True,False,0.142857,0,0
4,318,25,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0,0.0,1,False,True,False,False,0.2,0,0


In [26]:
import pandas as pd
# Merge data and risk on patient_id
data = pd.merge(df, risk, on='patient_id', how='left')

# Display the updated data DataFrame
display(data.head())

Unnamed: 0,patient_id,age,hot_spotter_readmission_flag,hot_spotter_chronic_flag,time_since_hotspot_identified,msrmnt_type_subtype,care_gap_ind,CANCER,DIABETES,HYPERTENSION,...,comorbidity_score,patient_engagement,age_cat_pediatric,age_cat_adult,age_cat_senior,age_cat_geriatric,care_continuity_score,is_high_risk_age,is_high_comorbidity,risk_score
0,291,56,f,f,5.0,no_screening,no_data,0.0,1.0,1.0,...,2.0,0,False,False,True,False,0.142857,0,1,0.51
1,306,42,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0.0,0,False,True,False,False,0.142857,0,0,0.61
2,310,71,f,f,5.0,no_screening,no_data,0.0,1.0,1.0,...,2.0,2,False,False,False,True,0.142857,1,1,11.7
3,315,49,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0.0,0,False,False,True,False,0.142857,0,0,0.65
4,318,25,f,f,33.0,no_screening,no_data,0.0,0.0,0.0,...,0.0,1,False,True,False,False,0.2,0,0,1.08


In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # --- Your existing code ---
# categorical_cols = data.select_dtypes(include=['object']).columns

# # Apply one-hot encoding to categorical columns
# data_encoded = pd.get_dummies(data, columns=categorical_cols, dummy_na=False)

# # Calculate correlation matrix (Spearman)
# corr_matrix = data_encoded.corr(method='spearman', numeric_only=True)

# # Filter for correlations with risk_score above threshold
# filtered_correlations = corr_matrix['risk_score'][abs(corr_matrix['risk_score']) >= 0.1].sort_values(ascending=False)
# display(filtered_correlations)

# # --- Heatmap for full correlation matrix (optional) ---
# top_features = filtered_correlations.index.tolist()

# plt.figure(figsize=(len(top_features) * 0.6, len(top_features) * 0.6))  # auto-size by feature count

# sns.heatmap(
#     corr_matrix.loc[top_features, top_features],
#     cmap='coolwarm',
#     center=0,
#     annot=True,
#     fmt=".2f",
#     annot_kws={"size": 8},  # smaller font
#     square=True,
#     cbar_kws={'shrink': 0.6}
# )

# plt.title('Heatmap of Top Correlated Features with Risk Score', fontsize=12, pad=12)
# plt.xticks(rotation=45, ha='right', fontsize=9)
# plt.yticks(rotation=0, fontsize=9)
# plt.tight_layout()
# plt.show()

In [1]:
# # Get the list of columns from the filtered_correlations index
# columns_to_keep = filtered_correlations.index.tolist()

# # Ensure 'patient_id' and the target variable 'risk_score' are included
# if 'patient_id' not in columns_to_keep:
#     columns_to_keep.append('patient_id')
# if 'risk_score' not in columns_to_keep:
#     columns_to_keep.append('risk_score')


# # Filter the original data_encoded DataFrame to keep only the selected columns
# df_filtered = data_encoded[columns_to_keep]

# # Display the head of the filtered DataFrame
# display(df_filtered.head())

NameError: name 'filtered_correlations' is not defined

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Columns: 1492 entries, patient_id to risk_score
dtypes: bool(4), float64(24), int64(29), object(4), uint64(1), uint8(1430)
memory usage: 14.5+ MB


In [37]:
cols_to_keep = [
    "risk_score",
    "patient_id",
    "age",
    "comorbidity_score",
    "chronic_ratio",
    "HYPERTENSION",
    "age_cat_senior",
    "is_high_comorbidity",
    "DIABETES",
    "patient_engagement",
    "is_high_risk_age",
    "visit_type_ER",
    "weighted_visits",
    "visit_type_ratio",
    "visit_diag_ratio",
    "total_visits",
    "visit_count",
    "total_readmissions",
    "age_cat_geriatric",
    "total_followups",
    "follow_up_ratio",
    "visit_type_INPATIENT",
    "CANCER",
    "diag_Other",
    "diag_Neurology/Psych",
    "diag_Cardiovascular",
    "diag_GI",
    "diag_Skin/Soft Tissue",
    "age_cat_adult",
    "age_cat_pediatric"
]


In [38]:
df_filtered = data[cols_to_keep]


In [39]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   risk_score             8000 non-null   float64
 1   patient_id             8000 non-null   int64  
 2   age                    8000 non-null   int64  
 3   comorbidity_score      8000 non-null   float64
 4   chronic_ratio          8000 non-null   float64
 5   HYPERTENSION           8000 non-null   float64
 6   age_cat_senior         8000 non-null   bool   
 7   is_high_comorbidity    8000 non-null   int64  
 8   DIABETES               8000 non-null   float64
 9   patient_engagement     8000 non-null   int64  
 10  is_high_risk_age       8000 non-null   int64  
 11  visit_type_ER          8000 non-null   float64
 12  weighted_visits        8000 non-null   float64
 13  visit_type_ratio       8000 non-null   float64
 14  visit_diag_ratio       8000 non-null   float64
 15  tota

In [40]:
import matplotlib.pyplot as plt

def plot_imbalance_from_continuous(df, col, quantile=0.95):
    thr = df[col].quantile(quantile)
    df[f"{col}_flag"] = (df[col] >= thr).astype(int)

    counts = df[f"{col}_flag"].value_counts().sort_index()
    print(f"\nThreshold used ({quantile*100:.0f}th percentile): {thr:.3f}")
    print("Counts:\n", counts)
    print("\nPercentage:\n", counts / len(df))

    plt.figure()
    counts.plot(kind="bar")
    plt.title(f"Imbalance for {col} → threshold = {thr:.2f}")
    plt.xlabel(f"{col}_flag (0=low, 1=high)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

# Example usage:
#plot_imbalance_from_continuous(df_filtered, "Epigastric pain", quantile=0.95)


In [41]:
from sklearn.model_selection import train_test_split
from numpy import log

# Define features (X) and target (y)
X = df_filtered.drop(['patient_id', 'risk_score'], axis=1)
y = df_filtered['risk_score']

# Perform the initial train-test split (e.g., 80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temp set into validation and test sets (e.g., 50% validation, 50% test from temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (6400, 28)
Shape of X_val: (800, 28)
Shape of X_test: (800, 28)
Shape of y_train: (6400,)
Shape of y_val: (800,)
Shape of y_test: (800,)


In [42]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# --- X_train, X_test, y_train, y_test assumed ready ---

# Enable early stopping inside the estimator
base_gbr = GradientBoostingRegressor(
    random_state=42,
    n_iter_no_change=10,     # early stopping trigger
    validation_fraction=0.1, # 10% of train fold used for early stopping
    tol=1e-4
)

# Compact grid (kept small for speed, but strong)
param_grid_fast = {
    "n_estimators":    [300, 600],        # early stopping will cut these if unnecessary
    "learning_rate":   [0.05, 0.1],       # 0.05 often best with more trees
    "max_depth":       [2, 3],            # shallow trees generalize better
    "min_samples_leaf":[5, 10],           # regularization
    "subsample":       [0.6, 0.8],        # stochastic GB (faster + less overfit)
    "max_features":    ["sqrt", "log2"],  # additional randomness
    "loss":            ["squared_error", "huber"],  # Huber is robust to outliers
    "alpha":           [0.9],             # only used when loss='huber'
}

grid_gbr = GridSearchCV(
    estimator=base_gbr,
    param_grid=param_grid_fast,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=1
)

grid_gbr.fit(X_train, y_train)
print("Best params:", grid_gbr.best_params_)
print("Best CV R2:", round(grid_gbr.best_score_, 4))

Fitting 3 folds for each of 128 candidates, totalling 384 fits
Best params: {'alpha': 0.9, 'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 300, 'subsample': 0.6}
Best CV R2: 0.4316


In [43]:
best_gbr = grid_gbr.best_estimator_
y_pred = best_gbr.predict(X_test)
print("\nTest R2 :", round(r2_score(y_test, y_pred), 4))
print("Test MAE:", round(mean_absolute_error(y_test, y_pred), 4))
print("Test RMSE:", round(mean_squared_error(y_test, y_pred), 4))


Test R2 : 0.5017
Test MAE: 0.8789
Test RMSE: 3.3265


In [44]:
import pickle

In [47]:
import pickle

best_model = grid_gbr.best_estimator_

with open("model_up.pkl", "wb") as f:
    pickle.dump(best_model, f)
