In [4]:
import pandas as pd


train_path = "/kaggle/input/dataset-4/diabetic_data.csv"
ids_mapping_path = "/kaggle/input/dataset-4/IDS_mapping.csv"

# LOAD
df_train = pd.read_csv(train_path)
df_ids = pd.read_csv(ids_mapping_path)

# Quick look at the data
print("Train data shape:", df_train.shape)
print(df_train.head())
print("\nIDS mapping shape:", df_ids.shape)
print(df_ids.head())




Train data shape: (101766, 50)
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          

# Handling Missing Values

In [5]:

# Replace '?' with NaN for missing value consistency
df_train.replace('?', np.nan, inplace=True)
df_ids.replace('?', np.nan, inplace=True)

# -----------------------------
# 1️⃣ Missing values in train
# -----------------------------
print("=== Train Dataset Missing Values ===")
missing_train = df_train.isnull().sum()
missing_train = missing_train[missing_train > 0].sort_values(ascending=False)
print(missing_train)

print(f"\nTotal rows: {df_train.shape[0]}, Total columns: {df_train.shape[1]}")
print(f"Number of duplicate rows in train: {df_train.duplicated().sum()}")

# -----------------------------
# 2️⃣ Missing values in test/mapping
# -----------------------------
print("\n=== Test/Mapping Dataset Missing Values ===")
missing_ids = df_ids.isnull().sum()
missing_ids = missing_ids[missing_ids > 0].sort_values(ascending=False)
print(missing_ids)

print(f"\nTotal rows: {df_ids.shape[0]}, Total columns: {df_ids.shape[1]}")
print(f"Number of duplicate rows in test/mapping: {df_ids.duplicated().sum()}")


NameError: name 'np' is not defined

In [58]:
# Drop columns with very high missing values
cols_to_drop = ['weight', 'max_glu_serum', 'A1Cresult', 'medical_specialty', 'payer_code']
df_train.drop(columns=cols_to_drop, inplace=True)

# For remaining columns with few missing values, fill with 'Unknown' (categorical) or 0 (numeric)
cat_cols_fill = ['race', 'diag_1', 'diag_2', 'diag_3']
for col in cat_cols_fill:
    df_train[col] = df_train[col].fillna('Unknown')


KeyError: "['weight', 'max_glu_serum', 'A1Cresult', 'medical_specialty', 'payer_code'] not found in axis"

# EDA

In [7]:
# -----------------------------
# 1️⃣ Missing values in train (after dropping/filling)
# -----------------------------
print("=== Train Dataset Missing Values (After Cleaning) ===")
missing_train = df_train.isnull().sum()
missing_train = missing_train[missing_train > 0].sort_values(ascending=False)
if missing_train.empty:
    print("No missing values in train dataset ✅")
else:
    print(missing_train)

# -----------------------------
# 2️⃣ Missing values in test/mapping (after filling)
# -----------------------------
print("\n=== Test/Mapping Dataset Missing Values (After Cleaning) ===")
missing_ids = df_ids.isnull().sum()
missing_ids = missing_ids[missing_ids > 0].sort_values(ascending=False)
if missing_ids.empty:
    print("No missing values in test/mapping dataset ✅")
else:
    print(missing_ids)


=== Train Dataset Missing Values (After Cleaning) ===
No missing values in train dataset ✅

=== Test/Mapping Dataset Missing Values (After Cleaning) ===
description          5
admission_type_id    2
dtype: int64


In [8]:
# Clean and convert target
df_train['readmitted'] = df_train['readmitted'].str.strip()  # remove extra spaces
df_train['readmitted'] = df_train['readmitted'].apply(lambda x: 1 if x == '<30' else 0)
df_train['readmitted'] = df_train['readmitted'].astype(int)

# Check the conversion
print("Readmission counts after fixing target:")
print(df_train['readmitted'].value_counts())


Readmission counts after fixing target:
readmitted
0    90409
1    11357
Name: count, dtype: int64


In [9]:
# -----------------------------
# Numeric features summary by readmission
# -----------------------------
num_features = ['num_lab_procedures', 'num_procedures', 'num_medications', 
                'number_outpatient', 'number_emergency', 'number_inpatient', 
                'time_in_hospital', 'number_diagnoses']

print("\n--- Numeric Features Summary by Readmission ---")
for col in num_features:
    summary = df_train.groupby('readmitted')[col].describe()
    print(f"\nFeature: {col}")
    print(summary)

# Mean comparison for quick insights
print("\n--- Mean of numeric features by readmission ---")
print(df_train.groupby('readmitted')[num_features].mean())

# -----------------------------
# Categorical features readmission rate
# -----------------------------
cat_features = ['race', 'gender', 'age', 'change', 'diabetesMed']

print("\n--- Categorical Features Readmission Rate ---")
for col in cat_features:
    cross_tab = pd.crosstab(df_train[col], df_train['readmitted'], normalize='index')
    readmit_rate_col = cross_tab.get(1, pd.Series(0, index=cross_tab.index))
    print(f"\nReadmission rate by {col}:")
    print(readmit_rate_col.sort_values(ascending=False))



--- Numeric Features Summary by Readmission ---

Feature: num_lab_procedures
              count       mean        std  min   25%   50%   75%    max
readmitted                                                             
0           90409.0  42.953644  19.719348  1.0  31.0  44.0  57.0  129.0
1           11357.0  44.226028  19.276087  1.0  33.0  45.0  58.0  132.0

Feature: num_procedures
              count      mean       std  min  25%  50%  75%  max
readmitted                                                      
0           90409.0  1.347123  1.714242  0.0  0.0  1.0  2.0  6.0
1           11357.0  1.280884  1.635992  0.0  0.0  1.0  2.0  6.0

Feature: num_medications
              count       mean       std  min   25%   50%   75%   max
readmitted                                                           
0           90409.0  15.911137  8.124725  1.0  10.0  15.0  20.0  79.0
1           11357.0  16.903143  8.096696  1.0  11.0  16.0  21.0  81.0

Feature: number_outpatient
              c

# CORRELATIONS

In [10]:
# Compute correlations with target
numeric_features = ['num_lab_procedures', 'num_procedures', 'num_medications', 
                    'number_outpatient', 'number_emergency', 'number_inpatient', 
                    'time_in_hospital', 'number_diagnoses']

correlations = df_train[numeric_features + ['readmitted']].corr()['readmitted'].sort_values(ascending=False)
print("\n--- Correlation with Readmission ---")
print(correlations)



--- Correlation with Readmission ---
readmitted            1.000000
number_inpatient      0.165147
number_emergency      0.060747
number_diagnoses      0.049524
time_in_hospital      0.044199
num_medications       0.038432
num_lab_procedures    0.020364
number_outpatient     0.018893
num_procedures       -0.012227
Name: readmitted, dtype: float64


# Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

# List of categorical features
cat_features = ['race', 'gender', 'age', 'change', 'diabetesMed']

# Encode each categorical feature
for col in cat_features:
    df_train[col] = df_train[col].astype(str)  # ensure string type
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])

# Check a few rows
print(df_train[cat_features + ['readmitted']].head())


   race  gender  age  change  diabetesMed  readmitted
0     3       0    0       1            0           0
1     3       0    1       0            1           0
2     1       0    2       1            1           0
3     3       1    3       0            1           0
4     3       1    4       0            1           0


# Outliers

2️⃣ Interpretation

Skewness > 0.5 → right-skewed (long tail to the right, outliers are high)

Skewness < -0.5 → left-skewed (long tail to the left, outliers are low)

-0.5 ≤ skewness ≤ 0.5 → approximately symmetric

In [15]:
# Numeric features to check
numeric_features = ['num_lab_procedures', 'num_procedures', 'num_medications', 
                    'number_outpatient', 'number_emergency', 'number_inpatient', 
                    'time_in_hospital', 'number_diagnoses']

# Compute skewness
skew_values = df_train[numeric_features].skew()
print("--- Skewness of Numeric Features ---")
print(skew_values)


--- Skewness of Numeric Features ---
num_lab_procedures    -0.236544
num_procedures         1.316415
num_medications        1.326672
number_outpatient      8.832959
number_emergency      22.855582
number_inpatient       3.614139
time_in_hospital       1.133999
number_diagnoses      -0.876746
dtype: float64


# Handling missing values

In [16]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Split column types
numeric_cols = X_encoded.select_dtypes(include=['int64','float64']).columns
categorical_cols = X_encoded.select_dtypes(include=['object']).columns
boolean_cols = X_encoded.select_dtypes(include=['bool']).columns

# --- 1. Numeric ---
if numeric_cols.any():
    num_imputer = SimpleImputer(strategy='median')
    X_encoded[numeric_cols] = pd.DataFrame(
        num_imputer.fit_transform(X_encoded[numeric_cols]),
        columns=numeric_cols,
        index=X_encoded.index
    )

# --- 2. Categorical ---
if categorical_cols.any():
    cat_imputer = SimpleImputer(strategy='most_frequent')
    X_encoded[categorical_cols] = pd.DataFrame(
        cat_imputer.fit_transform(X_encoded[categorical_cols]),
        columns=categorical_cols,
        index=X_encoded.index
    )

# --- 3. Boolean ---
if boolean_cols.any():
    bool_imputer = SimpleImputer(strategy='most_frequent')
    X_encoded[boolean_cols] = pd.DataFrame(
        bool_imputer.fit_transform(X_encoded[boolean_cols].astype(int)),
        columns=boolean_cols,
        index=X_encoded.index
    )


NameError: name 'X_encoded' is not defined

In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer

# X_encoded is your features DataFrame

# --- 1. Identify column types properly ---
numeric_cols = X_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_encoded.select_dtypes(include=['object']).columns.tolist()
boolean_cols = X_encoded.select_dtypes(include=['bool']).columns.tolist()

print(f"Numeric: {len(numeric_cols)}, Categorical: {len(categorical_cols)}, Boolean: {len(boolean_cols)}")

# --- 2. Impute numeric columns (median) ---
if numeric_cols:
    num_imputer = SimpleImputer(strategy='median')
    X_encoded[numeric_cols] = num_imputer.fit_transform(X_encoded[numeric_cols])

# --- 3. Impute categorical columns (most frequent) ---
if categorical_cols:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    X_encoded[categorical_cols] = cat_imputer.fit_transform(X_encoded[categorical_cols])

# --- 4. Impute boolean columns (most frequent) ---
if boolean_cols:
    bool_imputer = SimpleImputer(strategy='most_frequent')
    # Convert boolean to object before imputing to avoid errors
    X_encoded[boolean_cols] = bool_imputer.fit_transform(X_encoded[boolean_cols].astype(object))

# --- 5. Final check ---
total_nans = X_encoded.isna().sum().sum()
print(f"Total NaNs after imputation: {total_nans}")  # Should be 0


NameError: name 'X_encoded' is not defined

# Train/Test SPLIT

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score


In [19]:
X = df_train.drop('readmitted', axis=1)
y = df_train['readmitted']


In [20]:
# Stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Example loop to show how to split
for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Just printing counts for demonstration
    print(f"Fold {fold}:")
    print("Train class distribution:", y_train_fold.value_counts(normalize=True).to_dict())
    print("Validation class distribution:", y_val_fold.value_counts(normalize=True).to_dict())
    print("-"*40)


Fold 1:
Train class distribution: {0: 0.8884071144303051, 1: 0.11159288556969489}
Validation class distribution: {0: 0.888375749238479, 1: 0.11162425076152108}
----------------------------------------
Fold 2:
Train class distribution: {0: 0.8883962020807488, 1: 0.11160379791925122}
Validation class distribution: {0: 0.8884193976317988, 1: 0.11158060236820125}
----------------------------------------
Fold 3:
Train class distribution: {0: 0.8883962020807488, 1: 0.11160379791925122}
Validation class distribution: {0: 0.8884193976317988, 1: 0.11158060236820125}
----------------------------------------
Fold 4:
Train class distribution: {0: 0.8883962020807488, 1: 0.11160379791925122}
Validation class distribution: {0: 0.8884193976317988, 1: 0.11158060236820125}
----------------------------------------
Fold 5:
Train class distribution: {0: 0.8884084851313673, 1: 0.11159151486863278}
Validation class distribution: {0: 0.8883702648258242, 1: 0.11162973517417579}
--------------------------------

**Checking is there any NAN left**

In [37]:
# Ensure all categorical features are numeric
X_final = X_selected.copy()

for col in X_final.columns:
    if X_final[col].dtype == 'object' or X_final[col].dtype == 'bool':
        X_final[col] = pd.factorize(X_final[col])[0]

# Confirm no NaNs
print("Total NaNs:", X_final.isna().sum().sum())


Total NaNs: 0


# SMOTE

In [21]:
!pip install imbalanced-learn==0.10.1


Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1


In [22]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.11.0


Collecting imbalanced-learn==0.11.0
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.11.0


In [23]:
from imblearn.over_sampling import SMOTE


In [24]:
# Convert all categorical columns to numeric (if not done yet)
X_numeric = pd.get_dummies(X, drop_first=True)


In [25]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_numeric, y), 1):
    X_train_fold, X_val_fold = X_numeric.iloc[train_idx], X_numeric.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply SMOTE
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Now X_train_res is fully numeric and SMOTE will work


In [26]:
# Encode all categorical columns upfront
X_encoded = pd.get_dummies(X, drop_first=True)


In [27]:
import xgboost as xgb

clf = xgb.XGBClassifier(
    n_estimators=300,
    scale_pos_weight=(len(y_train_fold)-sum(y_train_fold))/sum(y_train_fold),
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)


In [28]:
import xgboost as xgb
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

# Already have X_encoded and y
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_encoded, y), 1):
    X_train_fold = X_encoded.iloc[train_idx]
    X_val_fold = X_encoded.iloc[val_idx]

    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]
    
    # SMOTE on training fold
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Convert back to DataFrame and align columns
    X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)
    
    # Initialize XGBoost classifier with imbalance handling
    clf = xgb.XGBClassifier(
        n_estimators=300,
        scale_pos_weight=(len(y_train_fold)-sum(y_train_fold))/sum(y_train_fold),
        use_label_encoder=False,
        eval_metric='logloss',
        tree_method='gpu_hist',  # use GPU
        random_state=42
    )
    
    clf.fit(X_train_res, y_train_res)
    
    # Predict on validation (columns already aligned)
    y_pred = clf.predict(X_val_fold)
    
    f1 = f1_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fold 1: F1 = 0.2408, Recall = 0.4555



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 2: F1 = 0.2520, Recall = 0.4817



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 3: F1 = 0.2429, Recall = 0.4509



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 4: F1 = 0.2401, Recall = 0.4412



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 5: F1 = 0.2433, Recall = 0.4494


# Feature Selection

**BY FILTER METHOD**

In [29]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with very low variance (almost constant)
selector = VarianceThreshold(threshold=0.01)  # Keep features with >1% variance
X_filtered = selector.fit_transform(X_encoded)

# Keep column names
X_filtered = pd.DataFrame(X_filtered, columns=X_encoded.columns[selector.get_support()])
print("Remaining features after low-variance filter:", X_filtered.shape[1])


Remaining features after low-variance filter: 103


**BY EMBEDDED METHOD**

In [30]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Train a Random Forest on the filtered features
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_filtered, y)

# Get feature importances
importances = pd.Series(rf.feature_importances_, index=X_filtered.columns)
importances = importances.sort_values(ascending=False)

# Show top 20 features
print("Top 20 features by importance:")
print(importances.head(20))

# Optionally, select features above a threshold
threshold = 0.005  # keep features with importance > 0.5%
top_features = importances[importances > threshold].index
X_selected = X_filtered[top_features]
print("Features selected for modeling:", X_selected.shape[1])


Top 20 features by importance:
encounter_id                0.089830
patient_nbr                 0.088401
num_lab_procedures          0.075616
num_medications             0.066207
time_in_hospital            0.047171
number_inpatient            0.041132
age                         0.040743
num_procedures              0.033314
discharge_disposition_id    0.033051
number_diagnoses            0.030588
admission_type_id           0.024635
admission_source_id         0.021196
race                        0.019513
gender                      0.016918
number_outpatient           0.016511
number_emergency            0.014749
change                      0.010869
insulin_Steady              0.010480
insulin_No                  0.008445
diag_2_276                  0.007403
dtype: float64
Features selected for modeling: 37


------------------------------------

**DROPPEDE THE IDENTIFIERS**

In [31]:
# Drop identifiers
X_filtered = X_encoded.drop(columns=['encounter_id', 'patient_nbr'], errors='ignore')

(excluding IDs)
top_features = [
    'num_lab_procedures', 'num_medications', 'time_in_hospital', 'number_inpatient',
    'age', 'num_procedures', 'discharge_disposition_id', 'number_diagnoses',
    'admission_type_id', 'admission_source_id', 'race', 'gender',
    'number_outpatient', 'number_emergency', 'change',
    'insulin_Steady', 'insulin_No', 'diag_2_276'
]

# Filter to keep only top predictive features
X_filtered = X_filtered[top_features]

print(f"Remaining features for modeling: {X_filtered.shape[1]}")


Remaining features for modeling: 18


# ENCODING

In [33]:
# One-hot encode insulin and diag_2
df_encoded = pd.get_dummies(df_train, columns=['insulin', 'diag_2'], drop_first=False)

# Now we can safely select top features
X_selected = df_encoded[top_features].copy()
y = df_encoded['readmitted']

# Check
print("Columns available in X_selected:")
print(X_selected.columns)


Columns available in X_selected:
Index(['num_lab_procedures', 'num_medications', 'time_in_hospital',
       'number_inpatient', 'age', 'num_procedures', 'discharge_disposition_id',
       'number_diagnoses', 'admission_type_id', 'admission_source_id', 'race',
       'gender', 'number_outpatient', 'number_emergency', 'change',
       'insulin_Steady', 'insulin_No', 'diag_2_276'],
      dtype='object')


In [34]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# X_selected already contains only the top features
# y is the target

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, y), 1):
    X_train_fold, X_val_fold = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply SMOTE only on training fold
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Convert back to DataFrame to keep column names aligned
    X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)
    
    # Initialize XGBoost classifier with GPU
    clf = xgb.XGBClassifier(
        n_estimators=300,
        scale_pos_weight=(len(y_train_fold) - sum(y_train_fold)) / sum(y_train_fold),
        use_label_encoder=False,
        eval_metric='logloss',
        tree_method='gpu_hist',
        random_state=42
    )
    
    clf.fit(X_train_res, y_train_res)
    
    # Predict on validation fold
    y_pred = clf.predict(X_val_fold)
    
    # Evaluate
    f1 = f1_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 1: F1 = 0.2209, Recall = 0.6364



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 2: F1 = 0.2195, Recall = 0.6420



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 3: F1 = 0.2204, Recall = 0.6354



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 4: F1 = 0.2206, Recall = 0.6398



    E.g. tree_method = "hist", device = "cuda"



Fold 5: F1 = 0.2139, Recall = 0.6215



    E.g. tree_method = "hist", device = "cuda"



# Hyper Parameter tunning


In [35]:
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Assume X_selected and y are already defined
X = X_selected.copy()
y = y.copy()

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "tree_method": "hist",
        "device": "cuda",
        "eval_metric": "logloss",
        "use_label_encoder": False,
        "random_state": 42,
        "scale_pos_weight": (len(y) - sum(y)) / sum(y)  # handle imbalance
    }

    f1_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Apply SMOTE on training fold
        smote = SMOTE(sampling_strategy=0.5, random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

        # Train XGBoost
        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train_res, y_train_res)

        # Predict on validation fold
        y_pred = clf.predict(X_val)
        f1_scores.append(f1_score(y_val, y_pred))

    return np.mean(f1_scores)

# Create study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # adjust number of trials as needed

print("Best F1 score:", study.best_value)
print("Best hyperparameters:", study.best_params)


[I 2025-10-20 09:14:41,566] A new study created in memory with name: no-name-a48ec16c-0a8e-4f41-a2bf-bf09d62df491
[I 2025-10-20 09:14:58,090] Trial 0 finished with value: 0.22037309015809878 and parameters: {'n_estimators': 354, 'max_depth': 7, 'learning_rate': 0.03927104873976637, 'subsample': 0.7906258008049099, 'colsample_bytree': 0.6810623716902173, 'gamma': 1.8855120828360028, 'min_child_weight': 5}. Best is trial 0 with value: 0.22037309015809878.
[I 2025-10-20 09:15:12,210] Trial 1 finished with value: 0.21858504123636618 and parameters: {'n_estimators': 462, 'max_depth': 5, 'learning_rate': 0.1875713015315927, 'subsample': 0.9995714125616725, 'colsample_bytree': 0.9060666638488061, 'gamma': 2.248481945385719, 'min_child_weight': 10}. Best is trial 0 with value: 0.22037309015809878.
[I 2025-10-20 09:15:26,571] Trial 2 finished with value: 0.21703825756282194 and parameters: {'n_estimators': 459, 'max_depth': 3, 'learning_rate': 0.1504836851580768, 'subsample': 0.6269233679340785

Best F1 score: 0.22678774198925095
Best hyperparameters: {'n_estimators': 367, 'max_depth': 10, 'learning_rate': 0.048565317749822264, 'subsample': 0.7561242512693747, 'colsample_bytree': 0.878609846001877, 'gamma': 3.5424762662629763, 'min_child_weight': 10}


**Best F1 score: 0.22678774198925095
Best hyperparameters: {'n_estimators': 367, 'max_depth': 10, 'learning_rate': 0.048565317749822264, 'subsample': 0.7561242512693747, 'colsample_bytree': 0.878609846001877, 'gamma': 3.5424762662629763, 'min_child_weight': 10}**

In [41]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# Assume X_selected and y are your features and target (train dataset)
X = X_selected.copy()
y = df_train['readmitted']

# Stratified 5-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Best hyperparameters from Optuna
best_params = {
    'n_estimators': 367,
    'max_depth': 10,
    'learning_rate': 0.048565317749822264,
    'subsample': 0.7561242512693747,
    'colsample_bytree': 0.878609846001877,
    'gamma': 3.5424762662629763,
    'min_child_weight': 10
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    # Split fold
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE only on training fold
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Convert back to DataFrame for alignment
    X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)

    # Initialize XGBoost with best hyperparameters
    clf = xgb.XGBClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        gamma=best_params['gamma'],
        min_child_weight=best_params['min_child_weight'],
        eval_metric='logloss',
        use_label_encoder=False,
        tree_method='hist',   # Use GPU
        device='cuda',        # Correct GPU parameter
        random_state=42
    )

    # Train on resampled training fold
    clf.fit(X_train_res, y_train_res)

    # Predict on validation fold
    y_pred = clf.predict(X_val_fold)

    # Evaluate
    f1 = f1_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")


Fold 1: F1 = 0.1663, Recall = 0.1470
Fold 2: F1 = 0.1763, Recall = 0.1541
Fold 3: F1 = 0.1545, Recall = 0.1343
Fold 4: F1 = 0.1769, Recall = 0.1576
Fold 5: F1 = 0.1599, Recall = 0.1413


In [43]:
# Make sure we only use columns that exist in df_train
available_features = [col for col in top_features if col in df_train.columns]
X = df_train[available_features].copy()
y = df_train['readmitted']

print("Using features for modeling:", available_features)


Using features for modeling: ['num_lab_procedures', 'num_medications', 'time_in_hospital', 'number_inpatient', 'age', 'num_procedures', 'discharge_disposition_id', 'number_diagnoses', 'admission_type_id', 'admission_source_id', 'race', 'gender', 'number_outpatient', 'number_emergency', 'change']


# BASELINE Evaluation (XGboost)


In [44]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# X and y from above
# X = df_train[available_features].copy()
# y = df_train['readmitted']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply SMOTE to training fold
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)
    
    # Initialize XGBoost classifier with GPU support
    clf = xgb.XGBClassifier(
        n_estimators=300,
        scale_pos_weight=(len(y_train_fold)-sum(y_train_fold))/sum(y_train_fold),
        tree_method='gpu_hist',  # GPU acceleration
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    
    # Train
    clf.fit(X_train_res, y_train_res)
    
    # Predict on validation fold
    y_pred = clf.predict(X_val_fold)
    
    # Evaluate
    f1 = f1_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 1: F1 = 0.2137, Recall = 0.6840



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 2: F1 = 0.2192, Recall = 0.7001



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 3: F1 = 0.2149, Recall = 0.6794



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Fold 4: F1 = 0.2123, Recall = 0.6746



    E.g. tree_method = "hist", device = "cuda"



Fold 5: F1 = 0.2118, Recall = 0.6809



    E.g. tree_method = "hist", device = "cuda"



# BASELINE Evaluation (Lightgbm)

In [45]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# Use only your selected features
features = ['num_lab_procedures', 'num_medications', 'time_in_hospital',
            'number_inpatient', 'age', 'num_procedures', 'discharge_disposition_id',
            'number_diagnoses', 'admission_type_id', 'admission_source_id',
            'race', 'gender', 'number_outpatient', 'number_emergency', 'change']

X = df_train[features]
y = df_train['readmitted']

# Stratified 5-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply SMOTE on training fold
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    # Initialize LightGBM classifier
    clf = lgb.LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        device='gpu'  # Use GPU
    )
    
    # Train on resampled training data
    clf.fit(X_train_res, y_train_res)
    
    # Predict on validation fold
    y_pred = clf.predict(X_val)
    
    # Metrics
    f1 = f1_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")


[LightGBM] [Info] Number of positive: 36163, number of negative: 72327
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 108490, number of used features: 15
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (1.24 MB) transferred to GPU in 0.002522 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333330 -> initscore=-0.693161
[LightGBM] [Info] Start training from score -0.693161
Fold 1: F1 = 0.1684, Recall = 0.1483
[LightGBM] [Info] Number of positive: 36163, number of negative: 72327
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 108490, number of used features: 15
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (1.24 MB) transferred to GPU in 0.002442 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromSc

# Hyper Parameter tunning (lightGBM)

In [46]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# X_selected and y are your train features and target
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial):
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'device': 'gpu',          # GPU training
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0)
    }

    f1_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train_fold, X_val_fold = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # Apply SMOTE on training fold
        smote = SMOTE(sampling_strategy=0.5, random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
        X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_res, y_train_res)

        y_pred = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_pred))

    return sum(f1_scores)/len(f1_scores)

# Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best F1 score:", study.best_value)
print("Best hyperparameters:", study.best_params)


[I 2025-10-20 09:46:25,707] A new study created in memory with name: no-name-4bb11c9c-c5e6-4b73-9829-59b698879144
[I 2025-10-20 09:47:18,161] Trial 0 finished with value: 0.16471021044474612 and parameters: {'n_estimators': 640, 'learning_rate': 0.038466843497743396, 'num_leaves': 70, 'max_depth': 11, 'min_child_samples': 40, 'subsample': 0.8855838799832151, 'colsample_bytree': 0.58374142923337, 'reg_alpha': 4.126469015115222, 'reg_lambda': 4.634034424705844}. Best is trial 0 with value: 0.16471021044474612.
[I 2025-10-20 09:48:33,882] Trial 1 finished with value: 0.16628989681065723 and parameters: {'n_estimators': 690, 'learning_rate': 0.02839113874041702, 'num_leaves': 134, 'max_depth': 13, 'min_child_samples': 8, 'subsample': 0.7763504180066623, 'colsample_bytree': 0.997836437633665, 'reg_alpha': 4.961971301992701, 'reg_lambda': 1.682403834519432}. Best is trial 1 with value: 0.16628989681065723.
[I 2025-10-20 09:48:48,147] Trial 2 finished with value: 0.1350179051371097 and parame

Best F1 score: 0.17545255556207004
Best hyperparameters: {'n_estimators': 875, 'learning_rate': 0.09884213396280735, 'num_leaves': 77, 'max_depth': 4, 'min_child_samples': 77, 'subsample': 0.6573004980884636, 'colsample_bytree': 0.8560838492917963, 'reg_alpha': 1.297857471332764, 'reg_lambda': 3.4131333456386432}


**After , now using tunned Parameters**

In [47]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# ✅ Your best Optuna hyperparameters
best_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'device': 'gpu',  # GPU enabled
    'metric': 'binary_logloss',
    'verbosity': -1,
    'n_estimators': 875,
    'learning_rate': 0.09884213396280735,
    'num_leaves': 77,
    'max_depth': 4,
    'min_child_samples': 77,
    'subsample': 0.6573004980884636,
    'colsample_bytree': 0.8560838492917963,
    'reg_alpha': 1.297857471332764,
    'reg_lambda': 3.4131333456386432
}

# 5-Fold Stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores, recall_scores = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, y), 1):
    X_train_fold, X_val_fold = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE on training data
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)
    X_train_res = pd.DataFrame(X_train_res, columns=X_train_fold.columns)

    # Train LGBM model
    model = lgb.LGBMClassifier(**best_params)
    model.fit(X_train_res, y_train_res)

    # Predict on validation set
    y_pred = model.predict(X_val_fold)

    # Evaluate
    f1 = f1_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1_scores.append(f1)
    recall_scores.append(recall)

    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")

# Average results
print("\nAverage F1:", sum(f1_scores)/len(f1_scores))
print("Average Recall:", sum(recall_scores)/len(recall_scores))


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Fold 1: F1 = 0.1773, Recall = 0.1633
Fold 2: F1 = 0.1840, Recall = 0.1682
Fold 3: F1 = 0.1693, Recall = 0.1519
Fold 4: F1 = 0.1817, Recall = 0.1664
Fold 5: F1 = 0.1650, Recall = 0.1510

Average F1: 0.17545255556207004
Average Recall: 0.16016607128459884


# ENSEMBLING

In [57]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
import numpy as np

# Define base models with best hyperparameters
xgb_model = xgb.XGBClassifier(
    n_estimators=367,
    max_depth=10,
    learning_rate=0.048565317749822264,
    subsample=0.7561242512693747,
    colsample_bytree=0.878609846001877,
    gamma=3.5424762662629763,
    min_child_weight=10,
    random_state=42,
    tree_method="hist",
    device="cuda"
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=875,
    learning_rate=0.09884213396280735,
    num_leaves=77,
    max_depth=4,
    min_child_samples=77,
    subsample=0.6573004980884636,
    colsample_bytree=0.8560838492917963,
    reg_alpha=1.297857471332764,
    reg_lambda=3.4131333456386432,
    device="gpu",
    random_state=42
)

# Meta-model
meta_model = LogisticRegression(max_iter=500, solver='lbfgs')

# Prepare cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, recall_scores = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected, y), 1):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train base models
    xgb_model.fit(X_train, y_train)
    lgb_model.fit(X_train, y_train)

    # Predict probabilities
    xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
    lgb_pred = lgb_model.predict_proba(X_val)[:, 1]

    # Stack predictions as meta-features
    meta_X_train = np.vstack((xgb_pred, lgb_pred)).T

    # Fit meta-model
    meta_model.fit(meta_X_train, y_val)

    # Final predictions
    meta_preds = meta_model.predict(meta_X_train)

    f1 = f1_score(y_val, meta_preds)
    recall = recall_score(y_val, meta_preds)
    f1_scores.append(f1)
    recall_scores.append(recall)

    print(f"Fold {fold}: F1 = {f1:.4f}, Recall = {recall:.4f}")

print(f"\nAverage F1: {np.mean(f1_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")


Fold 1: F1 = 0.0507, Recall = 0.0268
Fold 2: F1 = 0.0620, Recall = 0.0330
Fold 3: F1 = 0.0427, Recall = 0.0225
Fold 4: F1 = 0.0523, Recall = 0.0277
Fold 5: F1 = 0.0362, Recall = 0.0189

Average F1: 0.0488
Average Recall: 0.0258
