In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

# Load datasets
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Columns removed from both frames before modelling. 'RecordID' is not used as
# a feature. Other candidates ('hospital_id', 'icu_id', 'icu_stay_type',
# 'icu_type', 'ethnicity') can be appended to this list to exclude them too.
columns_to_drop = ['RecordID']
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical variables. The dense-output keyword was renamed
# from `sparse` to `sparse_output` in newer scikit-learn releases and the old
# spelling was subsequently removed, so try the new name first and fall back
# to the old one when the installed version does not know it.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# Fit the encoder on the training data only, then swap the raw categorical
# columns for their encoded counterparts. The encoded frame reuses the source
# frame's index so the column-wise concat lines rows up exactly.
encoded_features = encoder.fit_transform(train_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

# Encode the test data with the encoder that was fitted on the training data
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded], axis=1)

# Handle missing values with the per-column mode of the training data.
# DataFrame.mode() returns a DataFrame; passing that straight to fillna()
# aligns on the row index and therefore only fills the row(s) whose label is
# present in the mode frame. Taking the first row yields a Series keyed by
# column name, which fillna() applies to every row of the matching column.
fill_values = train_data.mode().iloc[0]
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(fill_values)

# Ensure the test frame has exactly the training feature columns, in the same
# order; any feature column absent from the test frame is created with zeros.
feature_cols = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_cols, fill_value=0)





XGBoost: hyperparameter search, validation, and test-set predictions

In [None]:


# Use StratifiedShuffleSplit for a stratified train-validation split.
# n_splits=1 yields a single (train, validation) index pair, so take it
# directly instead of looping.
features = train_data.drop('hospital_death', axis=1)
target = train_data['hospital_death']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(features, target))
X_train, X_val = features.iloc[train_index], features.iloc[val_index]
y_train, y_val = target.iloc[train_index], target.iloc[val_index]

# Standardize the data.
# NOTE(review): the scaled arrays are not consumed anywhere in this cell; the
# model below is fitted and evaluated on the unscaled frames. Kept in case a
# later cell relies on them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Define the hyperparameters and their possible values for XGBoost
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.05, 0.5, 1],
    'min_child_weight': [1],
    'subsample': [0.7],
}

# Initialize the XGBoost classifier
clf = XGBClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the XGBoost classifier with the best parameters
best_xgb = XGBClassifier(**best_params, random_state=42)
best_xgb.fit(X_train, y_train)

# Predict and compute accuracy on the validation set
y_val_pred = best_xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with best parameters: {val_accuracy * 100:.2f}%")

# Predict on the test set (probability of the positive class)
y_test_proba = best_xgb.predict_proba(test_data)[:, 1]

# Save the predictions to a CSV file with maximum precision.
# 'RecordID' was dropped from test_data during preprocessing, so looking it up
# there raises KeyError. Reload just that column from the source CSV instead;
# preprocessing neither removes nor reorders test rows, so the ids line up
# with the predictions positionally.
record_ids = pd.read_csv(test_url, usecols=['RecordID'])['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-xgboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-xgboost.csv")


CatBoost: hyperparameter search, validation, and test-set predictions

In [None]:

# Use StratifiedShuffleSplit for a stratified train-validation split.
# n_splits=1 yields a single (train, validation) index pair, so take it
# directly instead of looping.
features = train_data.drop('hospital_death', axis=1)
target = train_data['hospital_death']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(features, target))
X_train, X_val = features.iloc[train_index], features.iloc[val_index]
y_train, y_val = target.iloc[train_index], target.iloc[val_index]

# Define the hyperparameters and their possible values for CatBoost
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.05, 0.1],
    # Minimum number of samples per leaf.
    # NOTE(review): only a rough counterpart of XGBoost's min_child_weight --
    # confirm this option takes effect with the grow policy in use.
    'min_child_samples': [1, 2, 3],
    'subsample': [0.7, 0.8],
}

# Initialize the CatBoost classifier
clf = CatBoostClassifier(random_state=42, verbose=0)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the CatBoost classifier with the best parameters
best_catboost = CatBoostClassifier(**best_params, random_state=42, verbose=0)
best_catboost.fit(X_train, y_train)

# Feature importance analysis (one score per training feature column)
feature_importance = best_catboost.get_feature_importance()

# Predict and compute accuracy on the validation set
y_val_pred = best_catboost.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with best parameters: {val_accuracy * 100:.2f}%")

# Predict on the test set (probability of the positive class)
y_test_proba = best_catboost.predict_proba(test_data)[:, 1]

# Save the predictions to a CSV file with maximum precision.
# 'RecordID' was dropped from test_data during preprocessing, so looking it up
# there raises KeyError. Reload just that column from the source CSV instead;
# preprocessing neither removes nor reorders test rows, so the ids line up
# with the predictions positionally.
record_ids = pd.read_csv(test_url, usecols=['RecordID'])['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-catboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-catboost.csv")