In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

# Remote CSV locations of the training set (which carries the
# `hospital_death` target) and of the test set to be scored.
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

# Download and parse both files, training set first (needs network access).
train_data, test_data = (pd.read_csv(url) for url in (train_url, test_url))

# One-hot encode the categorical (object-dtype) columns.
# The encoder is fitted on the training data only and then reused for the
# test data, so both frames receive exactly the same dummy columns.
categorical_cols = train_data.select_dtypes(include=['object']).columns

# No `sparse=` / `sparse_output=` argument is passed: `sparse` was removed in
# scikit-learn 1.4 and `sparse_output` does not exist before 1.2. Densifying
# the default sparse result with .toarray() works on every version.
# drop='first' omits one dummy per categorical column.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(train_data[categorical_cols]).toarray()
train_encoded = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=train_data.index,  # keep rows aligned for the concat below
)
train_data = pd.concat(
    [train_data.drop(columns=categorical_cols), train_encoded], axis=1
)

# Encode the test data with the already-fitted encoder, using the columns it
# was fitted on (not whatever happens to be object-dtype in the test frame).
# NOTE: with the default handle_unknown='error', a category that never
# appeared in the training data makes transform() raise.
encoded_features_test = encoder.transform(test_data[categorical_cols]).toarray()
test_encoded = pd.DataFrame(
    encoded_features_test,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=test_data.index,
)
test_data = pd.concat(
    [test_data.drop(columns=categorical_cols), test_encoded], axis=1
)

# Impute missing values with per-column medians computed once from the
# training data. The same statistics are reused for the test data, so nothing
# is learned from the test set.
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Align the test columns with the training features (every training column
# except the target): same order, columns absent from the test data are
# created and filled with 0, and columns the training data does not have are
# dropped.
feature_columns = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Hold out 20% of the training rows for validation with a single stratified
# shuffle split, so both parts keep the class proportions of `hospital_death`.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
all_features = train_data.drop('hospital_death', axis=1)
all_labels = train_data['hospital_death']

# n_splits=1, so the first (and only) pair of positional index arrays is taken.
train_index, val_index = next(sss.split(all_features, all_labels))

X_train, X_val = all_features.iloc[train_index], all_features.iloc[val_index]
y_train, y_val = all_labels.iloc[train_index], all_labels.iloc[val_index]

# Feature selection: fit an unconstrained decision tree on the training split
# and keep the 50 features with the highest importance scores.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first
selected_features = X_train.columns[indices[:50]].tolist()
X_train = X_train[selected_features]
X_val = X_val[selected_features]
# Keep `test_data` itself untouched: the submission step further down reads
# test_data['RecordID'], which would raise KeyError whenever 'RecordID' is not
# among the selected features if `test_data` were overwritten here.
test_selected = test_data[selected_features]

# Degree-2 polynomial expansion, fitted on the training split only and then
# applied unchanged to the validation and test features.
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
test_data_poly = poly.transform(test_selected)

# Hyperparameter grid for the decision tree:
# 8 * 4 * 4 * 5 = 640 combinations, each evaluated with 3-fold CV (1920 fits).
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'ccp_alpha': [0.0, 0.01, 0.02, 0.03, 0.04],
}

# Base estimator; the fixed seed keeps the search reproducible.
clf = DecisionTreeClassifier(random_state=42)

# Exhaustive search scored by accuracy, using all available cores.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Run the search on the polynomial training features.
grid_search.fit(X_train_poly, y_train)

# Report the best parameter combination found.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# GridSearchCV (refit=True by default) has already retrained a clone of `clf`
# with the best parameters on the full training split, so reuse that model
# instead of fitting an identical tree a second time.
best_tree = grid_search.best_estimator_

# Predict and compute accuracy on the held-out validation set.
y_val_pred = best_tree.predict(X_val_poly)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with best parameters: {val_accuracy *100:.2f}%")

# Score the test set and keep column 1 of predict_proba, i.e. the probability
# of the second class in best_tree.classes_ (presumably hospital_death == 1).
test_probabilities = best_tree.predict_proba(test_data_poly)
y_test_proba = test_probabilities[:, 1]

# Build the submission table: one row per test record, IDs cast to int.
# NOTE(review): this requires `test_data` to still contain a 'RecordID'
# column at this point.
record_ids = test_data['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids})
output_df['hospital_death'] = y_test_proba

# Write the probabilities with 15 decimal places and without the index column.
output_df.to_csv('target.csv', index=False, float_format='%.15f')

print("Probability predictions for test set with high precision are saved in target.csv")
