In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler


Load datasets

In [None]:
# Remote locations of the two CSV files (raw GitHub content).
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

# Download and parse both files; each one becomes its own DataFrame.
train_data, test_data = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))


In [None]:


# Preprocessing: one-hot encode the categorical columns, impute missing numeric
# values with training medians, and give the test frame the same columns (and
# column order) as the training features.

# The categorical (object-dtype) columns are detected once, on the training
# data, and the same list is reused for the test data so that both frames are
# encoded from an identical set of columns in an identical order.
categorical_cols = train_data.select_dtypes(include=['object']).columns

# One-hot encode categorical variables for train_data.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the current name first and fall back for older
# installations. handle_unknown='ignore' makes a category that only appears in
# the test data encode as all zeros (scikit-learn warns about it) instead of
# aborting the transform.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(train_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
# Carry the source index over so the column-wise concat pairs rows one-to-one.
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = train_data.drop(columns=categorical_cols)
train_data = pd.concat([train_data, train_encoded], axis=1)

# One-hot encode the test data with the encoder fitted on the training data.
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = test_data.drop(columns=categorical_cols)
test_data = pd.concat([test_data, test_encoded], axis=1)

# Handle missing values: the medians are computed from the training data only
# and then applied to both frames.
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Ensure both train and test data have the same columns: keep exactly the
# training columns (minus the target) in training order; a column the test
# frame lacks is created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('hospital_death'), fill_value=0)

# Split, scale, fit three classifiers, report validation accuracy and export
# test-set probabilities.

# Columns that are not predictors: the label, and RecordID, which this cell
# only reads to key the rows of the exported CSV files. Leaving an identifier
# among the features lets it influence KNN distances and tree splits.
# NOTE(review): RecordID is assumed to be a pure row identifier - confirm.
TARGET_COLUMN = 'hospital_death'
ID_COLUMN = 'RecordID'
feature_columns = [col for col in train_data.columns if col not in (TARGET_COLUMN, ID_COLUMN)]

# Splitting data into training and validation sets (80/20, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_data[feature_columns],
    train_data[TARGET_COLUMN],
    test_size=0.2,
    random_state=42,
)
# Test-set feature matrix with the same columns, in the same order, as X_train.
X_test = test_data[feature_columns]
record_ids = test_data[ID_COLUMN].astype(int)

# Standardize the data for KNN; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(X_test)


def _fit_score_export(model, label, csv_path, train_X, val_X, test_X):
    """Fit one classifier, report validation accuracy and save test probabilities.

    Reads the module-level ``y_train``, ``y_val`` and ``record_ids``.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict`` and ``predict_proba``.
    label : str
        Model name used in the printed accuracy line.
    csv_path : str
        Destination file for the ``RecordID`` / ``hospital_death`` table.
    train_X, val_X, test_X : array-like
        Feature matrices for fitting, validation and test prediction.

    Returns
    -------
    tuple
        ``(val_predictions, val_accuracy, test_probabilities, output_frame)``.
    """
    model.fit(train_X, y_train)
    val_pred = model.predict(val_X)
    val_accuracy = accuracy_score(y_val, val_pred)
    print(f"{label} Accuracy on validation set: {val_accuracy * 100:.2f}%")
    # Second column of predict_proba: probability of the second class label.
    test_proba = model.predict_proba(test_X)[:, 1]
    output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': test_proba})
    output_df.to_csv(csv_path, index=False, float_format='%.15f')
    return val_pred, val_accuracy, test_proba, output_df


# Decision Tree
clf_tree = DecisionTreeClassifier(random_state=42)
y_val_pred_tree, val_accuracy_tree, y_test_proba_tree, output_df_tree = _fit_score_export(
    clf_tree, "Decision Tree", 'decision_tree_target.csv', X_train, X_val, X_test
)

# Naive Bayes
clf_nb = GaussianNB()
y_val_pred_nb, val_accuracy_nb, y_test_proba_nb, output_df_nb = _fit_score_export(
    clf_nb, "Naive Bayes", 'naive_bayes_target.csv', X_train, X_val, X_test
)

# KNN (distance-based, so it is given the standardized matrices)
clf_knn = KNeighborsClassifier()
y_val_pred_knn, val_accuracy_knn, y_test_proba_knn, output_df_knn = _fit_score_export(
    clf_knn, "KNN", 'KNN_target.csv', X_train_scaled, X_val_scaled, test_data_scaled
)

print("Probability predictions for test set with high precision are saved in respective CSV files.")