In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Load datasets
# Both CSV files are fetched over HTTP from the raw GitHub URLs below; the
# training file is read first, then the test file.
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

train_data, test_data = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

# One-hot encode categorical variables for train_data
# The categorical (object-dtype) columns are taken from the training frame once
# and reused for the test frame, so both frames are encoded from the same
# columns, in the same order, that the encoder was fitted on.
categorical_cols = train_data.select_dtypes(include=['object']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later removed;
# try the new keyword first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = encoder.fit_transform(train_data[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out` (and removed in
# scikit-learn 1.2); use whichever method this installation provides.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out(categorical_cols)
else:
    feature_names = encoder.get_feature_names(input_features=categorical_cols)

# Reuse the source frame's index so the column-wise concat lines up row for row.
train_encoded = pd.DataFrame(encoded_features, columns=feature_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

# One-hot encode test data
# Only `transform` is called here: the categories learned from the training
# frame define the encoded columns.
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=feature_names, index=test_data.index)
test_data = test_data.drop(test_data.select_dtypes(include=['object']).columns, axis=1)
test_data = pd.concat([test_data, test_encoded], axis=1)

# Handle missing values
# Both frames are imputed with the training medians. The medians are computed
# once: filling a column's gaps with its own median leaves that median unchanged.
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Ensure both train and test data have the same columns
# Reindexing adds any training feature that is absent from the test frame as a
# zero-filled column and orders the columns like the training features, leaving
# out the 'hospital_death' target.
feature_columns = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Splitting data into training and validation sets
# The split is stratified on the target so the 20% validation set keeps the
# same class proportions as the full data; the target is imbalanced (hence the
# SMOTE step below), and an unstratified split can skew the minority share.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop('hospital_death', axis=1),
    train_data['hospital_death'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['hospital_death'],
)

# Handle class imbalance using SMOTE
# Only the training split is resampled; X_val / y_val are left untouched.
# NOTE(review): the grid searches further down cross-validate on this already
# resampled X_train, so their internal CV scores may be optimistic.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize the data
# The scaler is fitted on the (resampled) training features only and then
# applied unchanged to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Decision Tree with hyperparameter tuning
# Exhaustive grid search (cv=5) over depth and split/leaf size limits, fitted
# on the unscaled training features.
param_grid_tree = dict(
    max_depth=[5, 10, 15],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[3, 5, 7],
)
clf_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_tree,
    cv=5,
).fit(X_train, y_train)
print("Decision Tree Best Parameters:", clf_tree.best_params_)

# Evaluate the tuned tree on the held-out validation split.
y_val_pred_tree = clf_tree.predict(X_val)
print(f"Decision Tree Accuracy on validation set: {accuracy_score(y_val, y_val_pred_tree) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_tree))

# Naive Bayes
# No grid search here: the model is fitted with its default settings on the
# unscaled training features, and its parameter dict is printed for reference.
clf_nb = GaussianNB().fit(X_train, y_train)
print("\nNaive Bayes Parameters:", clf_nb.get_params())

# Evaluate on the held-out validation split.
y_val_pred_nb = clf_nb.predict(X_val)
print(f"Naive Bayes Accuracy on validation set: {accuracy_score(y_val, y_val_pred_nb) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_nb))

# KNN with hyperparameter tuning
# Grid search (cv=5) over neighbour count, vote weighting and distance metric.
# KNN is fitted on the standardized features, unlike the tree and naive Bayes
# models in this cell.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
clf_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=5,
).fit(X_train_scaled, y_train)
print("\nKNN Best Parameters:", clf_knn.best_params_)

# Evaluate on the standardized validation split.
y_val_pred_knn = clf_knn.predict(X_val_scaled)
print(f"KNN Accuracy on validation set: {accuracy_score(y_val, y_val_pred_knn) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_knn))

# Nothing in this cell computes test-set probabilities or writes a CSV file,
# so the closing message reports only what actually happened.
print("\nValidation metrics for all three models are printed above; no test-set prediction CSV files were written by this cell.")

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Load datasets
# NOTE(review): this cell re-runs the same load -> encode -> train pipeline as
# the cell above it; consider keeping only one copy.
# Both CSV files are fetched over HTTP from the raw GitHub URLs below; the
# training file is read first, then the test file.
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

train_data, test_data = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

# One-hot encode categorical variables for train_data
# The categorical (object-dtype) columns are taken from the training frame once
# and reused for the test frame, so both frames are encoded from the same
# columns, in the same order, that the encoder was fitted on.
categorical_cols = train_data.select_dtypes(include=['object']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later removed;
# try the new keyword first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = encoder.fit_transform(train_data[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out` (and removed in
# scikit-learn 1.2); use whichever method this installation provides.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out(categorical_cols)
else:
    feature_names = encoder.get_feature_names(input_features=categorical_cols)

# Reuse the source frame's index so the column-wise concat lines up row for row.
train_encoded = pd.DataFrame(encoded_features, columns=feature_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

# One-hot encode test data
# Only `transform` is called here: the categories learned from the training
# frame define the encoded columns.
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=feature_names, index=test_data.index)
test_data = test_data.drop(test_data.select_dtypes(include=['object']).columns, axis=1)
test_data = pd.concat([test_data, test_encoded], axis=1)

# Handle missing values
# Both frames are imputed with the training medians. The medians are computed
# once: filling a column's gaps with its own median leaves that median unchanged.
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Ensure both train and test data have the same columns
# Reindexing adds any training feature that is absent from the test frame as a
# zero-filled column and orders the columns like the training features, leaving
# out the 'hospital_death' target.
feature_columns = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Splitting data into training and validation sets
# The split is stratified on the target so the 20% validation set keeps the
# same class proportions as the full data; the target is imbalanced (hence the
# SMOTE step below), and an unstratified split can skew the minority share.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop('hospital_death', axis=1),
    train_data['hospital_death'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['hospital_death'],
)

# Handle class imbalance using SMOTE
# Only the training split is resampled; X_val / y_val are left untouched.
# NOTE(review): the grid searches further down cross-validate on this already
# resampled X_train, so their internal CV scores may be optimistic.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize the data
# The scaler is fitted on the (resampled) training features only and then
# applied unchanged to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Decision Tree with hyperparameter tuning
# Exhaustive grid search (cv=5) over depth and split/leaf size limits, fitted
# on the unscaled training features.
param_grid_tree = dict(
    max_depth=[5, 10, 15],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[3, 5, 7],
)
clf_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_tree,
    cv=5,
).fit(X_train, y_train)
print("Decision Tree Best Parameters:", clf_tree.best_params_)

# Evaluate the tuned tree on the held-out validation split.
y_val_pred_tree = clf_tree.predict(X_val)
print(f"Decision Tree Accuracy on validation set: {accuracy_score(y_val, y_val_pred_tree) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_tree))

# Naive Bayes
# No grid search here: the model is fitted with its default settings on the
# unscaled training features, and its parameter dict is printed for reference.
clf_nb = GaussianNB().fit(X_train, y_train)
print("\nNaive Bayes Parameters:", clf_nb.get_params())

# Evaluate on the held-out validation split.
y_val_pred_nb = clf_nb.predict(X_val)
print(f"Naive Bayes Accuracy on validation set: {accuracy_score(y_val, y_val_pred_nb) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_nb))

# KNN with hyperparameter tuning
# Grid search (cv=5) over neighbour count, vote weighting and distance metric.
# KNN is fitted on the standardized features, unlike the tree and naive Bayes
# models in this cell.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
clf_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=5,
).fit(X_train_scaled, y_train)
print("\nKNN Best Parameters:", clf_knn.best_params_)

# Evaluate on the standardized validation split.
y_val_pred_knn = clf_knn.predict(X_val_scaled)
print(f"KNN Accuracy on validation set: {accuracy_score(y_val, y_val_pred_knn) * 100:.2f}%")
print(classification_report(y_val, y_val_pred_knn))

# Nothing in this cell computes test-set probabilities or writes a CSV file,
# so the closing message reports only what actually happened.
print("\nValidation metrics for all three models are printed above; no test-set prediction CSV files were written by this cell.")