In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Stacking ensemble without dimensionality reduction: load the data, one-hot
# encode, impute, standardize, fit a StackingClassifier, report validation
# accuracy and write test-set probabilities to target-stacking.csv.

# Load datasets
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Keep the test RecordIDs for the output file *before* the column is dropped;
# reading test_data['RecordID'] after the drop raises a KeyError.
record_ids = test_data['RecordID'].astype(int)

# Drop RecordID so it is not used as a model input
train_data = train_data.drop(columns=['RecordID'])
test_data = test_data.drop(columns=['RecordID'])

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical variables; the encoder is fitted on train only.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and newer releases no
# longer accept `sparse`, so try the new keyword first and fall back.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

encoded_features = encoder.fit_transform(train_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
# Reuse the source frame's index so the column-wise concat aligns row by row
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(categorical_cols, axis=1), train_encoded], axis=1)

# Apply the already-fitted encoder to the test data.
# Note: encoding runs before imputation, so missing categorical values reach
# the encoder as-is; only values still missing afterwards are imputed below.
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(categorical_cols, axis=1), test_encoded], axis=1)

# Handle missing values: fill both frames with the per-column mode of train
fill_values = train_data.mode().iloc[0]
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(fill_values)

# Ensure test has exactly the train feature columns, in the same order;
# any column absent from test is created and filled with 0.
feature_cols = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_cols, fill_value=0)

# Use StratifiedShuffleSplit for a single stratified 80/20 train-validation split
features = train_data.drop('hospital_death', axis=1)
target = train_data['hospital_death']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(features, target))
X_train, X_val = features.iloc[train_index], features.iloc[val_index]
y_train, y_val = target.iloc[train_index], target.iloc[val_index]

# Standardize the data (scaler statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Stacking: four tree-based base learners, CatBoost as the final estimator
estimators = [
    ('xgb', XGBClassifier(random_state=42)),
    ('catboost', CatBoostClassifier(random_state=42, verbose=0)),
    ('extree', ExtraTreesClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=CatBoostClassifier(random_state=42, verbose=0),
)
stacking_clf.fit(X_train_scaled, y_train)

# Predict and compute accuracy on the validation set
y_val_pred = stacking_clf.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with stacking: {val_accuracy * 100:.2f}%")

# Predict on the test set (probability of the positive class)
y_test_proba = stacking_clf.predict_proba(test_data_scaled)[:, 1]

# Save the predictions to a CSV file with maximum precision
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-stacking.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-stacking.csv")

Stacking with PCA (standardized features reduced to the components that retain 95% of the variance)

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# Stacking ensemble on PCA-reduced features: load the data, one-hot encode,
# impute, standardize, apply PCA, fit a StackingClassifier, report validation
# accuracy and write test-set probabilities to target-stacking.csv.

# Load datasets
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Store the 'RecordID' column in a separate variable before dropping it
record_ids = test_data['RecordID'].astype(int)

# Drop RecordID so it is not used as a model input
train_data = train_data.drop(columns=['RecordID'])
test_data = test_data.drop(columns=['RecordID'])

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical variables; the encoder is fitted on train only.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and newer releases no
# longer accept `sparse`, so try the new keyword first and fall back.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

encoded_features = encoder.fit_transform(train_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
# Reuse the source frame's index so the column-wise concat aligns row by row
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(categorical_cols, axis=1), train_encoded], axis=1)

# Apply the already-fitted encoder to the test data.
# Note: encoding runs before imputation, so missing categorical values reach
# the encoder as-is; only values still missing afterwards are imputed below.
encoded_features_test = encoder.transform(test_data[categorical_cols])
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(categorical_cols, axis=1), test_encoded], axis=1)

# Handle missing values: fill both frames with the per-column mode of train
fill_values = train_data.mode().iloc[0]
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(fill_values)

# Ensure test has exactly the train feature columns, in the same order;
# any column absent from test is created and filled with 0.
feature_cols = train_data.columns.drop('hospital_death')
test_data = test_data.reindex(columns=feature_cols, fill_value=0)

# Use StratifiedShuffleSplit for a single stratified 80/20 train-validation split
features = train_data.drop('hospital_death', axis=1)
target = train_data['hospital_death']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(features, target))
X_train, X_val = features.iloc[train_index], features.iloc[val_index]
y_train, y_val = target.iloc[train_index], target.iloc[val_index]

# Standardize first (scaler statistics come from the training split only) ...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# ... then reduce dimensionality with PCA fitted on the scaled training split.
# A fractional n_components keeps enough components to explain that share
# (here 95%) of the variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
test_data_pca = pca.transform(test_data_scaled)

# Stacking: four tree-based base learners, CatBoost as the final estimator
estimators = [
    ('xgb', XGBClassifier(n_estimators=150, random_state=42)),
    ('catboost', CatBoostClassifier(n_estimators=150, verbose=0, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=150, random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=150, random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=CatBoostClassifier(n_estimators=150, verbose=0, random_state=42),
)
stacking_clf.fit(X_train_pca, y_train)

# Predict and compute accuracy on the validation set
y_val_pred = stacking_clf.predict(X_val_pca)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with stacking: {val_accuracy * 100:.2f}%")

# Predict on the test set (probability of the positive class)
y_test_proba = stacking_clf.predict_proba(test_data_pca)[:, 1]

# Save the predictions to a CSV file with maximum precision.
# NOTE(review): the non-PCA cell earlier in this notebook targets the same
# file name, so running both leaves only this cell's predictions on disk.
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-stacking.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-stacking.csv")