Import modules & load data

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

# Load the competition data; paths are relative to the working directory.
train = pd.read_csv("data/train.csv")
test  = pd.read_csv("data/test.csv")

# Row identifiers of the test set, kept aside for the submission file.
test_ids = test["trustii_id"].copy()

TARGETS = ["OUTCOME SEVERITY", "OUTCOME MACE"]

# Columns that identify a row rather than describe it. The test file names its
# identifier "trustii_id"; dropping only "ID" would be a silent no-op under
# errors="ignore" when no such column exists, leaving the identifier in the
# feature matrix. Both spellings are listed so either one is removed.
ID_COLUMNS = ["ID", "trustii_id"]

# Feature matrix: everything except the targets and identifier columns.
X = train.drop(columns=TARGETS + ID_COLUMNS, errors="ignore")
y_sev  = train["OUTCOME SEVERITY"]
y_mace = train["OUTCOME MACE"]

# Same columns in the same order as the training features; raises KeyError if
# the test file lacks any of them.
X_test = test[X.columns]


Imputation

In [None]:
# Median imputation: the per-column medians are learned from the training
# features only and then applied to both the training and the test features.
imputer = SimpleImputer(strategy="median")
imputer.fit(X)

# The transformer returns plain arrays, so wrap them back into DataFrames
# carrying the original column names.
X_imp = pd.DataFrame(
    imputer.transform(X),
    columns=X.columns,
)
X_test_imp = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
)

SNP SELECTION + PRS

In [None]:
# Candidate SNP columns are those named "SNP<number>"; only those whose number
# is at most MAX_PRIORITY_SNP take part in the selection.
MAX_PRIORITY_SNP = 75
snp_cols = [c for c in X.columns if c.startswith("SNP")]
priority_snps = [c for c in snp_cols if int(c[3:]) <= MAX_PRIORITY_SNP]

# --- Logistic Regression importance
# Standardize the SNPs so coefficient magnitudes are comparable across columns.
# The scaler is fit on the training rows and reused as-is for the test rows.
scaler = StandardScaler()
X_scaled_snps = pd.DataFrame(
    scaler.fit_transform(X_imp[priority_snps]),
    columns=priority_snps
)
X_test_scaled_snps = pd.DataFrame(
    scaler.transform(X_test_imp[priority_snps]),
    columns=priority_snps
)

lr = LogisticRegression(
    solver="liblinear",
    max_iter=2000
)
lr.fit(X_scaled_snps, y_sev)

# Signed coefficients (first row of coef_) are kept for the PRS; their absolute
# value is the importance used for ranking.
# NOTE(review): taking coef_[0] assumes y_sev is binary - confirm.
lr_coef = pd.Series(lr.coef_[0], index=priority_snps)
lr_importance = lr_coef.abs()

# --- LightGBM importance
lgb_fs = lgb.LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1
)
lgb_fs.fit(X_imp[priority_snps], y_sev)

lgb_importance = pd.Series(
    lgb_fs.feature_importances_,
    index=priority_snps
)

# --- Consensus ranking
# Each importance is divided by its maximum (epsilon guards against a zero
# maximum), then the two are averaged with equal weight.
lr_norm  = lr_importance / (lr_importance.max() + 1e-9)
lgb_norm = lgb_importance / (lgb_importance.max() + 1e-9)

consensus = 0.5 * lr_norm + 0.5 * lgb_norm

TOP_K = 40
top_snps = consensus.sort_values(ascending=False).head(TOP_K).index.tolist()

# --- Build PRS
# The score is a weighted sum of the selected SNPs. The weights are the SIGNED
# logistic-regression coefficients, so SNPs with a negative coefficient lower
# the score instead of raising it. They are applied to the standardized SNP
# values, i.e. the scale on which the coefficients were estimated.
weights = lr_coef[top_snps]

X_imp["PRS"] = X_scaled_snps[top_snps].values @ weights.values
X_test_imp["PRS"] = X_test_scaled_snps[top_snps].values @ weights.values

FEATURE SETS

In [None]:
# Severity model inputs: the PRS, the selected SNPs and two baseline columns.
sev_features = ["PRS", *top_snps, "Age_Baseline", "Genre"]

# MACE model inputs: everything the severity model uses, followed by five
# additional columns.
mace_features = [
    *sev_features,
    "Epaiss_max",
    "Gradient",
    "FEVG",
    "TVNS",
    "SYNCOPE",
]


SEVERITY MODEL (WEIGHTED)

In [None]:
# Hyper-parameters of the severity classifier; class 0 is given a larger
# weight (1.5) than class 1 (1.0).
sev_params = dict(
    n_estimators=900,
    learning_rate=0.02,
    num_leaves=31,
    class_weight={0: 1.5, 1: 1.0},
    verbose=-1,
)
sev_model = lgb.LGBMClassifier(**sev_params)

# Train on every imputed training row, restricted to the severity features.
X_sev_train = X_imp[sev_features]
X_sev_test = X_test_imp[sev_features]
sev_model.fit(X_sev_train, y_sev)

# The prediction kept for each test row is the second column of the
# probability matrix, not a hard class label.
sev_proba = sev_model.predict_proba(X_sev_test)
sev_pred = sev_proba[:, 1]

MACE MODEL (ORDINAL via REGRESSION)

In [None]:
def _scores_to_classes(scores, low_cut, high_cut):
    """Turn continuous regression scores into class labels 0, 1 or 2.

    A score at or above ``high_cut`` becomes 2; otherwise a score at or above
    ``low_cut`` becomes 1; everything else becomes 0.
    """
    return np.where(scores >= high_cut, 2, np.where(scores >= low_cut, 1, 0))


# Hold out a stratified 25% of the training rows for threshold tuning.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_imp[mace_features], y_mace,
    test_size=0.25, random_state=42, stratify=y_mace,
)

# The MACE target is modelled with a regressor; its continuous output is cut
# into classes afterwards. The model is fit on the 75% split only.
mace_model = lgb.LGBMRegressor(
    n_estimators=1200, learning_rate=0.02, num_leaves=31, verbose=-1
)
mace_model.fit(X_tr, y_tr)

# --- Threshold tuning for QWK
val_scores = mace_model.predict(X_val)

# Fallback thresholds, kept if no grid point scores above the initial -1.
best_qwk = -1
best_t1, best_t2 = 0.7, 1.5

# Exhaustive grid over both cut points, lower cut varying slowest. The strict
# ">" keeps the first pair reaching the best score.
t1_grid = np.arange(0.4, 1.2, 0.05)
t2_grid = np.arange(1.2, 2.0, 0.05)
for t1, t2 in ((lo, hi) for lo in t1_grid for hi in t2_grid):
    preds = _scores_to_classes(val_scores, t1, t2)
    qwk = cohen_kappa_score(y_val, preds, weights="quadratic")
    if qwk > best_qwk:
        best_qwk = qwk
        best_t1, best_t2 = t1, t2

print(f"Best QWK (val): {best_qwk:.4f}")
print(f"Best thresholds: t1={best_t1:.2f}, t2={best_t2:.2f}")

# --- Predict test
# Apply the tuned cut points to the regressor's scores on the test rows.
mace_scores = mace_model.predict(X_test_imp[mace_features])
mace_pred = _scores_to_classes(mace_scores, best_t1, best_t2)

SUBMISSION

In [None]:
# Assemble the submission: one row per test identifier, with the MACE class
# label and the severity probability computed above.
submission = pd.DataFrame({
    "trustii_id": test_ids,
    "OUTCOME MACE": mace_pred,
    "OUTCOME SEVERITY": sev_pred
})

# Written without the DataFrame index so the file has only the three columns.
submission.to_csv("submission.csv", index=False)

# The check-mark was stored as mojibake ("âœ…"); restored to the intended
# character.
print("✅ submission.csv created successfully")