In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    VotingClassifier, 
    BaggingClassifier, 
    ExtraTreesClassifier, 
    RandomForestClassifier
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, NuSVC
from sklearn.linear_model import (
    LogisticRegression, 
    SGDClassifier, 
    RidgeClassifier, 
    RidgeClassifierCV,
    PassiveAggressiveClassifier, 
    Perceptron
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Support Vector Machine
from sklearn.svm import LinearSVC
# LightGBM and XGBoost
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

In [5]:
#============Load Data================#
# Set Data Path
main_dir = "/Users/lvwei/Desktop/widsdatathon2025/"

# Load data
df_solu = pd.read_excel(f"{main_dir}TRAIN/dataset/TRAINING_SOLUTIONS.xlsx")
df_quan = pd.read_excel(f"{main_dir}TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")
df_conn = pd.read_csv(f"{main_dir}TRAIN/dataset/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")  
df_cate = pd.read_excel(f"{main_dir}TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
# Load Test Data
test_cate = pd.read_excel(f"{main_dir}TRAIN/dataset/TEST_CATEGORICAL.xlsx")
test_quan = pd.read_csv(f"{main_dir}TRAIN/dataset/Corrected_Filled_TEST_Data.csv")
test_conn = pd.read_csv(f"{main_dir}TRAIN/dataset/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [120]:
#=======Merge data============== Use only one dataset #
df_combined = pd.merge(df_solu, df_quan, on="participant_id")
test_combined = test_quan.copy()

In [6]:
#=========Merge data============ More than one set#
# Merge datasets on participant_id
df_combined = pd.merge(df_solu, df_quan, on="participant_id")
df_combined = pd.merge(df_combined, df_cate, on="participant_id")
df_combined = pd.merge(df_combined, df_conn, on="participant_id")

test_combined = pd.merge(test_cate, test_quan, on="participant_id")
test_combined = pd.merge(test_combined, test_conn, on="participant_id")

In [71]:
#========Dont need standard=========#
common_features = [col for col in df_combined.columns 
                  if col not in ["participant_id", "ADHD_Outcome", "Sex_F"]]
X = df_combined[common_features]
test_features = test_combined[common_features]  
y_all = df_combined[["ADHD_Outcome", "Sex_F"]]
X_train, X_val, y_train_all, y_val_all = train_test_split(
    X, y_all, test_size=0.1, random_state=42, stratify=y_all["ADHD_Outcome"]
)

# Then separate out the labels after the split
y_train_adhd = y_train_all["ADHD_Outcome"]
y_val_adhd   = y_val_all["ADHD_Outcome"]
y_train_sex_f = y_train_all["Sex_F"]
y_val_sex_f   = y_val_all["Sex_F"]

# erase NA 
imputer = SimpleImputer(strategy="median")

X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_val_imputed = pd.DataFrame(
    imputer.transform(X_val),
    columns=X_val.columns,
    index=X_val.index
)

test_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
    index=test_features.index
)


X_train_real = X_train_imputed.copy()
X_val_real = X_val_imputed.copy()
test_real = test_imputed.copy()

In [11]:
#==========Standard============#
common_features = [col for col in df_combined.columns 
                  if col not in ["participant_id", "ADHD_Outcome", "Sex_F"]]
X = df_combined[common_features]
test_features = test_combined[common_features]  
y_all = df_combined[["ADHD_Outcome", "Sex_F"]]
X_train, X_val, y_train_all, y_val_all = train_test_split(
    X, y_all, test_size=0.001, random_state=42, stratify=y_all["ADHD_Outcome"]
)

# Then separate out the labels after the split
y_train_adhd = y_train_all["ADHD_Outcome"]
y_val_adhd   = y_val_all["ADHD_Outcome"]
y_train_sex_f = y_train_all["Sex_F"]
y_val_sex_f   = y_val_all["Sex_F"]

# erase NA 
imputer = SimpleImputer(strategy="mean")

X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_val_imputed = pd.DataFrame(
    imputer.transform(X_val),
    columns=X_val.columns,
    index=X_val.index
)

test_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
    index=test_features.index
)


scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed),
    columns=X_train_imputed.columns,
    index=X_train_imputed.index
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val_imputed),
    columns=X_val_imputed.columns,
    index=X_val_imputed.index
)
test_scaled = pd.DataFrame(
    scaler.transform(test_imputed),
    columns=test_imputed.columns,
    index=test_imputed.index
)
X_train_real = X_train_imputed.copy()
X_val_real = X_val_imputed.copy()
test_real = test_imputed.copy()

In [12]:
#============Define Model Stacking Classifiers======#
adhd_models_standard = [
    ("calib",  CalibratedClassifierCV(estimator=LogisticRegression(), cv=3)), #0.69, 0.56
    ("knn",    KNeighborsClassifier()),#0.69, 0.56
    ("lda",    LinearDiscriminantAnalysis()),#0.69, 0.56
    ("lr",     LogisticRegression(max_iter=1000)),#0.68 #0.56
    ("nc",     NearestCentroid()), #0.67 0.58
    ("nusvc",  NuSVC()), #0.68 0.58
    ("ridge",  RidgeClassifier()), #0.67 0.59
    ("ridgecv",RidgeClassifierCV()),#0.67 0.59
    ("sgd",    SGDClassifier()),#0.67 0.59
    #("svc",    SVC(probability=False)),
    ("gnb",    GaussianNB()),  #0.68 0.65
]

adhd_models = [
    ("bag",    BaggingClassifier()), #0.64 0.62
    ("bnb",    BernoulliNB()), #0.67 0.58
    ("dummy",  DummyClassifier(strategy="most_frequent")), #0.69 0.56
    ("etc",    ExtraTreesClassifier()),#0.69 0.57
    ("lgb",    LGBMClassifier()), #0.68 0.56
    ("rf",     RandomForestClassifier()), #0.69 0.58
    ("xgb",    XGBClassifier(use_label_encoder=False, eval_metric="logloss")), #0.68 0.58

]

# =============================================================
# Sex Models
# =============================================================
sex_models_standard = [
    #("calib",  CalibratedClassifierCV(estimator=LogisticRegression(), cv=3)), #0.66 0.52 low F1 
    ("lr",     LogisticRegression(max_iter=1000)), # 0.72, 0.67
    ##("nc",     NearestCentroid()), #0.74 0.71 KEEP
    ("nusvc",  NuSVC()), # 0.66 0.52 low F1 
    ("ridge",  RidgeClassifier()), #0.73, 0.69
    ("ridgecv",RidgeClassifierCV()), #0.73, 0.69
    ("sgd",    SGDClassifier()), # 0.74, 0.71 KEEP
    ("svc",    SVC(probability=False)), # 0.66, 0.52 low F1 
    ("linear_svc",  LinearSVC(C=1.0, max_iter=1000)), #0.73, 0.73 KEEP
    ##("pa_classifier",  PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)), #0.71, 0.72 KEEP
    ##("perceptron",  Perceptron(eta0=0.1, penalty='l2')), #0.70 0.73 KEEP
    
    #("label_prop",  LabelPropagation(kernel='rbf')),
    #("label_spread",  LabelSpreading(alpha=0.2)),
    #("lda",  LinearDiscriminantAnalysis()), #0.66 0.52 low F1
    #("gnb",    GaussianNB()), #0.7, 0.66
]

sex_models = [

    ("bnb",    BernoulliNB()), #0.7, 0.66
    ("dummy",  DummyClassifier(strategy="most_frequent")), #0.66, 0.52 low F1
    ("etc",    ExtraTreesClassifier()),# 0.58, 0.58 low F1
    ("lgb",    LGBMClassifier()), #0.67, 0.56
    ("rf",     RandomForestClassifier()), #0.65, 0.53
    ("xgb",    XGBClassifier(use_label_encoder=False, eval_metric="logloss")),  #0.67, 0.6

]
adhd_stack = StackingClassifier(
    estimators=adhd_models_standard,
    final_estimator = LogisticRegression(max_iter=1000),
    stack_method='auto'
)

sex_stack = StackingClassifier(
    estimators=sex_models_standard,
    final_estimator = LogisticRegression(max_iter=1000),
    stack_method='auto'
)


In [117]:
adhd_stack

In [62]:
sex_stack

In [13]:
#=========Train Models===========#
sample_weight = np.where(
    (y_train_adhd == 1) & (y_train_sex_f == 1),
    2.0, 1.0
)
#adhd_stack.fit(X_train_real, y_train_adhd)
sex_stack.fit(X_train_real, y_train_sex_f,sample_weight=sample_weight)

# Evaluate on Training Set


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
#========Make Predictions on Test Set=======#
#adhd_pred = adhd_stack.predict(test_real)
sex_pred = sex_stack.predict(test_real)

#print("Test predictions for ADHD:", adhd_pred)
print("Test predictions for gender:", sex_pred)

Test predictions for gender: [1 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0
 1 1 1 1 0 1 1 0 0 1 0 0 1 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0
 0 0 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1
 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1
 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 1 0
 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1
 0 0 1 1 0 1 0 0]


In [38]:
#=======Create Submission File=====#
submission = pd.DataFrame({
    'participant_id': test_combined['participant_id'],
    #'ADHD_Outcome': adhd_pred,
    'Sex_F': sex_pred,
})
# Ensure correct data type
#submission["ADHD_Outcome"] = submission["ADHD_Outcome"].astype(int)
submission["Sex_F"] = submission["Sex_F"].astype(int)

# Save Submission

submission.to_csv(f"{main_dir}submission_try23.csv", index=False)
print("\nSubmission file saved successfully.")




Submission file saved successfully.
