In [1]:
import random
import torch 
import os
import numpy as np
RANDOM_STATE_1 = 1998
RANDOM_STATE_2 = 42
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(RANDOM_STATE_1)

Random seed set as 1998


# Part 1: preprocessing data

## Read data

In [2]:
import pandas as pd
train_data_dir = "/kaggle/input/widsdatathon2025/TRAIN_NEW"
train_solutions = pd.read_excel(train_data_dir + "/TRAINING_SOLUTIONS.xlsx")
train_categorical = pd.read_excel(train_data_dir + "/TRAIN_CATEGORICAL_METADATA_new.xlsx")
train_functional = pd.read_csv(train_data_dir + \
                               "/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
train_metadata = pd.read_excel(train_data_dir + "/TRAIN_QUANTITATIVE_METADATA_new.xlsx")

test_data_dir = "/kaggle/input/widsdatathon2025/TEST"
test_functional = pd.read_csv(test_data_dir + \
                               "/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [3]:
print("train_solutions:", train_solutions.shape, 
      "\n train_solutions key", train_solutions.keys())
print("train_functional:", train_functional.shape,
     "\n train_functional key", train_functional.keys())
print("train_metadata:", train_metadata.shape,
     "\n train_metadata", train_metadata.keys())
print("train_categorical:", train_categorical.shape,
     "\n train_categorical", train_categorical.keys())

train_solutions: (1213, 3) 
 train_solutions key Index(['participant_id', 'ADHD_Outcome', 'Sex_F'], dtype='object')
train_functional: (1213, 19901) 
 train_functional key Index(['participant_id', '0throw_1thcolumn', '0throw_2thcolumn',
       '0throw_3thcolumn', '0throw_4thcolumn', '0throw_5thcolumn',
       '0throw_6thcolumn', '0throw_7thcolumn', '0throw_8thcolumn',
       '0throw_9thcolumn',
       ...
       '195throw_196thcolumn', '195throw_197thcolumn', '195throw_198thcolumn',
       '195throw_199thcolumn', '196throw_197thcolumn', '196throw_198thcolumn',
       '196throw_199thcolumn', '197throw_198thcolumn', '197throw_199thcolumn',
       '198throw_199thcolumn'],
      dtype='object', length=19901)
train_metadata: (1213, 19) 
 train_metadata Index(['participant_id', 'EHQ_EHQ_Total', 'ColorVision_CV_Score',
       'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
       'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficult

## Construct cor variance matrix

In [4]:
!pip install -q torch_geometric
!pip install -q geomstats

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25h

In [5]:
# n_regions = 200
# n_samples = train_functional.shape[0]
# adj_mat = np.zeros((200,200))

# start_idx = 0

# def construct_corvar(df_row, n_regions):
#     adj = np.ones((n_regions, n_regions))
#     num_adj = n_regions - 1
#     start_idx = 0
#     for j in range(n_regions):
#         for i in range(n_regions - num_adj, num_adj):
#             adj[i][j] = df_row.iloc[start_idx + i]
#         num_adj -= 1
#         start_idx += num_adj
#     return adj 

# train_data = []

# for i in n_samples:
#     adj = construct_corvar(train_functional.iloc[i])
#     train_data.append(adj)

## Construct symmetric matrices

In [6]:
import geomstats.datasets.utils as data_utils
import geomstats.backend as gs
from geomstats.geometry.skew_symmetric_matrices import SkewSymmetricMatrices

def load_connectomes(df_conn, df_soln_adhd=train_solutions, as_vectors=False, test=False):
        patient_id = gs.array(df_conn["participant_id"])
        data = gs.array(df_conn.drop('participant_id', axis=1))
        targets = gs.array(df_soln_adhd[['ADHD_Outcome', 'Sex_F']])
        if test==True:
            return mat, patient_id
        if as_vectors:
            target_ADHD = targets[:,0]
            target_sex = targets[:,1]
            return data, patient_id, target_ADHD, target_sex

        mat = SkewSymmetricMatrices(200).matrix_representation(data)
        mat = gs.eye(200) - gs.transpose(gs.tril(mat), (0,2,1))
        mat = 1.0/2.0 * (mat + gs.transpose(mat, (0,2,1)))
        return mat, patient_id, targets

In [7]:
data, patient_id, labels = load_connectomes(train_functional, train_solutions)

from geomstats.geometry.spd_matrices import SPDMatrices
manifold = SPDMatrices(200, equip=False)
print(gs.all(manifold.belongs(data)))
count_false = np.sum(~(manifold.belongs(data)))
print("Count of False:", count_false)

True
Count of False: 0


## Ensuring SPD Property

In [8]:
def add_diagonal_correction(matrix):
    eigenvalues = np.linalg.eigvals(matrix)
    min_eigenvalue = np.min(eigenvalues)

    if min_eigenvalue < 0:
        correction = -min_eigenvalue + 1e-6
        correction_matrix = correction * np.eye(matrix.shape[0])
        return matrix + correction_matrix
    else:
        return matrix

data_corrected = np.array([add_diagonal_correction(slice) for slice in data])
print("Original Matrix shape:", data.shape)
print("Corrected Matrix shape:", data_corrected.shape)
print(gs.all(manifold.belongs(data_corrected)))

Original Matrix shape: (1213, 200, 200)
Corrected Matrix shape: (1213, 200, 200)
True


# Part 2: GCN

## Data preparation

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

connectivity_matrices = torch.tensor(data_corrected).float()
labels = torch.tensor(labels).float()

data_list = []
for i in range(len(connectivity_matrices)):
    matrix = connectivity_matrices[i]
    edge_index = (matrix > 0).nonzero(as_tuple=False).t()
    edge_attr = matrix[edge_index[0], edge_index[1]]
    x = torch.eye(200)

    graph_data = Data(x=x, edge_index=edge_index,
                     edge_attr=edge_attr, y=labels[i].unsqueeze(0))

    data_list.append(graph_data)

train_data_list = data_list

## Define models

In [10]:
class GCN(torch.nn.Module):
    def __init__(self, num_layers):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels=200,out_channels=128)
        self.in_channels = 128
        self.num_layers = num_layers
        self.layers = nn.ModuleList()
        hidden_dim = self.in_channels
        for i in range(num_layers):
            if (i == num_layers-1):
                self.layers.append(GCNConv(in_channels=hidden_dim, out_channels=2))
            else:
                out_dim = max(hidden_dim // 2, 1)
                self.layers.append(GCNConv(in_channels=hidden_dim, out_channels=out_dim))
                hidden_dim = out_dim   
            
    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        for layer in self.layers:
            x = layer(x, edge_index, edge_attr)
            x = F.relu(x)

        x = global_mean_pool(x, batch)
        return x

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## Hyperparameters and training 

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tuning = False
n_trials = 1

In [13]:
!pip install -q iterative-stratification
!pip install -q optuna

In [19]:
from sklearn.metrics import f1_score

def evaluate_model(model, data_loader, device, log=False):
    model.eval()
    actual_adhd, pred_adhd = [], []
    actual_sex, pred_sex = [], []

    with torch.no_grad():
        for batch_data in data_loader:
            batch_data = batch_data.to(device)
            out = model(batch_data)
            preds = torch.round(torch.sigmoid(out))

            actual_adhd.extend(batch_data.y[:,0].int().cpu().tolist())
            pred_adhd.extend(preds[:,0].int().cpu().tolist())
            actual_sex.extend(batch_data.y[:,1].int().cpu().tolist())
            pred_sex.extend(preds[:,1].int().cpu().tolist())

    f1_adhd = f1_score(actual_adhd, pred_adhd)
    f1_sex = f1_score(actual_sex, pred_sex)
    avg_f1 = (f1_adhd + f1_sex) / 2.0
    if log:
        print(f"F1 ADHD: {f1_adhd:.4f}, F1 Sex: {f1_sex:.4f}, Avg F1: {avg_f1:.4f}")
    return f1_adhd, f1_sex, avg_f1

def evaluate_model_debug(model, data_loader, device, log=False):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_data in data_loader:
            batch_data = batch_data.to(device)
            out = model(batch_data)
            preds = torch.sigmoid(out)
            all_preds.append(preds.cpu())
    all_preds = torch.cat(all_preds, dim=0)
    if log:
        print("Raw predictions (first 10):", all_preds[:1])
    return all_preds

In [40]:
def training(model, optimizer, criterion, train_loader, val_loader, device, num_epochs=200, patience=50):
    losses = []
    best_val_score = 0
    epochs_no_improve = 0
    softmax = torch.nn.Softmax(dim=1)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = model(data)
            out = softmax(out)
            loss_ADHD = criterion(out[:,0], data.y[:,0])
            loss_sex = criterion(out[:,1], data.y[:,1])
            loss = loss_ADHD + loss_sex
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        losses.append(avg_loss)
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")
            # print(f"Evaluating train in epoch {epoch}...")
            # f1_adhd_tr, f1_sex_tr, avg_f1_tr = evaluate_model(model, train_loader, device, log=True)
            print(f"Evaluating val in epoch {epoch}...")
            f1_adhd_val, f1_sex_val, avg_f1_val = evaluate_model(model, val_loader, device, log=True)
        else: 
            f1_adhd_val, f1_sex_val, avg_f1_val = evaluate_model(model, val_loader, device)
        
        if avg_f1_val > best_val_score:
            best_val_score = avg_f1_val
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch}")
            break

    return model, f1_adhd_val, f1_sex_val, avg_f1_val

In [16]:
import optuna
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def objective(trial):
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-1)
    num_layers = trial.suggest_int("num_layers", 1,4)
    batch_size = trial.suggest_categorical("batch_size", [16,32,64])
    num_epochs = trial.suggest_int("num_epochs", 50, 200)
    patience = trial.suggest_int("patience",10,50)
    labels_np = labels.cpu().numpy()
    indices = np.arange(len(train_data_list))

    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in mskf.split(indices, labels_np):
        train_fold = [train_data_list[i] for i in train_idx]
        val_fold = [train_data_list[i] for i in val_idx]

        train_loader = DataLoader(train_fold, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_fold, batch_size=batch_size, shuffle=True)
        model = GCN(num_layers=num_layers).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = torch.nn.BCEWithLogitsLoss()
        model, f1_adhd_val, f1_sex_val, avg_f1_val = training(model, optimizer, criterion, train_loader, val_loader, \
                                                              device, num_epochs=num_epochs, patience=patience)
        fold_scores.append(avg_f1_val)
    return np.mean(fold_scores)       

In [42]:
if tuning:
    best_params = study.best_trial.params
    lr = best_params["lr"]
    num_layers = best_params["num_layers"]
    batch_size = best_params["batch_size"]
    num_epochs = best_params["num_epochs"]
    patience = best_params["patience"]

else: 
    lr = 0.01
    num_layers = 4
    num_epochs = 300
    batch_size = 32
    patience = 100

In [43]:
n_splits=5
mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_scores = []
labels_np = labels.cpu().numpy()
indices = np.arange(len(train_data_list))

for train_idx, val_idx in mskf.split(indices, labels_np):
    train_fold = [train_data_list[i] for i in train_idx]
    val_fold = [train_data_list[i] for i in val_idx]

    train_loader = DataLoader(train_fold, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_fold, batch_size=batch_size, shuffle=True)
    model = GCN(num_layers=num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    model, f1_adhd_val, f1_sex_val, avg_f1_val = training(model, optimizer, criterion, train_loader, val_loader, \
                                                              device, num_epochs=num_epochs, patience=patience)
    fold_scores.append(avg_f1_val)
    
print(f"Avg f1-score between {n_splits} folds", np.mean(fold_scores))

Epoch 0, Loss: 111.2305
Evaluating val in epoch 0...
F1 ADHD: 0.8137, F1 Sex: 0.5108, Avg F1: 0.6622
Epoch 10, Loss: 111.1954
Evaluating val in epoch 10...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 20, Loss: 111.2988
Evaluating val in epoch 20...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 30, Loss: 111.0232
Evaluating val in epoch 30...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 40, Loss: 111.2299
Evaluating val in epoch 40...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 50, Loss: 111.1610
Evaluating val in epoch 50...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 60, Loss: 111.1265
Evaluating val in epoch 60...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 70, Loss: 111.1610
Evaluating val in epoch 70...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 80, Loss: 111.1265
Evaluating val in epoch 80...
F1 ADHD: 0.8137, F1 Sex: 0.0000, Avg F1: 0.4069
Epoch 90, Loss: 111.1610
Evaluating val in epoch 90...
F1 ADHD: 0.8137, F1 

KeyboardInterrupt: 

## Performance on train

In [None]:
# from sklearn.metrics import accuracy_score, \
#                             f1_score

# actual_labels_ADHD = []
# predicted_labels_ADHD = []
# actual_labels_sex = []
# predicted_labels_sex = []

# model.eval()
# with torch.no_grad():
#     for batch_data in train_loader:
#         batch_data = batch_data.to(device)
#         out = model(batch_data)
#         # Apply sigmoid and round for binary predictions
#         pred = torch.round(torch.sigmoid(out))
#         # Assume batch_data.y has shape [batch_size, 2]; extract each target column
#         actual_labels_ADHD.extend(batch_data.y[:, 0].int().tolist())
#         predicted_labels_ADHD.extend(pred[:, 0].int().tolist())
#         actual_labels_sex.extend(batch_data.y[:, 1].int().tolist())
#         predicted_labels_sex.extend(pred[:, 1].int().tolist())

# accuracy_ADHD = accuracy_score(actual_labels_ADHD, predicted_labels_ADHD)
# accuracy_sex = accuracy_sex(actual_labels_sex, predicted_labels_sex)
# accuracy = (accuracy_ADHD + accuracy_sex)/2

# f1_ADHD = f1_score(actual_labels_ADHD, predicted_labels_ADHD)
# f1_sex = f1_sex(actual_labels_sex, predicted_labels_sex)
# f1 = (f1_ADHD + f1_sex)/2

## Prepare test set for inference

In [None]:
data_test, patient_id_test = load_connectomes(test_functional, test=True)
data_corrected_test = np.array([add_diagonal_correction(slice) for slice in data_test])
connectivity_matrices_test = torch.tensor(data_corrected_test).float()

data_list_test = []
for i in range(len(connectivity_matrices_test)):
    matrix = connectivity_matrices_test[i]
    edge_index = (matrix > 0).nonzero(as_tuple=False).t()
    edge_attr = matrix[edge_index[0], edge_index[1]]
    x = torch.eye(200)

    dummy_label = torch.zeros(2)

    graph_data = Data(x=x, edge_index=edge_index,
                     edge_attr=edge_attr, y=dummy_label)

    data_list_test.append(graph_data)

test_data = data_list_test
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
model.eval()
predicted_labels_ADHD = []
predicted_labels_sex = []
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data)
        pred = torch.round(torch.sigmoid(out))
        predicted_labels_ADHD.extend(pred[:,0].cpu().numpy().tolist())
        predicted_labels_sex.extend(pred[1].cpu().numpy().tolist())

import pandas as pd
result1 = pd.DataFrame({
    'participant_id': patient_id_test,
    'ADHD_Outcome': predicted_labels_ADHD,
    'Sex_F': predicted_labels_sex
})
# result.to_csv("kaggle/working/submission.csv", index=False)
# result

# Part 2: Ensemble voting

## Loading features

In [None]:
def get_feats(data_dir, mode="TRAIN"):
    meta = pd.read_excel(os.path.join(data_dir, f"{mode}_QUANTITATIVE_METADATA.xlsx")
    cate = pd.read_excel(os.path.join(data_dir, f"{mode}_CATEGORICAL_METADATA.xlsx"))
    func = pd.read_csv(os.path.join(data_dir, f"{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv"))
    feats = feats.merge(cate, on='participant_id', how='left')
    feats = feats.merge(func, on='participant_id', how='left')
    if mode == 'TRAIN':
        solution = pd.read_excel(os.path.join(data_dir, "TRAINING_SOLUTIONS.xlsx"))
        feats = feats.merge(solution, on='participant_id', how='left')
    return feat

train = get_feats(mode='TRAIN')
test = get_feats(mode='TEST')

In [None]:
def check_for_nulls(df):
    if df.isnull().any().any():
        print("The DataFrame contains null values.")
    else:
        print("The DataFrame does not contain null values.")

check_for_nulls(train)
check_for_nulls(test)
print(f'Train: {train.shape}, Test: {test.shape}')

In [None]:
train.set_index('participant_id',inplace=True)
test.set_index('participant_id',inplace=True)
targets = train[['ADHD_Outcome','Sex_F']]
features = test.columns
log_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]

## Handling missing values

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFECV, SequentialFeatureSelector, SelectKBest

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import ExtraTreesRegressor, RandomForestClassifier

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
indices = np.arange(len(train))
targets_np = np.array(targets)
for train_idx, val_idx in mskf.split(indices, targets_np):
    train_fold = [train[i] for i in train_idx]
    train_target = [target_np[i] for i in train_idx]
    val_fold = [train[i] for i in val_idx]
    val_target = [target_np[i] for i in val_idx]

    model = MultiOutputClassifier(make_pipeline(
                        
                              ColumnTransformer([('imputer',SimpleImputer(),features)],
                                               remainder='passthrough',
                                               verbose_feature_names_out=False).set_output(transform='pandas'),
                              ColumnTransformer([('log', 
                                                 FunctionTransformer(np.log1p), log_features)],
                                                 remainder='passthrough'),
                              
                            MinMaxScaler(),    
                              
                            RidgeClassifier(alpha=100)))
    model.fit(train_fold, train_target)
    val_pred = model.predict(val_fold)
    fold_scores = f1_score(val_pred, val_fold)
    fold_scores.append(avg_f1_val)

print("Validation fold scores", np.mean(fold_scores))

In [None]:
model = MultiOutputClassifier(make_pipeline(ColumnTransformer([('imputer',SimpleImputer(),features)],
                                               remainder='passthrough',
                                               verbose_feature_names_out=False).set_output(transform='pandas'),
                                              ColumnTransformer([('log', 
                                                 FunctionTransformer(np.log1p), log_features)],
                                                 remainder='passthrough'),
                                            MinMaxScaler(),  
                                            PCA(1087),
                                            RidgeClassifier(alpha=100)))
model.fit(train.drop(targets,axis=1),
          targets)
y_pred = model.predict(test)
sub['ADHD_Outcome'] = y_pred[:,0]
sub['Sex_F'] = y_pred[:,1]
sub.to_csv('submission.csv',index=False)