In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import datetime
import warnings

warnings.filterwarnings('ignore')


base_path = '/Users/qiyuqing/Desktop/widsdatathon2025/'
train_path = base_path + 'TRAIN_NEW/'
test_path = base_path + 'TEST/'

# Load training data
train_labels = pd.read_excel(train_path + 'TRAINING_SOLUTIONS.xlsx')
train_quant = pd.read_excel(train_path + 'TRAIN_QUANTITATIVE_METADATA_new.xlsx')
train_cat = pd.read_excel(train_path + 'TRAIN_CATEGORICAL_METADATA_new.xlsx')
train_fmri = pd.read_csv(train_path + 'TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')

# Merge training data
train = train_labels.merge(train_quant, on='participant_id')
train = train.merge(train_cat, on='participant_id')
train = train.merge(train_fmri, on='participant_id')

# Load test data
test_quant = pd.read_excel(test_path + 'TEST_QUANTITATIVE_METADATA.xlsx')
test_cat = pd.read_excel(test_path + 'TEST_CATEGORICAL.xlsx')
test_fmri = pd.read_csv(test_path + 'TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')

test = test_quant.merge(test_cat, on='participant_id')
test = test.merge(test_fmri, on='participant_id')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")



X = train.drop(columns=['ADHD_Outcome', 'Sex_F', 'participant_id'])
y = train[['ADHD_Outcome', 'Sex_F']]
X_test = test.drop(columns=['participant_id'])

# Fill missing values simply (mean for numeric)
X.fillna(X.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)


rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

model = MultiOutputClassifier(rf)
model.fit(X, y)


pred = model.predict(X_test)
submission = pd.DataFrame({
    'participant_id': test['participant_id'],
    'ADHD_Outcome': pred[:, 0],
    'Sex_F': pred[:, 1]
})

# Save to Desktop with timestamp
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
submission_path = f'/Users/qiyuqing/Desktop/submission_{timestamp}.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSubmission file saved at: {submission_path}")



train_pred = model.predict(X)
f1_adhd = f1_score(y['ADHD_Outcome'], train_pred[:, 0])
f1_sex = f1_score(y['Sex_F'], train_pred[:, 1])
final_score = (2 * f1_adhd + f1_sex) / 3

print(f"\nTraining F1 Score (ADHD): {f1_adhd:.4f}")
print(f"Training F1 Score (Sex): {f1_sex:.4f}")
print(f"Weighted Final Score: {final_score:.4f}")



adhd_rate = submission['ADHD_Outcome'].mean()
sex_f_rate = submission['Sex_F'].mean()

print(f"\nTest Prediction ADHD_Outcome Rate: {adhd_rate:.2%}")
print(f"Test Prediction Sex_F Rate: {sex_f_rate:.2%}")


Train shape: (1213, 19930), Test shape: (304, 19928)

Submission file saved at: /Users/qiyuqing/Desktop/submission_20250330_1305.csv

Training F1 Score (ADHD): 1.0000
Training F1 Score (Sex): 1.0000
Weighted Final Score: 1.0000

Test Prediction ADHD_Outcome Rate: 98.68%
Test Prediction Sex_F Rate: 0.66%
