In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Load Train Data**

In [None]:
df_categorical = pd.read_excel('/content/drive/MyDrive/WiDS_TeamConscious/data/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx')
df_functional = pd.read_csv('/content/drive/MyDrive/WiDS_TeamConscious/data/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
df_quantitative = pd.read_excel('/content/drive/MyDrive/WiDS_TeamConscious/data/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
df_target = pd.read_excel('/content/drive/MyDrive/WiDS_TeamConscious/data/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')

**Train Data Preparation**
*   Fill Null Values
*   One-Hot-Encode
*   Merge Training Data

In [43]:
print("Null values in df_categorical:")
print(df_categorical.isnull().sum())
print("\nNull values in df_functional:")
print(df_functional.isnull().sum())
print("\nNull values in df_quantitative:")
print(df_quantitative.isnull().sum())
print("\nNull values in df_target:")
print(df_target.isnull().sum())

Null values in df_categorical:
participant_id                        0
PreInt_Demos_Fam_Child_Race           0
MRI_Track_Scan_Location               0
Barratt_Barratt_P1_Edu                0
Barratt_Barratt_P1_Occ                0
Barratt_Barratt_P2_Edu                0
Barratt_Barratt_P2_Occ                0
Basic_Demos_Enroll_Year_2016          0
Basic_Demos_Enroll_Year_2017          0
Basic_Demos_Enroll_Year_2018          0
Basic_Demos_Enroll_Year_2019          0
Basic_Demos_Enroll_Year_2020          0
Basic_Demos_Study_Site_2              0
Basic_Demos_Study_Site_3              0
Basic_Demos_Study_Site_4              0
PreInt_Demos_Fam_Child_Ethnicity_1    0
PreInt_Demos_Fam_Child_Ethnicity_2    0
PreInt_Demos_Fam_Child_Ethnicity_3    0
dtype: int64

Null values in df_functional:
participant_id          0
0throw_1thcolumn        0
0throw_2thcolumn        0
0throw_3thcolumn        0
0throw_4thcolumn        0
                       ..
196throw_198thcolumn    0
196throw_199thcolumn   

In [44]:
for column in df_quantitative.columns:
  if df_quantitative[column].isnull().any():
    mean_value = df_quantitative[column].mean()
    df_quantitative[column].fillna(mean_value, inplace=True)

for column in df_categorical.columns:
  if df_categorical[column].isnull().any():
    mean_value = df_categorical[column].mean()
    df_categorical[column].fillna(mean_value, inplace=True)

In [45]:
df_categorical.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3
0,00aIpNTbG5uh,0.0,3.0,21.0,45.0,16.876847,30.257316,False,False,False,True,False,False,False,True,True,False,False
1,00fV0OyyoLfw,9.0,2.0,21.0,0.0,21.0,45.0,False,True,False,False,False,False,False,False,False,False,False
2,04X1eiS79T4B,2.0,2.0,9.0,0.0,16.876847,30.257316,False,True,False,False,False,False,False,False,True,False,False
3,05ocQutkURd6,8.0,2.0,18.0,10.0,18.0,0.0,False,False,True,False,False,False,False,False,False,False,True
4,06YUNBA9ZRLq,1.0,2.0,12.0,0.0,16.876847,30.257316,False,False,True,False,False,False,False,False,False,False,False


In [None]:
categorical_int_cols = df_categorical.select_dtypes(include=['int']).columns
df_categorical = pd.get_dummies(df_categorical, columns=categorical_int_cols, drop_first=True)

df_categorical.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3
0,00aIpNTbG5uh,0.0,3.0,21.0,45.0,16.876847,30.257316,False,False,False,True,False,False,False,True,True,False,False
1,00fV0OyyoLfw,9.0,2.0,21.0,0.0,21.0,45.0,False,True,False,False,False,False,False,False,False,False,False
2,04X1eiS79T4B,2.0,2.0,9.0,0.0,16.876847,30.257316,False,True,False,False,False,False,False,False,True,False,False
3,05ocQutkURd6,8.0,2.0,18.0,10.0,18.0,0.0,False,False,True,False,False,False,False,False,False,False,True
4,06YUNBA9ZRLq,1.0,2.0,12.0,0.0,16.876847,30.257316,False,False,True,False,False,False,False,False,False,False,False


In [None]:
dfs = [df_categorical, df_functional, df_quantitative, df_target]

for df in dfs:
  df.sort_values(by=df.columns.tolist(), inplace=True)
  print(df.head())

  participant_id  PreInt_Demos_Fam_Child_Race  MRI_Track_Scan_Location  \
0   00aIpNTbG5uh                          0.0                      3.0   
1   00fV0OyyoLfw                          9.0                      2.0   
2   04X1eiS79T4B                          2.0                      2.0   
3   05ocQutkURd6                          8.0                      2.0   
4   06YUNBA9ZRLq                          1.0                      2.0   

   Barratt_Barratt_P1_Edu  Barratt_Barratt_P1_Occ  Barratt_Barratt_P2_Edu  \
0                    21.0                    45.0               16.876847   
1                    21.0                     0.0               21.000000   
2                     9.0                     0.0               16.876847   
3                    18.0                    10.0               18.000000   
4                    12.0                     0.0               16.876847   

   Barratt_Barratt_P2_Occ  Basic_Demos_Enroll_Year_2016  \
0               30.257316        

In [None]:
merged_df = df_categorical.merge(df_functional, on='participant_id', how='inner').merge(df_quantitative, on='participant_id', how='inner')
merged_df

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,00aIpNTbG5uh,0.0,3.0,21.0,45.000000,16.876847,30.257316,False,False,False,...,3.0,17.0,4.0,11.0,5.0,8.0,6.0,2.0,9.0,14.274127
1,00fV0OyyoLfw,9.0,2.0,21.0,0.000000,21.000000,45.000000,False,True,False,...,5.0,20.0,4.0,13.0,5.0,8.0,7.0,3.0,8.0,11.245678
2,04X1eiS79T4B,2.0,2.0,9.0,0.000000,16.876847,30.257316,False,True,False,...,3.0,24.0,7.0,10.0,10.0,7.0,14.0,7.0,7.0,13.463381
3,05ocQutkURd6,8.0,2.0,18.0,10.000000,18.000000,0.000000,False,False,True,...,0.0,5.0,0.0,3.0,0.0,3.0,2.0,2.0,6.0,9.572553
4,06YUNBA9ZRLq,1.0,2.0,12.0,0.000000,16.876847,30.257316,False,False,True,...,6.0,23.0,7.0,15.0,8.0,9.0,8.0,1.0,4.0,6.654574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208,zpU7rEseBMH8,3.0,2.0,12.0,20.000000,21.000000,45.000000,False,True,False,...,0.0,8.0,0.0,6.0,6.0,6.0,2.0,2.0,8.0,12.669860
1209,zpr8w4jCfVPe,2.0,3.0,12.0,5.000000,21.000000,35.000000,False,False,False,...,1.0,15.0,3.0,8.0,3.0,7.0,7.0,4.0,7.0,11.245678
1210,zwBG0rZ05Mcb,0.0,2.0,15.0,35.000000,18.000000,35.000000,True,False,False,...,0.0,10.0,3.0,5.0,3.0,5.0,5.0,2.0,8.0,8.155258
1211,zwXD5v17Rx01,0.0,3.0,21.0,40.000000,21.000000,40.000000,False,False,True,...,2.0,5.0,0.0,5.0,0.0,3.0,0.0,0.0,7.0,7.364020


**Test Data Preparation**
*   Fill Null Values
*   One-Hot-Encode
*   Merge Testing Data

In [None]:
test_quantitative = pd.read_excel('/content/drive/MyDrive/WiDS_TeamConscious/data/TEST/TEST_QUANTITATIVE_METADATA.xlsx')
test_functional = pd.read_csv('/content/drive/MyDrive/WiDS_TeamConscious/data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_categorical = pd.read_excel('/content/drive/MyDrive/WiDS_TeamConscious/data/TEST/TEST_CATEGORICAL.xlsx')

print(test_quantitative.head())
print(test_functional.head())
print(test_categorical.head())

  participant_id  EHQ_EHQ_Total  ColorVision_CV_Score  APQ_P_APQ_P_CP  \
0   Cfwaf5FX7jWK          60.03                  14.0             5.0   
1   vhGrzmvA3Hjq          86.71                  12.0             3.0   
2   ULliyEXjy4OV          26.68                  13.0             3.0   
3   LZfeAb1xMtql          93.38                  13.0             3.0   
4   EnFOUv0YK1RG         -93.38                  14.0             3.0   

   APQ_P_APQ_P_ID  APQ_P_APQ_P_INV  APQ_P_APQ_P_OPD  APQ_P_APQ_P_PM  \
0            16.0             41.0             19.0            11.0   
1            13.0             43.0             18.0            15.0   
2            14.0             36.0             16.0            14.0   
3            19.0             41.0             17.0            18.0   
4            13.0             42.0             19.0            16.0   

   APQ_P_APQ_P_PP  SDQ_SDQ_Conduct_Problems  SDQ_SDQ_Difficulties_Total  \
0            26.0                       2.0                

In [None]:
for col in test_quantitative.columns:
    if col != 'participant_id':
        mean_val = test_quantitative[col].mean()
        test_quantitative[col].fillna(mean_val, inplace=True)

test_quantitative.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,60.03,14.0,5.0,16.0,41.0,19.0,11.0,26.0,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,86.71,12.0,3.0,13.0,43.0,18.0,15.0,28.0,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,26.68,13.0,3.0,14.0,36.0,16.0,14.0,25.0,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,93.38,13.0,3.0,19.0,41.0,17.0,18.0,27.0,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,-93.38,14.0,3.0,13.0,42.0,19.0,16.0,28.0,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [None]:
for col in test_categorical.columns:
    if col != 'participant_id':
        if pd.api.types.is_numeric_dtype(test_categorical[col]):
            test_categorical[col].fillna(test_categorical[col].mean(), inplace=True)
        else:
            test_categorical[col].fillna(test_categorical[col].mode()[0], inplace=True)

for col in test_categorical.columns:
    if col != 'participant_id':
        test_categorical[col] = test_categorical[col].astype('int')

test_categorical = pd.get_dummies(test_categorical, columns=[col for col in test_categorical.columns if col != 'participant_id'], drop_first=True)

test_categorical.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2020,Basic_Demos_Enroll_Year_2021,Basic_Demos_Enroll_Year_2022,Basic_Demos_Enroll_Year_2023,Basic_Demos_Study_Site_5,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Race_1,...,Barratt_Barratt_P2_Edu_21,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_10,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_20,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45
0,Cfwaf5FX7jWK,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,vhGrzmvA3Hjq,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,ULliyEXjy4OV,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,LZfeAb1xMtql,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
4,EnFOUv0YK1RG,False,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,True


In [None]:
test_dfs = [test_categorical, test_functional, test_quantitative]
test_merged_df = test_categorical.merge(test_functional, on='participant_id', how='inner').merge(test_quantitative, on='participant_id', how='inner')
test_merged_df.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2020,Basic_Demos_Enroll_Year_2021,Basic_Demos_Enroll_Year_2022,Basic_Demos_Enroll_Year_2023,Basic_Demos_Study_Site_5,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Race_1,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,False,False,True,False,False,False,False,False,False,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,False,False,False,True,False,False,False,False,False,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,False,False,True,False,False,False,False,False,False,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,False,False,True,False,False,False,False,False,False,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,False,False,True,False,False,False,True,False,False,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [None]:
test_merged_df.sort_values(by=test_merged_df.columns.tolist(), inplace=True, ignore_index=True)
test_merged_df.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2020,Basic_Demos_Enroll_Year_2021,Basic_Demos_Enroll_Year_2022,Basic_Demos_Enroll_Year_2023,Basic_Demos_Study_Site_5,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Race_1,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,06HFIpqKfXy9,False,False,True,False,False,False,False,False,False,...,1.0,6.0,0.0,4.0,0.0,3.0,2.0,2.0,10.0,12.66929
1,0Dua0TUw4sNZ,False,False,True,False,False,False,False,False,False,...,2.0,7.0,0.0,7.0,1.0,5.0,0.0,0.0,9.0,8.329796
2,0ImS6uhE3Ie9,False,False,True,False,False,False,False,False,False,...,1.0,4.0,1.0,2.0,3.0,1.0,2.0,1.0,10.0,15.270134
3,0VHL9SCh2TfC,False,False,True,False,False,False,False,False,False,...,1.0,6.0,0.0,6.0,2.0,5.0,0.0,0.0,6.0,10.755418
4,0X2H4LroxZcw,False,False,True,False,False,False,False,False,False,...,6.0,22.0,4.0,11.0,10.0,5.0,11.0,7.0,1.0,9.517796


# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

Tried the models above using the total merged training data, F1 scores were consistently .3-.5s

In [None]:
X = merged_df.drop(columns = ['participant_id'])
y = df_target.drop(columns = ['participant_id'])
train_id = merged_df['participant_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [None]:
test_id = test_merged_df['participant_id']

final_test = test_merged_df.drop(columns = 'participant_id')

**Logistic Modeling Base Case**

In [None]:
#MultiOutcome
base_model = LogisticRegression()

multioutput_classifier = MultiOutputClassifier(base_model)

multioutput_classifier.fit(X_train, y_train)

y_pred_mo = multioutput_classifier.predict(X_test)

predictions_df2 = pd.DataFrame(
    y_pred_mo,
    columns=['ADHD_Outcome', 'Sex_F']
)

mo_ids = merged_df['participant_id'].iloc[X_test.index].reset_index(drop=True)
result_mo = pd.concat([mo_ids, predictions_df2], axis=1)

result_mo.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,S42bdye8NcIE,0,0
1,MkC90gNpKWfP,1,0
2,poOPe7ceWIea,1,0
3,RWLTAz0qt46H,1,0
4,8jjvOdQ5WL1z,1,1


In [None]:
result_mo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   participant_id  364 non-null    object
 1   ADHD_Outcome    364 non-null    int64 
 2   Sex_F           364 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 8.7+ KB


In [None]:
#ClassifierChain
order = [y_train.columns.get_loc('Sex_F'), y_train.columns.get_loc('ADHD_Outcome')]
chain_model = ClassifierChain(LogisticRegression(), order=order)
chain_model.fit(X_train, y_train)
y_pred_chain = chain_model.predict(X_test)

predictions_df3 = pd.DataFrame(
    y_pred_chain,
    columns=['ADHD_Outcome', 'Sex_F']
)

cc_ids = merged_df['participant_id'].iloc[X_test.index].reset_index(drop=True)
result_cc = pd.concat([cc_ids, predictions_df3], axis=1)

for col in result_cc.columns:
    if col != 'participant_id':
        result_cc[col] = pd.to_numeric(result_cc[col], errors='coerce').astype('int')

result_cc.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,S42bdye8NcIE,0,0
1,MkC90gNpKWfP,1,0
2,poOPe7ceWIea,1,0
3,RWLTAz0qt46H,1,0
4,8jjvOdQ5WL1z,1,1


In [None]:
results = [result_mo, result_cc]
result_names = ['MultiOutput Classifier', 'Classifier Chain']

for i, result_df in enumerate(results):
    print(f"Evaluation for {result_names[i]}:")
    for col in ['ADHD_Outcome', 'Sex_F']:
        y_true = y_test[col]
        y_pred = result_df[col]
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"  {col}:")
        print(f"    Accuracy: {accuracy:.3f}")
        print(f"    Precision: {precision:.3f}")
        print(f"    Recall: {recall:.3f}")
        print(f"    F1 Score: {f1:.3f}")
    print('\n')

Evaluation for MultiOutput Classifier:
  ADHD_Outcome:
    Accuracy: 0.769
    Precision: 0.807
    Recall: 0.871
    F1 Score: 0.838
  Sex_F:
    Accuracy: 0.709
    Precision: 0.562
    Recall: 0.458
    F1 Score: 0.505


Evaluation for Classifier Chain:
  ADHD_Outcome:
    Accuracy: 0.772
    Precision: 0.812
    Recall: 0.867
    F1 Score: 0.839
  Sex_F:
    Accuracy: 0.709
    Precision: 0.562
    Recall: 0.458
    F1 Score: 0.505




In [None]:
result_cc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   participant_id  364 non-null    object
 1   ADHD_Outcome    364 non-null    int64 
 2   Sex_F           364 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 8.7+ KB


**Logistic Modeling w/ 'Best Parameters'**

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
adhd_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    penalty='l2',
    solver='lbfgs',
    random_state=25
)
adhd_model.fit(X_train, y_train['ADHD_Outcome'])

In [None]:
sex_model = LogisticRegression(
    class_weight='balanced',
    max_iter=100,
    penalty='l2',
    solver='liblinear',
    random_state=25
)
sex_model.fit(X_train_scaled, y_train['Sex_F'])

In [None]:
y_pred_adhd = adhd_model.predict(X_test)
y_pred_sex = sex_model.predict(X_test_scaled)

print("ADHD Model Evaluation:")
print(f"  Accuracy: {accuracy_score(y_test['ADHD_Outcome'], y_pred_adhd):.3f}")
print(f"  Precision: {precision_score(y_test['ADHD_Outcome'], y_pred_adhd):.3f}")
print(f"  Recall: {recall_score(y_test['ADHD_Outcome'], y_pred_adhd):.3f}")
print(f"  F1 Score: {f1_score(y_test['ADHD_Outcome'], y_pred_adhd):.3f}")

print("\nSex Model Evaluation:")
print(f"  Accuracy: {accuracy_score(y_test['Sex_F'], y_pred_sex):.3f}")
print(f"  Precision: {precision_score(y_test['Sex_F'], y_pred_sex):.3f}")
print(f"  Recall: {recall_score(y_test['Sex_F'], y_pred_sex):.3f}")
print(f"  F1 Score: {f1_score(y_test['Sex_F'], y_pred_sex):.3f}")

ADHD Model Evaluation:
  Accuracy: 0.750
  Precision: 0.813
  Recall: 0.823
  F1 Score: 0.818

Sex Model Evaluation:
  Accuracy: 0.712
  Precision: 0.540
  Recall: 0.746
  F1 Score: 0.626


**(possible) Feature Selection**
<br>

In [None]:
modelA = LogisticRegression(max_iter=1000)
modelA.fit(merged_df.drop(columns='participant_id'), df_target['ADHD_Outcome'])

coefficientsA = pd.Series(modelA.coef_[0], index=merged_df.drop(columns='participant_id').columns)
top_featuresA = coefficientsA.abs().nlargest(10)

print(top_featuresA)

KeyboardInterrupt: 

In [None]:
modelS = LogisticRegression(max_iter=1000)
modelS.fit(merged_df.drop(columns='participant_id'), df_target['Sex_F'])

coefficientsS = pd.Series(modelS.coef_[0], index=merged_df.drop(columns='participant_id').columns)
top_featuresS = coefficientsS.abs().nlargest(10)

print(top_featuresS)

52throw_53thcolumn           0.448153
50throw_53thcolumn           0.443506
121throw_147thcolumn         0.432214
164throw_189thcolumn         0.429034
114throw_121thcolumn         0.427718
53throw_55thcolumn           0.423922
101throw_131thcolumn         0.422798
160throw_190thcolumn         0.411727
MRI_Track_Scan_Location_4    0.407417
Barratt_Barratt_P1_Edu_9     0.404300
dtype: float64


# Performing on Test Set

In [None]:
missing_cols = set(X_train.columns) - set(test_merged_df.columns)
for c in missing_cols:
    test_merged_df[c] = 0
test_merged_df = test_merged_df[X_train.columns]

In [None]:
test_scaled = scaler.fit_transform(test_merged_df)

In [None]:
adhd_predictions = adhd_model.predict(test_merged_df)
sex_predictions = sex_model.predict(test_scaled)

test_predictions_df = pd.DataFrame({
    'participant_id': test_id,
    'ADHD_Outcome': adhd_predictions,
    'Sex_F': sex_predictions
})

print(test_predictions_df.head())

  participant_id  ADHD_Outcome  Sex_F
0   06HFIpqKfXy9             1      0
1   0Dua0TUw4sNZ             1      1
2   0ImS6uhE3Ie9             1      1
3   0VHL9SCh2TfC             1      0
4   0X2H4LroxZcw             1      0


In [None]:
test_predictions_df.to_csv('test_predictions.csv', index=False)