In [1]:
import os
import sys

sys.path.append("../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from natsort import natsorted
from src.data.DataList import dataset_dict, dist_colors

from sklearn.model_selection import train_test_split

## JBNU Preprocessing

In [15]:
# Load the JBNU phenotype spreadsheet and list the image files stored for that site.
BASE = "/NFS/Users/moonsh/data/FLData/"
baseinfo = (
    pd.read_excel(BASE + "JBNUphenotype.xlsx")
    .assign(
        # IDs become "jbnu_" followed by the ID zero-padded to four characters.
        Id=lambda frame: frame["Id"].map(lambda x: "jbnu_" + str(x).zfill(4)),
        # Encode sex numerically: M -> 1, F -> 2 (any other value becomes NaN).
        sex=lambda frame: frame["sex"].map({"M": 1, "F": 2}),
    )
    .rename(columns={"Id": "Subject", "age": "Age", "sex": "Sex(1=m,2=f)"})
    .loc[:, ["Subject", "Age", "Sex(1=m,2=f)", "label"]]
)
img_list = os.listdir(BASE + "Image/JBNU")
# baseinfo.to_csv(BASE+"JBNU_original_data.csv", index=False)

In [16]:
# Keep only the subjects whose image file ("wm<Subject>.nii") is present in
# img_list, and record that file name in a new "ImageFile" column.
#
# A vectorized mask replaces the previous row-by-row pd.concat: it avoids
# re-copying the frame on every iteration, keeps the column dtypes of baseinfo,
# and still yields a frame with the expected columns when nothing matches.
image_names = "wm" + baseinfo["Subject"].astype(str) + ".nii"
has_image = image_names.isin(img_list)
new_df = (
    baseinfo.loc[has_image]
    .assign(ImageFile=image_names[has_image])
    .reset_index(drop=True)
)

# new_df.to_csv(BASE+"JBNU_original_data.csv", index=False)

In [None]:
# Keep the rows labelled "HC" and drop the now-constant "label" column.
# Filtering and dropping in a single chain avoids calling drop(inplace=True) on
# a slice of new_df, which pandas can flag with SettingWithCopyWarning.
hc_info = new_df.loc[new_df["label"] == "HC"].drop(columns=["label"])
# hc_info.to_csv(BASE+"JBNU_HC_data.csv", index=False)

In [None]:
# Two-stage split: 10% of the rows go to test, then 11% of the remainder goes
# to validation (about 80/10/10 overall). Seeds are fixed for repeatability.
train_val_df, test_df = train_test_split(hc_info, test_size=0.10, random_state=3)
train_df, val_df = train_test_split(train_val_df, test_size=0.11, random_state=10)

# Write each split to its phenotype CSV.
for split_frame, out_path in (
    (train_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_train.csv"),
    (test_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_test.csv"),
    (val_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_val.csv"),
):
    split_frame.to_csv(out_path, index=False)

# Overlay the age histograms of the three splits for a visual sanity check.
for split_frame, shade, tag in (
    (train_df, 'orange', "Train"),
    (test_df, 'blue', "Test"),
    (val_df, 'green', "Val"),
):
    plt.hist(split_frame["Age"], bins=10, alpha=0.5, color=shade, label=tag)
plt.legend()
plt.show()

## NKI-RK Preprocessing

In [2]:
# Inputs for the NKI-RK site: the participants table, the per-subject phenotype
# files ("sub*.tsv") and the list of preprocessed images.
BASE = "/NFS/MRI/NKI-RK/phenotype/"
baseinfo = pd.read_csv(BASE + "participants.tsv", sep="\t")
# os.listdir returns entries in arbitrary order. The phenotype files are sorted
# so that the rows assembled from them, and therefore the seeded
# train/test/val split taken from those rows, come out the same on every run.
pheno_list = sorted(
    name for name in os.listdir(BASE)
    if name.startswith("sub") and name.endswith(".tsv")
)
img_list = os.listdir("/NFS/MRI/NKI-RK/preprocess/cat12/mri/")

In [None]:
# The participants table is expected to arrive as ONE column literally named
# 'participant_id,sex,handedness' (it was read with sep="\t"); split that
# column on commas into three real columns and encode sex as M -> 1, F -> 2.
#
# The guard makes the cell safe to re-run: once the frame has been split, a
# second execution leaves it untouched instead of raising KeyError (and instead
# of re-mapping the already numeric sex codes to NaN).
packed_col = 'participant_id,sex,handedness'
split_cols = ['participant_id', 'sex', 'handedness']
if packed_col in baseinfo.columns:
    baseinfo = baseinfo[packed_col].str.split(',', expand=True)
    baseinfo.columns = split_cols
    baseinfo['sex'] = baseinfo['sex'].map({'M': 1, 'F': 2})
elif list(baseinfo.columns) != split_cols:
    # Neither the packed nor the already-split layout: most likely baseinfo
    # currently holds another dataset's table.
    raise KeyError(f"baseinfo has unexpected columns: {list(baseinfo.columns)}")
baseinfo

In [7]:
# Build one phenotype row per NKI-RK subject that has exactly one usable
# baseline session and a matching preprocessed image.
#
# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration.
result_columns = ["Subject", "Sex(1=m,2=f)", "Age", "Handedness", "Session", "ImageFile"]
records = []
for p in pheno_list:
    csvfile = os.path.join(BASE, p)
    pheno = pd.read_csv(csvfile, sep="\t")

    # Prefer the BAS1 session; fall back to BAS2 only when BAS1 has no rows.
    data = pheno[pheno['session'] == 'BAS1']
    if data.shape[0] == 0:
        data = pheno[pheno['session'] == 'BAS2']
    # Subjects with zero or several rows for the chosen session are skipped.
    if data.shape[0] != 1:
        continue

    ID = "sub-" + data['id'].values[0]
    session = data['session'].values[0]
    imgfile = f"wm{ID}_ses-{session}_T1w.nii"
    if imgfile not in img_list:
        continue

    # Sex comes from the participants table. A subject missing from that table
    # gets NaN here (instead of an IndexError) and is removed by the dropna
    # that is applied to result_df before it is saved.
    sex_matches = baseinfo.loc[baseinfo['participant_id'] == ID, 'sex']
    sex = sex_matches.values[0] if len(sex_matches) else np.nan

    records.append({"Subject": ID,
                    "Sex(1=m,2=f)": sex,
                    "Age": data['age'].values[0],
                    "Handedness": data['handedness_score'].values[0],
                    "Session": session,
                    "ImageFile": imgfile})

result_df = pd.DataFrame(records, columns=result_columns)

In [8]:
# Discard every row that has at least one missing value, then save the
# complete NKI-RK phenotype table.
result_df = result_df.dropna(axis=0, how="any")
result_df.to_csv(
    "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_total.csv",
    index=False,
)

In [None]:
# Two-stage split: 10% of the rows go to test, then 11% of the remainder goes
# to validation (about 80/10/10 overall). Both stages use seed 42.
train_val_df, test_df = train_test_split(result_df, test_size=0.10, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.11, random_state=42)

# Write each split to its phenotype CSV.
for split_frame, out_path in (
    (train_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_train.csv"),
    (test_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_test.csv"),
    (val_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_val.csv"),
):
    split_frame.to_csv(out_path, index=False)

# Overlay the age histograms of the three splits for a visual sanity check.
for split_frame, shade, tag in (
    (train_df, 'orange', "Train"),
    (test_df, 'blue', "Test"),
    (val_df, 'green', "Val"),
):
    plt.hist(split_frame["Age"], bins=10, alpha=0.5, color=shade, label=tag)
plt.legend()
plt.show()

## EDA (Exploratory Data Analysis)


In [2]:
# Root of the federated-learning data and the folder holding the per-dataset
# phenotype CSVs.
BASE = "/NFS/Users/moonsh/data/FLData/"
phenoPATH = f"{BASE}Phenotype"

# Unique OASISID values from health_subjects_CDRTOT0.csv; they are used further
# down to restrict the OAS3 dataset to the subjects listed in that file.
# NOTE(review): presumably the healthy (CDR total = 0) OASIS-3 subjects - confirm.
oasis_3_Hc = pd.read_csv(f"{BASE}health_subjects_CDRTOT0.csv")
hc_oas3 = pd.unique(oasis_3_Hc["OASISID"])

In [3]:
# "<dataset>_..._total.csv" files whose dataset prefix is a key of dataset_dict.
# os.listdir returns entries in arbitrary order, so the names are sorted: the
# subplot position and the color index of each dataset then stay the same from
# run to run.
total_list = sorted(
    name for name in os.listdir(phenoPATH)
    if name.endswith("total.csv") and name.split("_")[0] in dataset_dict.keys()
)
# train_list = [csv for csv in os.listdir(phenoPATH) if csv.endswith("train.csv") and csv.split("_")[0] in dataset_dict.keys()]
# test_list = [csv for csv in os.listdir(phenoPATH) if csv.endswith("test.csv") and csv.split("_")[0] in dataset_dict.keys()]
# val_list = [csv for csv in os.listdir(phenoPATH) if csv.endswith("val.csv") and csv.split("_")[0] in dataset_dict.keys()]

In [4]:
# For every dataset in total_list, keep the subjects that have an image on disk
# and attach the matching file name as an "ImageFile" column.
#
# The loop walks total_list itself rather than a hard-coded range(10), so it
# neither raises IndexError with fewer than ten datasets nor silently ignores
# any beyond the tenth.
df_list = []
for csv_name in total_list:
    dataset_name = csv_name.split("_")[0]

    # Sorted so that "the first matching file" is the same on every run
    # (os.listdir order is arbitrary).
    img_list = sorted(os.listdir(os.path.join(BASE, "Image", dataset_name)))
    total_df = pd.read_csv(phenoPATH + "/" + csv_name)
    subject_list = total_df["Subject"].values

    use_subject_list = []
    imgfile_list = []
    for subject in subject_list:
        # OAS3 is restricted to the subject IDs collected in hc_oas3.
        if dataset_name == "OAS3" and subject not in hc_oas3:
            continue
        # First image whose file name contains the subject ID.
        # NOTE(review): this is a substring test, so an ID that is a prefix of
        # another ID could pick up the wrong file - confirm the ID formats.
        imgfile = next((name for name in img_list if str(subject) in name), None)
        if imgfile is not None:
            use_subject_list.append(subject)
            imgfile_list.append(imgfile)

    # The filtered rows keep their original order, which is also the order in
    # which imgfile_list was filled, so the two line up one-to-one.
    total_df = total_df[total_df["Subject"].isin(use_subject_list)]
    total_df = total_df.reset_index(drop=True)
    total_df["ImageFile"] = imgfile_list
    df_list.append(total_df)
    # total_df.to_csv(phenoPATH + "/" + dataset_name + "_Phenotype_total.csv", index=False)

In [None]:
# One panel per dataset showing the age distribution of its train/test/val
# split (10% test, then 11% of the remainder as validation, seed 12).
color = dist_colors  # not used in this cell; assignment kept as it was

# Size the grid from total_list instead of assuming exactly ten datasets.
# With ten datasets this is the same 2 x 5 grid and (20, 8) figure as before.
n_cols = 5
n_rows = max(1, -(-len(total_list) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)

for i, csv_name in enumerate(total_list):
    ax = axs[i // n_cols, i % n_cols]

    dataset_name = csv_name.split("_")[0]
    total_df = pd.read_csv(phenoPATH + "/" + csv_name)

    train_df, test_df = train_test_split(total_df, test_size=0.10, random_state=12)
    train_df, val_df = train_test_split(train_df, test_size=0.11, random_state=12)

    print(dataset_name, len(train_df), len(test_df), len(val_df))

    ax.hist(train_df["Age"], bins=10, alpha=0.5, color='orange', label="Train")
    ax.hist(test_df["Age"], bins=10, alpha=0.5, color='blue', label="Test")
    ax.hist(val_df["Age"], bins=10, alpha=0.5, color='green', label="Val")
    ax.set_title(dataset_name)
    # Legend added so the colors are identified, as in the single-site split plots.
    ax.legend()

    # Uncomment to persist the splits:
    # train_df.to_csv(phenoPATH + "/" + dataset_name + "_Phenotype_train.csv", index=False)
    # test_df.to_csv(phenoPATH + "/" + dataset_name + "_Phenotype_test.csv", index=False)
    # val_df.to_csv(phenoPATH + "/" + dataset_name + "_Phenotype_val.csv", index=False)

# Hide grid cells that have no dataset (none when there are exactly ten).
for unused_ax in axs.ravel()[len(total_list):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# One panel per dataset: age histogram of the full phenotype table, drawn in
# the dataset's color and annotated with the number of rows.
color = dist_colors

# Size the grid from total_list instead of assuming exactly ten datasets.
# With ten datasets this is the same 2 x 5 grid and (20, 8) figure as before.
n_cols = 5
n_rows = max(1, -(-len(total_list) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)

for i, csvPATH in enumerate(total_list):
    ax = axs[i // n_cols, i % n_cols]
    dataset_name = csvPATH.split("_")[0]
    csv = pd.read_csv(phenoPATH + "/" + csvPATH)
    # The former SLIM / non-SLIM branches made the identical call, so the
    # conditional was removed.
    ax.hist(csv["Age"], bins=5, alpha=1, label=dataset_name, color=color[i])
    ax.set_title(dataset_name)
    ax.set_xlim(0, 100)
    ax.text(0.75, 0.8, f"n={len(csv)}", transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.8))

# Hide grid cells that have no dataset (none when there are exactly ten).
for unused_ax in axs.ravel()[len(total_list):]:
    unused_ax.set_visible(False)

plt.show()