In [None]:
import os
import sys

sys.path.append("../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from natsort import natsorted
from src.data.DataList import dataset_dict, dist_colors, ind_dataset_dict

from sklearn.model_selection import train_test_split

import shutil

## JBNU Preprocessing

In [None]:
# Read the JBNU phenotype spreadsheet and list the image files present on disk.
BASE = "/NFS/Users/moonsh/data/FLData/"
baseinfo = pd.read_excel(BASE + "JBNUphenotype.xlsx")
img_list = os.listdir(BASE+"Image/JBNU")

# Rewrite ids as "jbnu_<zero-padded id>", encode sex as 1 (M) / 2 (F), rename
# the columns to the shared phenotype names and keep only the four used below.
baseinfo = (
    baseinfo
    .assign(
        Id=baseinfo['Id'].map(lambda x: "jbnu_"+str(x).zfill(4)),
        sex=baseinfo['sex'].map({"M": 1, "F":2}),
    )
    .rename(columns={"Id": "Subject", "age": "Age", "sex":"Sex(1=m,2=f)", })
    .loc[:, ["Subject", "Age", "Sex(1=m,2=f)", "label"]]
)
# baseinfo.to_csv(BASE+"JBNU_original_data.csv", index=False)

In [None]:
# Keep only the phenotype rows whose image file "wm<first column>.nii" is
# present in img_list, and store that file name in an "ImageFile" column.
# (The first column is "Subject" after the preprocessing cell above.)
#
# The selection is done with one boolean mask instead of concatenating one-row
# frames inside a loop: the loop was quadratic, turned every column into object
# dtype (each row was round-tripped through a transposed Series), and left a
# frame without any columns when no image matched.
expected_names = baseinfo.iloc[:, 0].map(lambda subject: f"wm{subject}.nii")
has_image = expected_names.isin(img_list).to_numpy()

new_df = baseinfo.loc[has_image].copy()
new_df["ImageFile"] = expected_names.to_numpy()[has_image]
# Same 0..n-1 index the original produced with ignore_index=True.
new_df = new_df.reset_index(drop=True)

# new_df.to_csv(BASE+"JBNU_original_data.csv", index=False)

In [None]:
# Select the rows labelled "HC" and remove the now-constant label column.
# Filtering and dropping are chained so hc_info is a fresh frame; calling
# drop(..., inplace=True) on a boolean-filtered slice of new_df can raise
# pandas' SettingWithCopyWarning.
hc_info = new_df.loc[new_df['label']=="HC"].drop(columns=["label"])
# hc_info.to_csv(BASE+"JBNU_HC_data.csv", index=False)

In [None]:
# Two-stage split with fixed seeds: 10% of hc_info becomes the test set, then
# 11% of the remainder becomes the validation set.
train_df, test_df = train_test_split(hc_info, random_state=3, test_size=0.10)
train_df, val_df = train_test_split(train_df, random_state=10, test_size=0.11)

# Write each partition to its CSV (train, test, val), without the index column.
for split_frame, split_path in (
    (train_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_train.csv"),
    (test_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_test.csv"),
    (val_df, "/NFS/Users/moonsh/data/FLData/Phenotype/JBNU_Phenotype_val.csv"),
):
    split_frame.to_csv(split_path, index=False)

# Overlay the age histograms of the three partitions on one axes.
for split_frame, split_color, split_label in (
    (train_df, 'orange', "Train"),
    (test_df, 'blue', "Test"),
    (val_df, 'green', "Val"),
):
    plt.hist(split_frame["Age"], bins=10, alpha=0.5, color=split_color, label=split_label)
plt.legend()
plt.show()

## NKI-RK Preprocessing

In [None]:
# Load the NKI-RK participants table and collect the inputs for this section.
BASE = "/NFS/MRI/NKI-RK/phenotype/"
baseinfo = pd.read_csv(BASE + "participants.tsv", sep="\t")
# Phenotype files of interest are the "sub*.tsv" entries of the phenotype folder.
pheno_list = [
    fname
    for fname in os.listdir(BASE)
    if fname.startswith("sub") and fname.endswith(".tsv")
]
# File names available in the CAT12 output folder.
img_list = os.listdir("/NFS/MRI/NKI-RK/preprocess/cat12/mri/")

In [None]:
# The frame read above exposes a single column literally named
# 'participant_id,sex,handedness'; split it on commas into three columns and
# encode sex as 1 (M) / 2 (F).
packed_col = 'participant_id,sex,handedness'
baseinfo = baseinfo[packed_col].str.split(',', expand=True)
baseinfo.columns = packed_col.split(',')
baseinfo = baseinfo.assign(sex=baseinfo['sex'].map({'M': 1, 'F': 2}))
baseinfo

In [None]:
# Build one phenotype record per subject file in pheno_list.
#
# For each file the 'BAS1' session row is used, falling back to 'BAS2' when
# there is no 'BAS1' row; files that do not yield exactly one such row are
# skipped, as are subjects whose image "wm<ID>_ses-<session>_T1w.nii" is not in
# img_list.
#
# Records are collected in a list and converted to a frame once. Growing a
# frame with pd.concat inside the loop is quadratic and, when nothing matches,
# leaves a frame without columns.
result_columns = ["Subject", "Sex(1=m,2=f)", "Age", "Handedness", "Session", "ImageFile"]
available_images = set(img_list)  # constant-time membership test inside the loop
records = []

for p in pheno_list:
    csvfile = os.path.join(BASE, p)
    pheno = pd.read_csv(csvfile, sep="\t")

    data = pheno[pheno['session']=='BAS1']
    if data.shape[0] == 0:
        data = pheno[pheno['session']=='BAS2']
    if data.shape[0] != 1:
        continue

    ID = "sub-"+data['id'].values[0]
    session = data['session'].values[0]
    # NOTE: .values[0] raises IndexError if ID is absent from baseinfo.
    sex = baseinfo[baseinfo['participant_id']==ID]['sex'].values[0]
    imgfile = f"wm{ID}_ses-{session}_T1w.nii"

    if imgfile not in available_images:
        continue

    records.append({"Subject": ID, "Sex(1=m,2=f)": sex,
                    "Age": data['age'].values[0],
                    "Handedness": data['handedness_score'].values[0],
                    "Session": session,
                    "ImageFile": imgfile
                    })

result_df = pd.DataFrame(records, columns=result_columns)

In [None]:
# Remove every row that has a missing value in any column, then save the
# remaining table without the index column.
nki_total_csv = "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_total.csv"
result_df.dropna(axis="index", how="any", inplace=True)
result_df.to_csv(nki_total_csv, index=False)

In [None]:
# Two-stage split with a fixed seed: 10% of result_df becomes the test set,
# then 11% of the remainder becomes the validation set.
train_df, test_df = train_test_split(result_df, random_state=42, test_size=0.10)
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.11)

# Write each partition to its CSV (train, test, val), without the index column.
for split_frame, split_path in (
    (train_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_train.csv"),
    (test_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_test.csv"),
    (val_df, "/NFS/Users/moonsh/data/FLData/Phenotype/NKI-RK_Phenotype_val.csv"),
):
    split_frame.to_csv(split_path, index=False)

# Overlay the age histograms of the three partitions on one axes.
for split_frame, split_color, split_label in (
    (train_df, 'orange', "Train"),
    (test_df, 'blue', "Test"),
    (val_df, 'green', "Val"),
):
    plt.hist(split_frame["Age"], bins=10, alpha=0.5, color=split_color, label=split_label)
plt.legend()
plt.show()

## Independent Dataset

In [None]:
# Root folder of this section's datasets; DATASETNAME holds its directory listing.
BASE = "/NFS/Users/moonsh/thesis/data/"
DATASETNAME = os.listdir(path=BASE)

In [None]:
# Dataset processed by the cells below, and its
# "<dataname>_Phenotype_total_base.csv" table.
dataname = "OAS2"
pheno_base_csv = os.path.join(BASE, dataname, f"{dataname}_Phenotype_total_base.csv")
Pheno1 = pd.read_csv(pheno_base_csv)
# Pheno2 = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_total_base.csv"), encoding='latin-1')
# Pheno3 = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_MCI.csv" ))

# merge_df = pd.concat([Pheno1, Pheno2, Pheno3], ignore_index=True, axis=0)
# merge_df.drop(["ImageFile", 'Visit', 'Modality', 'Description', 'Type', 'Acq Date', "Format"], axis=1, inplace=True)

# # merge_df.rename({"Group": 'Control',
#                  }, inplace=True)
# merge_df['Sex(1=m,2=f)'] = merge_df['Sex(1=m,2=f)'].map({"M":1, "F":2})

# Pheno2.drop(['SUB_STUDY', "SUB_TYPE"], axis=1, inplace=True)

# merge_df = pd.merge(Pheno1, Pheno2, on="Subject", how="left")

In [None]:
# Rename "Sex" to the column name used by the JBNU and NKI-RK tables above.
Pheno1.rename(columns={"Sex": "Sex(1=m,2=f)"}, inplace=True)
# Pheno1['Sex(1=m,2=f)'] = Pheno1['Sex(1=m,2=f)'].map({"male":1, "female":2})

In [None]:
# Show Pheno1 after the column rename (rendered as the cell's output).
Pheno1

In [None]:
# Reduce each ImageFile entry to the text after its last "/".
# The vectorised .str accessor passes missing values through unchanged, whereas
# calling x.split(...) on a NaN entry inside a lambda raises AttributeError.
Pheno1['ImageFile'] = Pheno1['ImageFile'].str.rsplit("/", n=1).str[-1]

In [None]:
# Show Pheno1 again, now with ImageFile reduced to bare file names.
Pheno1

In [None]:
# Write the table to "<dataname>_Phenotype_total_base.csv" without the index
# column. This is the same path pattern Pheno1 was read from, so the file on
# disk is overwritten.
pheno_base_out = os.path.join(BASE, dataname, f"{dataname}_Phenotype_total_base.csv")
Pheno1.to_csv(pheno_base_out, index=False)

In [None]:
# Pheno1.dropna(subset=["Age", "Sex(1=m,2=f)"], axis=0, inplace=True)

In [None]:
# img_path = os.path.join("/NFS/Users/moonsh/data/FLData/Image/", dataname)
# List the entries of this dataset's "MRI" folder and show how many there are.
img_path = os.path.join("/NFS/Users/moonsh/thesis/data", dataname, "MRI")
img_list = os.listdir(path=img_path)
len(img_list)

In [None]:
import time  # not used by the active code in this cell; left in place in case other cells rely on it

# For every row of Pheno1, look for the first entry of img_list whose name
# contains "wm<column 1>_<column 0>.nii". Rows with a match are kept and their
# ImageFile is set to the matching entry; rows without a match are dropped.
#
# The matches are computed first and the frame is filtered once. The previous
# version concatenated one-row frames inside the loop, which is quadratic,
# converts every column to object dtype (rows were round-tripped through a
# transposed Series), and also read an unused `order` value on each iteration.
matched_files = []
for i in range(len(Pheno1)):
    # img_name = f"wmsub-{Pheno1.iloc[i,0]}"
    img_name = f"wm{Pheno1.iloc[i,1]}_{Pheno1.iloc[i,0]}.nii"
    # Substring test, first hit wins (same rule as the original inner loop + break).
    matched_files.append(next((img_l for img_l in img_list if img_name in img_l), None))

has_image = np.array([match is not None for match in matched_files], dtype=bool)

new_df = Pheno1.loc[has_image].copy()
new_df["ImageFile"] = [match for match in matched_files if match is not None]
# Same 0..n-1 index the original produced with ignore_index=True.
new_df = new_df.reset_index(drop=True)

In [None]:
# Show the image-matched table (rendered as the cell's output).
new_df

In [None]:
# Count rows per distinct value of the Control column.
new_df['Control'].value_counts()

In [None]:
# new_df.rename({'Group':'Control'}, inplace=True, axis=1)
# Collapse the raw Control labels to short codes. Any label that is not a key
# of the mapping becomes NaN.
control_codes = {"No_Known_Disorder": "HC",
                 "Schizophrenia_Strict": "SCZ",}
# Also map each short code to itself so that re-running this cell is harmless.
# Without this, a second run would turn the already-converted "HC"/"SCZ"
# values into NaN.
control_codes.update({code: code for code in list(control_codes.values())})
new_df['Control'] = new_df['Control'].map(control_codes)

In [None]:
# Count rows per Control value again (value_counts omits NaN by default).
new_df['Control'].value_counts()

In [None]:
# Discard, in place, the rows whose Control value is missing.
new_df.dropna(axis="index", subset=['Control'], inplace=True)

In [None]:
# Preview up to 30 random rows. Capping n at len(new_df) avoids the ValueError
# that DataFrame.sample raises when asked for more rows than the frame holds
# (sampling is without replacement by default).
new_df.sample(min(30, len(new_df)))

In [None]:
# Save the table as "<dataname>_Phenotype_total.csv" without the index column.
pheno_total_out = os.path.join(BASE, dataname, f"{dataname}_Phenotype_total.csv")
new_df.to_csv(pheno_total_out, index=False)

## EDA

In [None]:
# Root folder read by the EDA cells; DATASETNAME holds its directory listing.
BASE = "/NFS/Users/moonsh/thesis/data/"
DATASETNAME = os.listdir(path=BASE)

In [None]:
# Names of the datasets registered in dataset_dict.
train_test_data = [name for name in dataset_dict.keys()]
# Datasets plotted in the 3x3 grid below, in panel order.
ind_data = [
    'ADNI', 'COBRE', 'MCIC',
    'NUSDAST', 'OAS4', 'CoRR',
    'fcon1000', 'OAS2', 'SLIM',
]

In [None]:
# fig, axs = plt.subplots(2, 5, figsize=(20, 8))
# axs = axs.flatten()

# colors = sns.color_palette("Set3", len(train_test_data))

# N = 0
# M_N = 0
# F_N = 0
# AGE = []


# for i in range(len(train_test_data)):
#     dataname = train_test_data[i]
#     df = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_total.csv"))
#     print(dataname)
#     print(df['Age'].mean().round(2), "±", df['Age'].std().round(2))
#     print(len(df[df['Sex(1=m,2=f)']==1]), '/', len(df[df['Sex(1=m,2=f)']==2]))
#     print(int(df['Age'].min()),"-", int(df['Age'].max()))
#     print(len(df))
#     print("--------------------")

#     N += len(df)
#     M_N += len(df[df['Sex(1=m,2=f)']==1])
#     F_N += len(df[df['Sex(1=m,2=f)']==2])
#     AGE.extend(df['Age'].tolist())

#     axs[i].hist(df["Age"], bins=10, alpha=0.5, color=colors[i])

#     min_age = df["Age"].min()//5 * 5
#     max_age = df["Age"].max()//5 * 5

#     xticks = np.linspace(min_age, max_age, 6)
#     axs[i].set_xticks(xticks)

#     axs[i].set_title(dataname, fontsize=15)
#     axs[i].text(0.75, 0.9, f"N={df.shape[0]}", fontsize=12, transform=axs[i].transAxes,
#                 bbox=dict(facecolor='white', alpha=0.5))


# fig.supylabel("Frequency", fontsize=18, position=(-0.00001, 0.5))
# fig.supxlabel("Age", fontsize=18, position=(0.5, -0.00001))
# plt.tight_layout()

# plt.savefig("/NFS/Users/moonsh/thesis/asset/age_distribution.png", dpi=600, bbox_inches='tight', pad_inches=0.1)
# plt.show()

# print(np.array(AGE).mean())
# print(np.array(AGE).std())

In [None]:
# Draw a 3x3 grid of age histograms, one panel per entry of ind_data, print
# summary statistics for each dataset, save the figure, and finally print the
# pooled mean and standard deviation of the ages collected in AGE.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
axs = axs.flatten()

# Ten extra palette entries are requested so that colors[i+10] exists for the
# follow-up histograms drawn in the OAS2/SLIM branch.
colors = sns.color_palette("Set3", len(ind_data)+10)

# Running totals across datasets. They are only updated in the `else` branch
# below, so OAS2 and SLIM do not contribute to them.
# NOTE(review): N, M_N and F_N are accumulated but never read in this cell.
N = 0
M_N = 0
F_N = 0
AGE = []


for i in range(len(ind_data)):
    dataname = ind_data[i]

    # OAS2 and SLIM are read from two files (*_base.csv and *_follow.csv) and
    # drawn as two overlaid histograms.
    if dataname == 'OAS2' or dataname == 'SLIM':
        base = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_total_base.csv"))
        follow = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_total_follow.csv"))

        axs[i].hist(base["Age"], bins=10, alpha=0.5, color=colors[i], label="Base")
        axs[i].hist(follow["Age"], bins=10, alpha=0.5, color=colors[i+10], label='Follow-up')

        # Tick range: base minimum and follow-up maximum, each floored to a
        # multiple of 5.
        min_age = base["Age"].min()//5 * 5
        max_age = follow["Age"].max()//5 * 5

        # Printed summary: mean ± std (base, then follow-up), male/female counts
        # from the base table, age ranges, and the follow-up row count.
        print(dataname)
        print(base['Age'].mean().round(2), "±", base['Age'].std().round(2))
        print(follow['Age'].mean().round(2), "±", follow['Age'].std().round(2))

        print(len(base[base['Sex(1=m,2=f)']==1]), '/', len(base[base['Sex(1=m,2=f)']==2]))

        print(int(base['Age'].min()),"-", int(base['Age'].max()))
        print(int(follow['Age'].min()),"-", int(follow['Age'].max()))
        # NOTE(review): the printed count is len(follow), while the "N=" label
        # on the panel uses base.shape[0] — confirm which one is intended.
        print(len(follow))
        print("--------------------")

        # SLIM overrides the floored range with the raw base minimum and the
        # follow-up maximum plus 3; both branches then build six evenly spaced ticks.
        if dataname == 'SLIM':
            min_age = base["Age"].min()
            max_age = follow["Age"].max() + 3
            xticks = np.linspace(min_age, max_age, 6)
        else:
            xticks = np.linspace(min_age, max_age, 6)
        axs[i].set_xticks(xticks)

        axs[i].set_title(dataname, fontsize=15)
        axs[i].text(0.80, 0.87, f"N={base.shape[0]}", fontsize=12, transform=axs[i].transAxes,
                    bbox=dict(facecolor='white', alpha=0.5))
        
        axs[i].legend(loc='upper left')

    # Every other dataset is read from a single *_Phenotype_total.csv file.
    else:
        df = pd.read_csv(os.path.join(BASE, dataname, f"{dataname}_Phenotype_total.csv"))
        # Printed summary: mean ± std of age, male/female counts, age range, row count.
        print(dataname)
        print(df['Age'].mean().round(2), "±", df['Age'].std().round(2))
        print(len(df[df['Sex(1=m,2=f)']==1]), '/', len(df[df['Sex(1=m,2=f)']==2]))
        print(int(df['Age'].min()),"-", int(df['Age'].max()))
        print(len(df))
        print("--------------------")

        # Add this dataset to the running totals.
        N += len(df)
        M_N += len(df[df['Sex(1=m,2=f)']==1])
        F_N += len(df[df['Sex(1=m,2=f)']==2])
        AGE.extend(df['Age'].tolist())

        axs[i].hist(df["Age"], bins=10, alpha=0.5, color=colors[i])

        # Six evenly spaced ticks between the minimum and maximum age, each
        # floored to a multiple of 5.
        min_age = df["Age"].min()//5 * 5
        max_age = df["Age"].max()//5 * 5

        xticks = np.linspace(min_age, max_age, 6)
        axs[i].set_xticks(xticks)

        axs[i].set_title(dataname, fontsize=15)
        axs[i].text(0.80, 0.9, f"N={df.shape[0]}", fontsize=12, transform=axs[i].transAxes,
                    bbox=dict(facecolor='white', alpha=0.5))


# Shared axis labels for the whole grid.
fig.supylabel("Frequency", fontsize=18, position=(-0.00001, 0.5))
fig.supxlabel("Age", fontsize=18, position=(0.5, -0.00001))
plt.tight_layout()

plt.savefig("/NFS/Users/moonsh/thesis/asset/ind_age_distribution.png", dpi=600, bbox_inches='tight', pad_inches=0.1)
plt.show()

# Pooled mean and standard deviation of the ages gathered in AGE, i.e. only the
# datasets handled by the `else` branch above.
print(np.array(AGE).mean())
print(np.array(AGE).std())