In [1]:
import pandas as pd
import pickle
import argparse
import os
import glob
import pydicom
import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
combined_roi_dict = {
    'Left-Cingulate': {
        'ctx-lh-caudalanteriorcingulate',
        'ctx-lh-rostralanteriorcingulate',
        'ctx-lh-isthmuscingulate',
        'ctx-lh-posteriorcingulate',
    },
    'Right-Cingulate': {
        'ctx-rh-caudalanteriorcingulate',
        'ctx-rh-rostralanteriorcingulate',
        'ctx-rh-isthmuscingulate',
        'ctx-rh-posteriorcingulate',
    },
    'Left-Frontal': {
        'ctx-lh-caudalmiddlefrontal',
        'ctx-lh-lateralorbitofrontal',
        'ctx-lh-medialorbitofrontal',
        'ctx-lh-parsopercularis',
        'ctx-lh-parsorbitalis',
        'ctx-lh-parstriangularis',
        'ctx-lh-precentral',
        'ctx-lh-rostralmiddlefrontal',
        'ctx-lh-superiorfrontal',
        'ctx-lh-paracentral'
    },
    'Right-Frontal': {
        'ctx-rh-caudalmiddlefrontal',
        'ctx-rh-lateralorbitofrontal',
        'ctx-rh-medialorbitofrontal',
        'ctx-rh-parsopercularis',
        'ctx-rh-parsorbitalis',
        'ctx-rh-parstriangularis',
        'ctx-rh-precentral',
        'ctx-rh-rostralmiddlefrontal',
        'ctx-rh-superiorfrontal',
        'ctx-rh-paracentral'
    },
    'Left-Temporal': {
        'ctx-lh-entorhinal',
        'ctx-lh-fusiform',
        'ctx-lh-inferiortemporal',
        'ctx-lh-middletemporal',
        'ctx-lh-superiortemporal',
        'ctx-lh-transversetemporal',
        'ctx-lh-parahippocampal',
        'Left-Hippocampus',
        'Left-Amygdala',
    },
    'Right-Temporal': {
        'ctx-rh-entorhinal',
        'ctx-rh-fusiform',
        'ctx-rh-inferiortemporal',
        'ctx-rh-middletemporal',
        'ctx-rh-superiortemporal',
        'ctx-rh-transversetemporal',
        'ctx-rh-parahippocampal',
        'Right-Hippocampus',
        'Right-Amygdala',
    },
    'Left-Parietal': {
        'ctx-lh-inferiorparietal',
        'ctx-lh-postcentral',
        'ctx-lh-superiorparietal',
        'ctx-lh-supramarginal',
        'ctx-lh-precuneus'
    },
    'Right-Parietal': {
        'ctx-rh-inferiorparietal',
        'ctx-rh-postcentral',
        'ctx-rh-superiorparietal',
        'ctx-rh-supramarginal',
        'ctx-rh-precuneus'
    },
    'Left-Occipital': {
        'ctx-lh-cuneus',
        'ctx-lh-lateraloccipital',
        'ctx-lh-lingual',
        'ctx-lh-pericalcarine'
    },
    'Right-Occipital': {
        'ctx-rh-cuneus',
        'ctx-rh-lateraloccipital',
        'ctx-rh-lingual',
        'ctx-rh-pericalcarine'
    },
    'Left-Insula':{
        'ctx-lh-insula'
    },
    'Right-Insula': {
        'ctx-rh-insula'
    },
    'Left-Cerebellum':{
        'Left-Cerebellum-White-Matter',
        'Left-Cerebellum-Cortex'
    },
    'Right-Cerebellum': {
        'Right-Cerebellum-White-Matter',
        'Right-Cerebellum-Cortex'
    },
    'Left-Combined-Cerebral-WM': {
        'Left-Cerebral-White-Matter',
        'Left-WM-hypointensities'
    },
    'Right-Combined-Cerebral-WM': {
        'Right-Cerebral-White-Matter',
        'Right-WM-hypointensities'
    },
    'Left-Total-Lateral-Ventricle': {
        'Left-Lateral-Ventricle',
        'Left-Inf-Lat-Vent'
    },
    'Right-Total-Lateral-Ventricle': {
        'Right-Lateral-Ventricle',
        'Right-Inf-Lat-Vent'
    },
}
gray_matter_index = {
    'left_cortical_gm': {
        'left_cingulate',
        'left_frontal',
        'left_temporal',
        'left_parietal',
        'left_occipital',
        'left_insula'
    },
    'right_cortical_gm': {
        'right_cingulate',
        'right_frontal',
        'right_temporal',
        'right_parietal',
        'right_occipital',
        'right_insula'
    }
}

combined_roi_display_dict = {
    'Left-Cingulate': {
        'ctx-lh-caudalanteriorcingulate',
        'ctx-lh-rostralanteriorcingulate',
        'ctx-lh-isthmuscingulate',
        'ctx-lh-posteriorcingulate',
    },
    'Right-Cingulate': {
        'ctx-rh-caudalanteriorcingulate',
        'ctx-rh-rostralanteriorcingulate',
        'ctx-rh-isthmuscingulate',
        'ctx-rh-posteriorcingulate',
    },
    'Left-Frontal': {
        'ctx-lh-caudalmiddlefrontal',
        'ctx-lh-lateralorbitofrontal',
        'ctx-lh-medialorbitofrontal',
        'ctx-lh-parsopercularis',
        'ctx-lh-parsorbitalis',
        'ctx-lh-parstriangularis',
        'ctx-lh-precentral',
        'ctx-lh-rostralmiddlefrontal',
        'ctx-lh-superiorfrontal',
        'ctx-lh-paracentral'
    },
    'Right-Frontal': {
        'ctx-rh-caudalmiddlefrontal',
        'ctx-rh-lateralorbitofrontal',
        'ctx-rh-medialorbitofrontal',
        'ctx-rh-parsopercularis',
        'ctx-rh-parsorbitalis',
        'ctx-rh-parstriangularis',
        'ctx-rh-precentral',
        'ctx-rh-rostralmiddlefrontal',
        'ctx-rh-superiorfrontal',
        'ctx-rh-paracentral'
    },
    'Left-Temporal': {
        'ctx-lh-entorhinal',
        'ctx-lh-fusiform',
        'ctx-lh-inferiortemporal',
        'ctx-lh-middletemporal',
        'ctx-lh-superiortemporal',
        'ctx-lh-transversetemporal',
        'ctx-lh-parahippocampal',
    },
    'Right-Temporal': {
        'ctx-rh-entorhinal',
        'ctx-rh-fusiform',
        'ctx-rh-inferiortemporal',
        'ctx-rh-middletemporal',
        'ctx-rh-superiortemporal',
        'ctx-rh-transversetemporal',
        'ctx-rh-parahippocampal',
    },
    'Left-Parietal': {
        'ctx-lh-inferiorparietal',
        'ctx-lh-postcentral',
        'ctx-lh-superiorparietal',
        'ctx-lh-supramarginal',
        'ctx-lh-precuneus'
    },
    'Right-Parietal': {
        'ctx-rh-inferiorparietal',
        'ctx-rh-postcentral',
        'ctx-rh-superiorparietal',
        'ctx-rh-supramarginal',
        'ctx-rh-precuneus'
    },
    'Left-Occipital': {
        'ctx-lh-cuneus',
        'ctx-lh-lateraloccipital',
        'ctx-lh-lingual',
        'ctx-lh-pericalcarine'
    },
    'Right-Occipital': {
        'ctx-rh-cuneus',
        'ctx-rh-lateraloccipital',
        'ctx-rh-lingual',
        'ctx-rh-pericalcarine'
    },
    'Left-Insula':{
        'ctx-lh-insula'
    },
    'Right-Insula': {
        'ctx-rh-insula'
    },
    'Left-Cerebellum':{
        'Left-Cerebellum-White-Matter',
        'Left-Cerebellum-Cortex'
    },
    'Right-Cerebellum': {
        'Right-Cerebellum-White-Matter',
        'Right-Cerebellum-Cortex'
    },
    'Left-Combined-Cerebral-WM': {
        'Left-Cerebral-White-Matter',
        'Left-WM-hypointensities'
    },
    'Right-Combined-Cerebral-WM': {
        'Right-Cerebral-White-Matter',
        'Right-WM-hypointensities'
    }
}

In [3]:
def load_pickle_file(filepath):
    with open(filepath, 'rb') as file:
        data = pickle.load(file)
    return data
# 파일열어보기 , PICKLE로 되어있는것

In [26]:
import os
import pandas as pd
import pickle

def load_pickle_file(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)
        
def find_volumes_files(root_path):
    volumes_files = []
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file == 'volumes.pickle':
                volumes_files.append(os.path.join(root, file))
    return volumes_files

def extract_subject_data(volumes_file_path):
    subject_name = os.path.dirname(volumes_file_path).split(os.sep)[-3]
    volumes_data = load_pickle_file(volumes_file_path)
    volumes_data['subject'] = subject_name
    return volumes_data

def compile_data(volumes_files):
    data = []
    for volumes_file in volumes_files:
        subject_data = extract_subject_data(volumes_file)
        data.append(subject_data)
    df = pd.DataFrame(data)
    # You might need to adjust the DataFrame to ensure it has the structure you want
    return df

root_path = '/home/limseoyoung/drwelly1/SeoyoungLim/drwelly-debug/drwelly_OASIS2_before_swift_20240408'
volumes_files = find_volumes_files(root_path)
df = compile_data(volumes_files)


In [27]:
df['subject'] = df['subject'].str.replace('_Tag_pr100_sr100_ppf100_spf100_ipf100_sipf179', '')


In [28]:
df

Unnamed: 0,total_intracranial_volume,unknown,left_cerebral_white_matter,left_lateral_ventricle,left_inf_lat_vent,left_cerebellum_white_matter,left_cerebellum_cortex,left_thalamus,left_caudate,left_putamen,...,cerebellum,cerebellum_white_matter,cerebellum_cortex,lateral_ventricle,inf_lat_vent,total_lateral_ventricle,wm_hypointensities,HippocampusOccupancyScore,HippocampusAsymmetryIndex,subject
0,1.443881e+06,1.537269e+07,185202.840538,25318.200707,1787.029909,9328.227523,39829.500965,6381.171674,3250.953834,4474.434771,...,98762.728954,18141.955073,80620.773881,49401.601476,3490.710822,52892.312298,7220.835631,0.593842,0.040301,OAS2_0063_MR2_mpr-1
1,1.414752e+06,1.540181e+07,190451.769270,8951.613543,928.157953,10023.488488,46148.246642,6911.449647,2875.711853,4994.422745,...,111261.305316,19681.338994,91579.966321,17142.796124,1778.797909,18921.594033,1304.771933,0.785388,0.026640,OAS2_0174_MR3_mpr-2
2,1.460055e+06,1.535651e+07,199021.280832,15967.335184,1325.351932,11514.852412,51197.549384,6187.719684,3683.133812,4339.978778,...,126441.112540,22598.896845,103842.215695,31464.074393,2739.883860,34203.958253,3023.201846,0.691609,0.001787,OAS2_0056_MR2_mpr-2
3,1.496985e+06,1.531958e+07,212807.822128,33314.216298,1983.911899,10724.237452,45802.845660,6791.399653,3199.503837,4049.800793,...,112692.987243,20929.858931,91763.128312,64415.396709,3585.035817,68000.432526,26518.700645,0.588046,0.082106,OAS2_0053_MR2_mpr-2
4,1.582709e+06,1.523386e+07,213230.741106,19041.301027,1009.105948,12694.772351,49138.177490,7028.412641,3231.059835,4593.455765,...,123982.831666,24906.600728,99076.230938,37320.113093,2036.733896,39356.846989,2322.109881,0.791433,0.001331,OAS2_0090_MR2_mpr-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,1.501324e+06,1.531524e+07,186479.829473,24784.149734,1743.811911,9129.630534,45565.489672,5489.028720,3035.549845,4230.904784,...,107979.481483,17738.587094,90240.894390,45783.294661,3019.428846,48802.723507,7443.785620,0.689620,0.034415,OAS2_0080_MR1_mpr-1
1284,1.516973e+06,1.529959e+07,196881.989941,25637.533690,1686.873914,11146.813431,45632.374669,6977.305644,4311.509780,4865.797751,...,113164.269218,21736.251890,91428.017329,50271.792432,3190.242837,53462.035269,19437.466007,0.658749,0.015160,OAS2_0178_MR3_mpr-3
1285,1.450921e+06,1.536565e+07,195842.356994,31444.523394,2052.854895,16265.402169,50961.565396,6924.140646,3790.835806,4222.329784,...,132656.958223,31777.919376,100879.038846,57006.254088,4389.370776,61395.624863,8628.850559,0.626252,0.070949,OAS2_0046_MR1_mpr-3
1286,1.372976e+06,1.544359e+07,182240.692689,14579.557255,1076.333945,9566.269511,39490.959982,6133.868687,2937.108850,4088.216791,...,99779.723902,18979.561030,80800.162872,28458.708546,2045.308896,30504.017442,2996.790847,0.769164,0.028514,OAS2_0027_MR3_mpr-1


In [45]:
# 'mpr-x'에 따라 4개의 데이터프레임으로 분할
volume_df_mpr1 = df[df['subject'].str.contains('MR1')]
volume_df_mpr2 = df[df['subject'].str.contains('MR2')]
volume_df_mpr3 = df[df['subject'].str.contains('MR3')]
volume_df_mpr4 = df[df['subject'].str.contains('MR4')]

In [47]:
print(volume_df_mpr1.shape)
print(volume_df_mpr2.shape)
print(volume_df_mpr3.shape)
print(volume_df_mpr4.shape)

(540, 145)
(505, 145)
(181, 145)
(45, 145)


In [48]:

def extract_number(subj):
    parts = subj.split('_')  # '_'를 기준으로 문자열 분리
    if len(parts) > 1:
        number_part = parts[1]  # "OAS1_" 다음 오는 부분을 선택
        # 숫자만 추출
        numbers = ''.join(filter(str.isdigit, number_part))
        return int(numbers) if numbers.isdigit() else None
    return None

In [49]:

volume_df_mpr1 ['sort_key'] = volume_df_mpr1['subject'].apply(extract_number)  # 정렬 키 생성
volume_df_sorted1 = volume_df_mpr1 .sort_values(by='sort_key').drop(columns=['sort_key'])  # 정렬 및 정렬 키 컬럼 삭제


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_df_mpr1 ['sort_key'] = volume_df_mpr1['subject'].apply(extract_number)  # 정렬 키 생성


In [50]:

volume_df_mpr2 ['sort_key'] = volume_df_mpr2['subject'].apply(extract_number)  # 정렬 키 생성
volume_df_sorted2 = volume_df_mpr2.sort_values(by='sort_key').drop(columns=['sort_key'])  # 정렬 및 정렬 키 컬럼 삭제


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_df_mpr2 ['sort_key'] = volume_df_mpr2['subject'].apply(extract_number)  # 정렬 키 생성


In [51]:

volume_df_mpr3['sort_key'] = volume_df_mpr3['subject'].apply(extract_number)  # 정렬 키 생성
volume_df_sorted3 = volume_df_mpr3.sort_values(by='sort_key').drop(columns=['sort_key'])  # 정렬 및 정렬 키 컬럼 삭제


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_df_mpr3['sort_key'] = volume_df_mpr3['subject'].apply(extract_number)  # 정렬 키 생성


In [52]:
volume_df_mpr4['sort_key'] = volume_df_mpr4['subject'].apply(extract_number)  # 정렬 키 생성
volume_df_sorted4 = volume_df_mpr4.sort_values(by='sort_key').drop(columns=['sort_key'])  # 정렬 및 정렬 키 컬럼 삭제


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_df_mpr4['sort_key'] = volume_df_mpr4['subject'].apply(extract_number)  # 정렬 키 생성


In [53]:

df['sort_key'] = df['subject'].apply(extract_number)  # 정렬 키 생성
df_final= df .sort_values(by='sort_key').drop(columns=['sort_key'])  # 정렬 및 정렬 키 컬럼 삭제


In [54]:
import numpy as np

In [55]:
import numpy as np

In [56]:
def update_df_columns_to_roi(df, roi_dict):
    columns_to_remove = []  # 제거할 원본 컬럼들을 저장할 리스트
    new_columns_data = {}   # 새로운 컬럼의 데이터를 저장할 사전

    # 데이터프레임의 모든 컬럼을 순회
    for col in df.columns:
        matched = False
        for roi, sub_columns in roi_dict.items():
            # 하이픈을 언더스코어로 변경한 후 매칭 시도
            if any(col.endswith(sub_col.replace('-', '_')) for sub_col in sub_columns):
                matched = True
                # 새로운 컬럼 데이터에 합산 처리
                if roi not in new_columns_data:
                    new_columns_data[roi] = df[col].fillna(0)
                else:
                    new_columns_data[roi] += df[col].fillna(0)
                
                # 컬럼 제거 리스트에 추가
                if col not in columns_to_remove:
                    columns_to_remove.append(col)
                break  # 하위 요소에 매치되면 더 이상의 검사 중단
        if not matched:
            # 매치되지 않은 컬럼은 새로운 데이터에 그대로 유지
            new_columns_data[col] = df[col]

    # 새로운 데이터프레임 생성
    new_df = pd.DataFrame(new_columns_data)

    return new_df

# 함수 실행하여 데이터프레임 업데이트, 'subject' 컬럼 유지
new_df1 = update_df_columns_to_roi(volume_df_mpr1.copy(), combined_roi_display_dict)
new_df2 = update_df_columns_to_roi(volume_df_mpr2.copy(), combined_roi_display_dict)

new_df3 = update_df_columns_to_roi(volume_df_mpr3.copy(), combined_roi_display_dict)
new_df4 = update_df_columns_to_roi(volume_df_mpr4.copy(), combined_roi_display_dict)

In [57]:
columns_to_include = [
   'whole_brain',	'cortical_gm',	'combined_cerebral_wm',	'frontal','temporal','parietal'	,'occipital','insula'	,'thalamus'	,'caudate','putamen','pallidum','hippocampus','amygdala', 'accumbens_area'	, 'ventraldc' ,'cingulate' ,'cerebellum', 'cerebellum_white_matter' ,'cerebellum_cortex','lateral_ventricle', 'inf_lat_vent',	'total_lateral_ventricle', 'wm_hypointensities',	'HippocampusOccupancyScore',	
    'subject'  # 'subject' 컬럼 추가
]

filtered_df1 = volume_df_mpr1[columns_to_include].copy()
filtered_df2 = volume_df_mpr2[columns_to_include].copy()
filtered_df3 = volume_df_mpr3[columns_to_include].copy()
filtered_df4 = volume_df_mpr4[columns_to_include].copy()

In [58]:
#filtered_df1.to_csv('session1_oasis2_atropy_20240418_before.csv')

In [59]:
#filtered_df2.to_csv('session2_oasis2_atropy_20240418_before.csv')

In [60]:
#filtered_df3.to_csv('session3_oasis2_atropy_20240418_before.csv')

In [61]:
#filtered_df4.to_csv('session4_oasis2_atropy_20240418_before.csv')

In [24]:
df_all = df[columns_to_include].copy()

In [25]:
df_all

Unnamed: 0,whole_brain,cortical_gm,combined_cerebral_wm,frontal,temporal,parietal,occipital,insula,thalamus,caudate,...,cingulate,cerebellum,cerebellum_white_matter,cerebellum_cortex,lateral_ventricle,inf_lat_vent,total_lateral_ventricle,wm_hypointensities,HippocampusOccupancyScore,subject
0,9.338548e+05,403511.353385,366049.581299,147694.077454,97693.941009,85014.946657,42651.704821,10897.795443,12787.725347,5711.292708,...,19558.888001,106929.215537,18798.114040,88131.101497,16437.931160,1676.926914,18114.858075,1229.997937,0.788825,OAS2_0174_MR3_mpr-2
1,1.203568e+06,491799.548874,501640.218371,167714.300432,115792.678084,111471.221305,57503.604062,12803.503346,15162.314225,8366.798573,...,26514.241645,138131.237943,25875.232678,112256.005265,31510.722390,2368.757879,33879.480269,3632.712814,0.779730,OAS2_0171_MR1_mpr-1
2,1.104226e+06,448775.689072,454524.368778,155549.463053,103996.565687,106205.828574,50029.634444,11167.050429,15460.381210,7787.471602,...,21827.146885,127998.675461,25965.784673,102032.890787,63909.471735,4158.874788,68068.346522,4924.107748,0.648719,OAS2_0175_MR1_mpr-3
3,1.016428e+06,435742.375738,410560.346025,147679.328455,103617.207706,101083.466836,50299.918430,12084.918383,12795.271346,6126.665687,...,20977.535928,112441.225255,19451.186006,92990.039249,63302.018766,3342.534829,66644.553595,9640.357507,0.666633,OAS2_0185_MR1_mpr-1
4,1.090008e+06,441127.475463,450981.178960,153295.610168,102065.475786,103895.723692,49113.824491,11057.290435,15206.561223,7982.638592,...,21699.550891,125314.357598,23159.701817,102154.655781,67646.456544,4696.355760,72342.812304,8218.279580,0.611019,OAS2_0175_MR2_mpr-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,8.961049e+05,363062.051451,374342.291875,131521.628281,90025.147401,77797.541025,34697.192227,10929.694442,10386.382469,7658.160609,...,18090.848076,103575.018708,16930.479135,86644.539573,43948.930755,2862.677854,46811.608608,7125.824636,0.694184,OAS2_0080_MR1_mpr-1
408,9.414170e+05,377647.096706,392722.631936,135996.063052,81806.524821,92891.941254,38174.183050,11037.739436,13058.352333,8467.640567,...,17740.645094,108759.806443,20675.352944,88084.453500,48239.174535,3038.293845,51277.468380,18740.833043,0.662050,OAS2_0178_MR3_mpr-3
409,9.471591e+05,368961.994150,388520.882151,132221.691245,84800.228668,83827.137717,40666.763922,10323.613473,12745.536349,7263.710629,...,17122.559125,127257.452498,30303.362452,96954.090047,54792.189201,4205.179785,58997.368986,8241.946579,0.627158,OAS2_0046_MR1_mpr-3
410,8.559836e+05,353388.765945,354143.708907,123952.990667,80626.261881,80796.389872,42397.541834,9385.508520,11505.248412,5826.883702,...,16230.073171,95731.981109,18075.756077,77656.225033,27322.006604,1952.012900,29274.019504,2830.092855,0.770967,OAS2_0027_MR3_mpr-1
