# Data Prepro
CPACでPreProしたものは複数回被験者がいない\
CPACのPreProにいて、RevisedNew_Shimane3TsubjectsData.xlsxにいない被験者が１名(D6115)\
1回被験者でCPACにいない被験者が2名(D5389,D6190)\
すべてを考慮した結果被験者は616名

## Install and Import

In [1]:
"""%%bash
python3 -m pip install --user numpy
python3 -m pip install --user pandas
python3 -m pip install --user tqdm
python3 -m pip install --user ipywidgets
python3 -m pip install --user matplotlib
python3 -m pip install --user xlrd
python3 -m pip install --user openpyxl"""

'%%bash\npython3 -m pip install --user numpy\npython3 -m pip install --user pandas\npython3 -m pip install --user tqdm\npython3 -m pip install --user ipywidgets\npython3 -m pip install --user matplotlib\npython3 -m pip install --user xlrd\npython3 -m pip install --user openpyxl'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Data load

### time series data load

In [3]:
data_path = Path('../01_data/roi_timeseries')

data_files_aal = (data_path).rglob('_selector_CSF-2mm-M_aC-CSF+WM-2mm-DPC5_M-SDB_P-2_BP-B0.01-T0.1/\
_mask_aal_mask_pad_mask_file_..resources..aal_mask_pad.nii.gz/roi_stats.csv')

data_files_ho = (data_path).rglob('_selector_CSF-2mm-M_aC-CSF+WM-2mm-DPC5_M-SDB_P-2_BP-B0.01-T0.1/\
_mask_ho_mask_pad_mask_file_..resources..ho_mask_pad.nii.gz/roi_stats.csv')

In [4]:
#ファイル名からcsvファイルを取得＋subIDを付加
def file2df(data_files):
    df_list = []
    for t in tqdm(data_files):
        df_temp = pd.read_csv(t,header = 1)
        
        t_ = str(t)
        fd0000 = t_.find('0000')
        fdD    = t_.find('D',fd0000)
        fd_    = t_.find('_',fd0000)
        
        s = t_[fdD:fd_]
        df_temp.insert(0,'subID',s)
        
        df_list.append(df_temp)
    return pd.concat(df_list)

#columnsに付いている無駄なスペースを削除
def rem_space_columns(df):
    lis_new = []
    for i in df.columns:
        lis_new.append(i.replace(' ',''))
    df.set_axis(lis_new, axis='columns',inplace = True)
    #df.rename(columns={key:val for key, val in zip(df.columns,lis_new)},inplace = True) 
    return df

In [5]:
df_aal = file2df(data_files_aal)
df_aal = rem_space_columns(df_aal)
df_aal.reset_index(inplace=True)

df_ho = file2df(data_files_ho)
df_ho = rem_space_columns(df_ho)
df_ho.reset_index(inplace=True)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

### ROI label data load

In [6]:
df_labels_aal = pd.read_csv('../01_data/roi_atlas/aal_labels.csv',header = 1)
df_labels_aal.set_axis(['number','roi'], axis='columns',inplace = True)
df_labels_ho = pd.read_csv('../01_data/roi_atlas/ho_labels.csv',header = 1)
df_labels_ho.set_axis(['number','roi'], axis='columns',inplace = True)

def change_labels_name(df_labels):
    lis = list(df_labels.roi)
    lis_new = []
    for i in lis:
        i = i.replace(' ','_')
        i = i.replace(';','')
        i = i.replace("'","_")
        i = i.replace('(','')
        i = i.replace(')','')
        lis_new.append(i)
    df_labels.roi = lis_new
    return df_labels

df_labels_aal = change_labels_name(df_labels_aal)
df_labels_ho = change_labels_name(df_labels_ho)
df_ho.drop('Mean_3455', axis=1, inplace=True)

### change timeseries name

In [7]:
def change_columns_name(df,df_labels):
    lis = list(df.columns)
    for num in df_labels.number:
        temp = [str(num) in i for i in df.columns]
        lis[temp.index(True)] = df_labels[df_labels.number == num].roi.iloc[0]
        df.set_axis(lis, axis='columns',inplace = True)
    return df

df_aal = change_columns_name(df_aal,df_labels_aal)
df_ho  = change_columns_name(df_ho ,df_labels_ho)

### subjects data load

In [5]:
df_subjects_data = pd.read_excel('../01_data/shimane_basicInfo/RevisedNew_Shimane3TsubjectsData.xlsx',engine='openpyxl')
df_subjects_data.drop('Unnamed: 102',axis=1,inplace=True)

## ROI Standardization

In [9]:
#標準化前
print(np.mean(df_aal[df_aal.subID == 'D5340'].Precentral_L))
print(np.std(df_aal[df_aal.subID == 'D5340'].Precentral_L))

0.025446614285714323
9.0354087866445


In [10]:
def standardization(df):
    for subID in tqdm(df.subID.unique()):
        df_ = df[df['subID'] == subID]
        df_ = df_.drop(['index','subID'],axis=1)
        data = np.array(df_)
        data -= np.mean(data)
        data /= np.std(data)
        roi_name = [i for i in list(df.columns).copy() if i != 'index' and i != 'subID']
        df.loc[df['subID'] == subID,roi_name] = data
    return df

In [11]:
df_aal = standardization(df_aal)
df_ho  = standardization(df_ho)

df_aal.rename(columns={'index': 'time'}, inplace=True)
df_ho.rename(columns={'index': 'time'}, inplace=True)

  0%|          | 0/617 [00:00<?, ?it/s]

  0%|          | 0/617 [00:00<?, ?it/s]

In [12]:
#標準化後
print(np.mean(df_aal[df_aal.subID == 'D5340'].Precentral_L))
print(np.std(df_aal[df_aal.subID == 'D5340'].Precentral_L))
#-1.2688263138573217e-17
#1.0

-0.0018505632030283263
0.5405966433109262


In [13]:
df_aal.shape
df_ho.shape
df_labels_aal.shape
df_labels_ho.shape

(86380, 118)

(86380, 112)

(116, 2)

(110, 2)

## Dynamic FC

In [14]:
def dynamic_FC_fast(df,window_size = 18,extra = 2):
    '''
    window_size：TR=2.5より、45秒間
    extra：dfのcolumnsの最初のindex,subIDを無視
    dynamic_FC：被験者あたり２４分かかる(ROIの数に依る)
    dynamic_FC_fast：被験者あたり２分以下で終わる(ROIの数に依る)
    '''
    
    lis_col = list(df.columns)[extra:]#ROI名
    lis_col_new = list(df.columns)[:extra]#dynamic_FCのcolumns
    
    for m in range(len(lis_col)):
        for n in range(m+1,len(lis_col)):
            lis_col_new.append(lis_col[m] + '_' + lis_col[n])
    
    lis = df.iloc[:,2:].values.tolist()#全データをlistに変換
    num_take = int(len(lis)/len(df.subID.unique()))#一人あたり撮像回数
    sub_list = list(df.subID.unique())
    
    lis_dynamic = []#ある被験者についてのdynamic FC情報
    
    for c, subID in enumerate(tqdm(sub_list)):
        lis_temp = lis[c*num_take:c*num_take+num_take]#ある被験者についてのtimeseries情報
        for i in range(len(lis_temp)-window_size+1):
            lis_dynamic_temp = lis_temp[i:i+window_size]
            lis_df = []#計算後DataFrameとなるlist
            lis_df.append(f'{i}-{i+window_size-1}')
            lis_df.append(f'{subID}')
            for j in range(len(lis_col)):
                for k in range(j+1,len(lis_col)):
                    lis_dynamic_temp1 = []
                    lis_dynamic_temp2 = []
                    for l in lis_dynamic_temp:
                        lis_dynamic_temp1.append(l[j])
                        lis_dynamic_temp2.append(l[k])
                    corr = np.corrcoef(lis_dynamic_temp1,lis_dynamic_temp2)[0,1]
                    if np.isnan(corr):
                        lis_df.append(0)
                    else:
                        lis_df.append(corr)
            lis_dynamic.append(lis_df)
    df_dynamic = pd.DataFrame(lis_dynamic,columns=lis_col_new)
    return df_dynamic

In [15]:
#1日程度の時間がかかる
df_dynamic_aal = dynamic_FC_fast(df_aal)
df_dynamic_aal.shape
df_dynamic_aal.rename(columns={'index': 'time'}, inplace=True)
df_dynamic_aal.reset_index(inplace=True)
df_dynamic_aal.drop('index',axis=1,inplace=True)

  0%|          | 0/617 [00:00<?, ?it/s]

  c /= stddev[:, None]
  c /= stddev[None, :]


(75891, 6672)

In [6]:
#1日程度の時間がかかる
df_dynamic_ho = dynamic_FC_fast(df_ho)
df_dynamic_ho.shape
df_dynamic_ho.rename(columns={'index': 'time'}, inplace=True)
df_dynamic_ho.reset_index(inplace=True)
df_dynamic_ho.drop('index',axis=1,inplace=True)

  0%|          | 0/617 [00:00<?, ?it/s]

##　存在する被験者のデータのみにsubjects_infoを絞る

In [16]:
def subjects_once(df = df_aal, df_sub = df_subjects_data):
    sub_lis = list(df[['subID']].drop_duplicates()['subID'])
    for i in range(df_sub.shape[0]):
        subID = df_sub['subID'][i][df_sub['subID'][i].find('D'):]
        if subID in sub_lis:
            df_sub.loc[i,'subID'] = df_sub['subID'][i][df_sub['subID'][i].find('D'):]
    return df_sub[[False if i[0]=='2' else True for i in df_sub['subID']]]#入ってないやつは2から始まる

def subjects_not_in(df,df_sub=df_subjects_data):
    lis_not_in = list(set(list(df[['subID']].drop_duplicates()['subID'])) - set(df_sub.subID))
    return df[[False if i in lis_not_in else True for i in df['subID']]]

In [17]:
df_subjects_data = df_subjects_data.rename(columns={'NewID': 'subID'})
df_subjects_data = subjects_once(df = df_aal, df_sub = df_subjects_data)
df_subjects_data = df_subjects_data.sort_values('subID').reset_index(drop=True)

df_aal = subjects_not_in(df=df_aal,df_sub=df_subjects_data)
df_aal = df_aal.sort_values(['subID','time']).reset_index(drop=True)

df_ho = subjects_not_in(df=df_ho,df_sub=df_subjects_data)
df_ho = df_ho.sort_values(['subID','time']).reset_index(drop=True)

In [18]:
df_dynamic_aal = subjects_not_in(df=df_dynamic_aal,df_sub=df_subjects_data)
df_dynamic_aal = df_dynamic_aal.sort_values(['subID','time']).reset_index(drop=True)

In [None]:
df_dynamic_ho = subjects_not_in(df=df_dynamic_ho, df_sub=df_subjects_data)
df_dynamic_ho = df_dynamic_ho.sort_values(['subID','time']).reset_index(drop=True)

## randomに被験者の順番を並べ替える

In [82]:
def sort_df(df):
    np.random.seed(0)
    sort = np.array(df.subID.drop_duplicates())
    sort = np.random.choice(sort, len(sort), replace=False)
    sort = list(sort)
    
    if 'time' in df.columns:
        df['sort'] = df['subID'].apply(lambda x: sort.index(x) if x in sort else -1)
        df = df.sort_values(['sort','time']).reset_index(drop=True).drop('sort', axis=1)
    else:
        df['sort'] = df['subID'].apply(lambda x: sort.index(x) if x in sort else -1)
        df = df.sort_values(['sort']).reset_index(drop=True).drop('sort', axis=1)
    return df

In [83]:
%%time
df_aal = sort_df(df_aal)
df_ho  = sort_df(df_ho)
df_subjects_data = sort_df(df_subjects_data)
df_dynamic_aal = sort_df(df_dynamic_aal)
df_dynamic_ho  = sort_df(df_dynamic_ho)

CPU times: user 9.73 s, sys: 4.72 s, total: 14.4 s
Wall time: 14.5 s


## dynamic FCのデータが大きすぎるので、float16に変換

In [85]:
def transform_float(df):
    keys = list(df.columns)
    keys.remove('time')
    keys.remove('subID')
    values = ['float16']*len(keys)
    d = dict(zip(keys, values))
    df = df.astype(d)
    return df

In [86]:
%%time
df_dynamic_aal = transform_float(df_dynamic_aal)
df_dynamic_ho  = transform_float(df_dynamic_ho)

CPU times: user 9.82 s, sys: 22.3 ms, total: 9.84 s
Wall time: 9.85 s


## 外れ値除去

In [None]:
%%time
df_aal = df_aal[df_aal['subID'] != df_subjects_data[df_subjects_data['Age'] == 20]['subID'].iloc[0]]
df_ho  = df_ho[df_ho['subID'] != df_subjects_data[df_subjects_data['Age'] == 20]['subID'].iloc[0]]

In [40]:
%%time
df_dynamic_aal = df_dynamic_aal[df_dynamic_aal['subID'] != df_subjects_data[df_subjects_data['Age'] == 20]['subID'].iloc[0]]
df_dynamic_ho  = df_dynamic_ho[df_dynamic_ho['subID'] != df_subjects_data[df_subjects_data['Age'] == 20]['subID'].iloc[0]]

CPU times: user 1.34 s, sys: 1.55 s, total: 2.89 s
Wall time: 2.89 s


In [41]:
df_subjects_data = df_subjects_data[df_subjects_data['subID'] != df_subjects_data[df_subjects_data['Age'] == 20]['subID'].iloc[0]]

## Data 保存

In [45]:
%%time
df_aal.to_csv('../02_data_analysis/roi_timeseries/timeseries_aal.csv', index=False)
df_ho.to_csv('../02_data_analysis/roi_timeseries/timeseries_ho.csv', index=False)
df_labels_aal.to_csv('../02_data_analysis/roi_labels/labels_aal.csv', index=False)
df_labels_ho.to_csv('../02_data_analysis/roi_labels/labels_ho.csv', index=False)
df_subjects_data.to_csv('../02_data_analysis/subjects_info/subjects_info.csv', index=False)

CPU times: user 37.4 s, sys: 310 ms, total: 37.7 s
Wall time: 38.1 s


In [46]:
%%time
df_dynamic_aal.to_csv('../02_data_analysis/dynamic_FC/dynamic_aal.csv', index=False)

CPU times: user 10min 44s, sys: 2.6 s, total: 10min 46s
Wall time: 10min 48s


In [47]:
%%time
df_dynamic_ho.to_csv('../02_data_analysis/dynamic_FC/dynamic_ho.csv', index=False)

CPU times: user 9min 46s, sys: 2.26 s, total: 9min 48s
Wall time: 9min 50s


## Data load

In [48]:
%%time
df_aal = pd.read_csv('../02_data_analysis/roi_timeseries/timeseries_aal.csv')
df_ho = pd.read_csv('../02_data_analysis/roi_timeseries/timeseries_ho.csv')
df_labels_aal = pd.read_csv('../02_data_analysis/roi_labels/labels_aal.csv')
df_labels_ho = pd.read_csv('../02_data_analysis/roi_labels/labels_ho.csv')
df_subjects_data = pd.read_csv('../02_data_analysis/subjects_info/subjects_info.csv')

In [49]:
%%time
df_dynamic_aal = pd.read_csv('../02_data_analysis/dynamic_FC/dynamic_aal.csv')
df_dynamic_ho = pd.read_csv('../02_data_analysis/dynamic_FC/dynamic_ho.csv')

In [50]:
print(len(df_aal.subID.drop_duplicates()))
print(len(df_ho.subID.drop_duplicates()))
print(len(df_dynamic_ho.subID.drop_duplicates()))
print(len(df_dynamic_aal.subID.drop_duplicates()))
print(len(df_subjects_data.subID.drop_duplicates()))

615
615
615
615
615


In [52]:
df_aal.shape
df_ho.shape
df_dynamic_ho.shape
df_dynamic_aal.shape
df_subjects_data.shape

(86100, 118)

(86100, 112)

(75645, 5997)

(75645, 6672)

(615, 102)