# Some preprocessing on UKBioBank  
For easy loading, a single folder should contain all the data, e.g.,  
* DataFold//  
  subject1.txt  
  subject2.txt  
  ...

In [1]:
from pandas import read_csv
import numpy as np
import os

## 1. Data Processing

In [2]:
def triu_mask(n):
    """Return an n*n boolean mask that is True strictly above the diagonal."""
    # np.tri flags the lower triangle *including* the diagonal; negating it
    # leaves exactly the entries whose column index exceeds the row index.
    lower_with_diag = np.tri(n, dtype=bool)
    return ~lower_with_diag

def compute_tc_fc_fcu(fn, good_components, mask):
    """Load one data file and derive the correlation features from it.

    Parameters
    ----------
    fn : str
        Path of a text file readable by ``np.loadtxt``; it must load as a
        2-D array whose columns are the components.
    good_components : array-like
        Zero-based column indices to keep. Float-valued indices (e.g. the
        result of ``np.loadtxt(...) - 1``) are rounded and cast to integers,
        because NumPy refuses float arrays as indices.
    mask : boolean array
        Mask applied to the correlation matrix, e.g. ``triu_mask(k)`` where
        ``k`` is the number of kept components.

    Returns
    -------
    tuple
        ``(tc100, tc55, fc55, fcu55)``: the full loaded array, the kept
        columns, the Pearson correlation matrix between the kept columns,
        and the entries of that matrix selected by ``mask``.
    """
    tc100 = np.loadtxt(fn)  # original data

    columns = np.asarray(good_components)
    if np.issubdtype(columns.dtype, np.floating):
        # Round before casting so values such as 2.9999999 do not truncate to 2.
        columns = np.rint(columns).astype(np.intp)

    tc55 = tc100[:, columns]
    # np.corrcoef treats rows as variables, hence the transpose.
    fc55 = np.corrcoef(tc55.T)  # Pearson's correlation
    fcu55 = fc55[mask]  # upper triangular part when mask comes from triu_mask
    return tc100, tc55, fc55, fcu55

In [3]:
# Example: triu_mask picks the entries strictly above the diagonal,
# which boolean indexing then returns in row-major order.
x = np.arange(16).reshape(4, 4)
mask = triu_mask(len(x))
print(x)
print(x[mask])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[ 1  2  3  6  7 11]


#### Note,
* $\textbf{sex}$ is an $n \times 1$ vector: 0 means woman, 1 means man; some subjects are missing this information, which is represented by $\textbf{nan}$  
* $\textbf{age}$ is an $n \times 3$ matrix (I guess the age of a subject may be updated, with the updates stored in the last two columns)  

Let's see an example

In [4]:
def f_nan(x):
    """Convert ``x`` to ``int``; a value whose ``str()`` is 'nan' becomes -1."""
    return -1 if str(x) == 'nan' else int(x)


def save_sex_age(path, ids, inf):
    """Write ``sex.txt`` and ``age.txt`` for the subjects in ``ids``.

    Parameters
    ----------
    path : str
        Existing directory that receives ``sex.txt`` and ``age.txt``.
    ids : iterable
        Subject ids, in the order the output rows should appear. They are
        compared with ``inf.eId`` as strings, so ids parsed from file names
        (``str``) match numeric ids read from a CSV.
    inf : object
        Table exposing ``eId``, ``sex`` and ``age`` attributes that each have
        ``.values`` (e.g. the DataFrame returned by ``read_csv``).

    Returns
    -------
    list
        The ids that were found in ``inf``, in output order. Ids missing from
        ``inf`` are skipped, so row ``k`` of both files belongs to the
        ``k``-th returned id.
    """
    total_ids = np.asarray(inf.eId.values).astype(str)
    sex = inf.sex.values
    age = inf.age.values

    y_sex, y_age, found = [], [], []
    for i in ids:
        pos = np.flatnonzero(total_ids == str(i))  # find position
        if not pos.size:  # subject not present in the information table
            continue
        first = pos[0]
        y_sex.append(f_nan(sex[first]))  # woman: 0, man: 1, nan: -1
        # A subject may have one age or a row of ages; keep the largest,
        # i.e. the newest, with missing entries counting as -1.
        y_age.append(max(map(f_nan, np.atleast_1d(age[first]))))
        found.append(i)

    np.savetxt(os.path.join(path, 'sex.txt'), y_sex, fmt='%d')
    np.savetxt(os.path.join(path, 'age.txt'), y_age, fmt='%d')
    return found

In [5]:
# sex: print each value after f_nan conversion (nan -> -1)
sex = [0, 1, 'nan']
print(*map(f_nan, sex), sep='\n')
print()

# age: one row per subject; print the largest converted value of each row
age = np.array([
    [45, 'nan', 'nan'],
    [48, 53, 'nan'],
    [47, 52, 70],
])
print(*(max(map(f_nan, row)) for row in age), sep='\n')
print()

0
1
-1

45
53
70



In [None]:
# Shift the component numbers so they start from 0 (NumPy indexing), then cast
# to int: np.loadtxt returns floats, and NumPy rejects float arrays as indices.
good_components = (np.loadtxt(path_to_good_componets) - 1).astype(int)
inf = read_csv(path_to_information) # UK information file
mask = triu_mask(55)

def process(path1, path2, good_components, inf, mask):
    """Extract each subject's data from path1 and save it, renamed, under path2.

    For every entry ``rf`` of ``path1`` that contains
    ``rfMRI/rfMRI_100.dr/dr_stage1.txt``, the four arrays returned by
    ``compute_tc_fc_fcu`` are written to ``path2/<type>/<sub_id>.txt`` with
    ``<type>`` in ``tc100``, ``tc55``, ``fc55``, ``fcu55``. ``sub_id`` is the
    part of ``rf`` before the first underscore. Finally ``save_sex_age`` is
    called with the processed ids.

    Parameters
    ----------
    path1 : str
        Directory that includes all subjects (one entry per subject).
    path2 : str
        Output directory; the per-type subdirectories are created if missing.
    good_components, mask
        Forwarded unchanged to ``compute_tc_fc_fcu``.
    inf
        Information table forwarded to ``save_sex_age``.
    """
    types = ['tc100', 'tc55', 'fc55', 'fcu55']
    # Must be relative: os.path.join throws away every earlier component when
    # a later one is absolute, which would ignore path1 and the subject entry.
    target = os.path.join('rfMRI', 'rfMRI_100.dr', 'dr_stage1.txt')

    # np.savetxt does not create directories, so make them up front.
    for t in types:
        os.makedirs(os.path.join(path2, t), exist_ok=True)

    sub_ids = []
    # os.listdir order is arbitrary; sorting keeps the label rows reproducible.
    for rf in sorted(os.listdir(path1)):  # path1 includes all subjects
        fn = os.path.join(path1, rf, target)
        if not os.path.exists(fn):
            continue
        sub_id = rf.split('_')[0]  # id
        sub_ids.append(sub_id)
        data = compute_tc_fc_fcu(fn, good_components, mask)
        for t, d in zip(types, data):
            to_path = os.path.join(path2, t, '{}.txt'.format(sub_id))
            np.savetxt(to_path, d, fmt='%.10e')
    save_sex_age(path2, sub_ids, inf)

## 2. Load Data  
Why do I do this? Suppose you want to perform age classification, and you have:  
* a folder that contains all the files 
* sub_ids and their corresponding  
* ages  

then, based on the subject IDs, one can load the corresponding data so that data and labels are matched. Otherwise, errors may occur (e.g., loading the wrong age).

In [6]:
# Toy stand-in for the real data: one random 14-value row per label
# (the original note here reads "load txt").
labels = [1, 1, 0, 0]
data = np.vstack([np.random.randn(14) for _ in labels])

In [7]:
# Pair every data row with its label: a list of (row, label) tuples.
data = list(zip(data, labels))