In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Locations of the ROI volume table and of the train/validation label files.
data_filename = 'data/btsv01.txt'
labels_train_filename = 'data/training_fluid_intelligenceV1.csv'
labels_val_filename = 'data/validation_fluid_intelligenceV1.csv'

# The ROI table is whitespace-delimited. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which pandas deprecated in 2.2.
rois = pd.read_csv(data_filename, sep=r'\s+')

# The label files are ordinary comma-separated tables.
labels_train = pd.read_csv(labels_train_filename, sep=',')
labels_val = pd.read_csv(labels_val_filename, sep=',')

# Report the (rows, columns) of everything that was just loaded.
print("Dataset shape: {}".format(rois.shape))
print("Labels train shape: {}".format(labels_train.shape))
print("Labels val shape: {}".format(labels_val.shape))

Dataset shape: (7803, 131)
Labels train shape: (3739, 2)
Labels val shape: (415, 2)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# The first row of the raw table carries a textual description of every
# column instead of measurements; keep it aside for reference and show it.
columns_description = rois.iloc[0, :]
print(columns_description)

# Discard that description row so that only data rows remain.
rois_clean = rois.drop(rois.index[:1], axis=0)

collection_id                                                      collection_id
btsv01_id                                                              btsv01_id
dataset_id                                                            dataset_id
subjectkey                     The NDAR Global Unique Identifier (GUID) for r...
src_subject_id                        Subject ID how it's defined in lab/project
interview_date                 Date on which the interview/genetic test/sampl...
interview_age                  Age in months at the time of the interview/tes...
gender                                                        Sex of the subject
sri24precentrallgm                      left precentral gyrus gray matter volume
sri24precentralrgm                     right precentral gyrus gray matter volume
sri24frontalsuplgm             left superior frontal gyrus, dorsolater gray m...
sri24frontalsuprgm             right superior frontal gyrus, dorsolater gray ...
sri24frontalsuporblgm       

In [4]:
# Report the table dimensions after the description row was removed.
print("Number of rows: {}\nNumber of columns: {}".format(*rois_clean.shape))

Number of rows: 7802
Number of columns: 131


Here, we will explore the types of columns to see whether we can skip anything and to get an overview of the data. 
- btsv01_id - all values are unique (one per row), so they tell us nothing; they are purely descriptive 
- interview_date - the date when the interview was performed - also useless 
- collection_id / dataset_id - identification of the collection/dataset - useless
- collection_title - useless

In general, we have one row per participant.

In [5]:
# Count the distinct training-label subjects that also occur as a subjectkey.
len(set(labels_train['subject']) & set(rois_clean['subjectkey']))

3739

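The count above (3739) equals the number of rows in the training labels, so every subject with a residual fluid intelligence score also appears in the 'subjectkey' column: the labelled subjects are a subset of the ROI data.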
We are constructing a training set, therefore in the training set we will only consider those entries that also have labels. <br>
Furthermore, we are eliminating the following columns: 
- btsv01_id
- interview_date
- collection_id
- dataset_id
- collection_title
- src_subject_id (which is the repetition of subjectkey, but with lowercase letters (for some mysterious reason))

In [6]:
# Remove the identifier / bookkeeping columns that are not used as features.
# NOTE: this reassigns rois_clean, so running the cell a second time raises
# KeyError because the columns are already gone.
rois_clean = rois_clean.drop(
    columns=[
        'btsv01_id',
        'interview_date',
        'collection_id',
        'dataset_id',
        'collection_title',
        'src_subject_id',
    ]
)

In [7]:
# Encode the gender strings as integers: 'M' -> 0, 'F' -> 1.
gender_to_int = {'M': 0, 'F': 1}

# Plain dict lookup per element, so any value other than 'M' or 'F'
# (including an already-encoded 0/1 on a re-run) raises KeyError.
rois_clean['gender'] = rois_clean['gender'].apply(lambda code: gender_to_int[code])

In [8]:
# Keep only the ROI rows whose subjectkey is listed in the respective label
# file: one selection for training, one for validation.
train_data = rois_clean.loc[
    rois_clean['subjectkey'].isin(labels_train['subject'])
]
val_data = rois_clean.loc[
    rois_clean['subjectkey'].isin(labels_val['subject'])
]

In [9]:
# Attach the label columns to the ROI rows by matching subjectkey against the
# label file's 'subject' column (inner join, the pandas default), then drop
# the now-redundant 'subject' key.
train_data_labels = train_data.merge(
    labels_train, how='inner', left_on='subjectkey', right_on='subject'
).drop(columns=['subject'])
val_data_labels = val_data.merge(
    labels_val, how='inner', left_on='subjectkey', right_on='subject'
).drop(columns=['subject'])

In [10]:
# Sanity check on the merged training table: (rows, columns). The row count
# should equal the number of training labels if each labelled subject matched
# exactly one ROI row -- TODO confirm that subjectkey is unique.
train_data_labels.shape

(3739, 126)

In [21]:
# Build the feature matrices (data only, no labels).
# Column 0 is the string identifier (subjectkey) and the last column is the
# label, so both are sliced away with [:, 1:-1]. The remaining columns were
# read as strings and are converted to floats here.
# `np.float64` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError); the resulting dtype is the
# same float64 as before.
train_data_matrix = train_data_labels.values[:, 1:-1].astype(np.float64)
val_data_matrix = val_data_labels.values[:, 1:-1].astype(np.float64)

In [23]:
# Final form: train_data / val_data are the feature matrices and
# train_labels / val_labels are the residual scores.
# NOTE: train_data and val_data previously named DataFrames; from this cell
# onwards they refer to NumPy arrays.
train_data = train_data_matrix
val_data = val_data_matrix

# The label is the last column of the merged tables. `np.float64` replaces
# the `np.float` alias that was removed in NumPy 1.24; the dtype is unchanged.
train_labels = train_data_labels.values[:, -1].astype(np.float64)
val_labels = val_data_labels.values[:, -1].astype(np.float64)

In [25]:
# Number of training labels (length of the 1-D label array).
train_labels.shape[0]

3739