# Data Separation iPython
ADNI data separation for the SP8Bundle use case

In [31]:
cd ~/development/Sp8Bundle/

/home/torcato/development/Sp8Bundle


In [32]:
ls

[0m[01;34mADNI[0m/  [01;31mADNI.zip[0m  [01;34mBrescia[0m/  [01;31mBrescia.zip[0m


In [33]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import datetime
import time
import os
import glob
import csv

%matplotlib inline

In [34]:
cd ADNI/ADNI/Output/

/home/torcato/development/Sp8Bundle/ADNI/ADNI/Output


In [35]:
ls

[0m[01;34madni0[0m/           brain_feature.csv      diagnostic_codes.csv  genetic_set.csv
[01;34madni1[0m/           brain_feature_set.csv  diagnostic.csv        genetic_snp.csv
[01;34madni2[0m/           brain_region.csv       exam_measurement.csv  patient.csv
brain_atlas.csv  brain_scan.csv         exam_value.csv


In [36]:
print glob.glob("*.csv")

['brain_feature_set.csv', 'genetic_set.csv', 'patient.csv', 'brain_feature.csv', 'brain_scan.csv', 'exam_value.csv', 'brain_region.csv', 'genetic_snp.csv', 'exam_measurement.csv', 'brain_atlas.csv', 'diagnostic.csv', 'diagnostic_codes.csv']


## Opens all files in the input folder

In [38]:
data_in = {}

for f in glob.glob("*.csv"):
    basename = os.path.basename(f)
    name =basename[:-4] 
    data_in[name] = pd.read_csv(f)

## Correcting exam_values
Selects all values that are string or finite

corrects all baseline values

Adds age, education and gender 

In [39]:
exams=data_in['exam_value']
print "start length" , len(exams)
exams=exams[exams.value.map( lambda x :  isinstance(x, str) or np.isfinite(x) )]
exams=exams[exams.variable_name.map( lambda x: not x.endswith('_bl') )]
print "cleaned lenght", len(exams)

start length 813685
cleaned lenght 205359


In [40]:
def create_exam_value(var_name, ids, values,  dates):
    n = len(ids)
    df = pd.DataFrame()
    columns = ['patient_id',
             'exam_measurement_id',
             'value',
             'variable_name',
             'status',
             'exam_date',
             'extracted_from',
             'extraction_method',
             'extraction_method_version',
             'anonymization_method',
             'anonymization_method_version',
             'description',
             'record_creation']

    for name in columns:
        if name == 'patient_id':
            df[name]= ids
        elif name == 'exam_measurement_id' or name == 'variable_name':
            df[name]=[var_name]*n
        elif name == 'value':
            df[name]=values
        elif name == 'extracted_from':
            df[name]=['ADNI']*n    
        elif name == 'record_creation':
            df[name] = [datetime.datetime.now()] *n
        elif name == 'exam_date':
            df[name] = dates
        else:
            df[name]=['null']*n
    return df

In [41]:
adnimerge = pd.read_csv("../InputFIles/ADNIMERGE.csv")
adni_bl = adnimerge[adnimerge.VISCODE == 'bl']
adni_bl.RID = adni_bl.RID.map(lambda x: "adni_%d" % x)
# adds an APOE4_bl
adni_bl['APOE4_bl'] = adni_bl['APOE4'] 
create_exam_value('Age', adni_bl.RID, adni_bl.AGE, adni_bl.EXAMDATE_bl ).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,patient_id,exam_measurement_id,value,variable_name,status,exam_date,extracted_from,extraction_method,extraction_method_version,anonymization_method,anonymization_method_version,description,record_creation
0,adni_2,Age,74.3,Age,,2005-09-08,ADNI,,,,,,2015-12-10 10:49:17.636915
1,adni_3,Age,81.3,Age,,2005-09-12,ADNI,,,,,,2015-12-10 10:49:17.636915
5,adni_4,Age,67.5,Age,,2005-11-08,ADNI,,,,,,2015-12-10 10:49:17.636915
10,adni_5,Age,73.7,Age,,2005-09-07,ADNI,,,,,,2015-12-10 10:49:17.636915
15,adni_6,Age,80.4,Age,,2005-11-29,ADNI,,,,,,2015-12-10 10:49:17.636915


In [42]:
# all baseline columns
cols = [ col for col in adni_bl.columns if col.endswith('_bl')]
cols.append('AGE')
cols.append('PTGENDER')
cols.append('PTEDUCAT')


for name in cols:
    df = create_exam_value(name, adni_bl.RID, adni_bl[name], adni_bl.EXAMDATE_bl )
    exams=exams.append(df)

In [43]:
data_in['exam_value'] = exams

len(data_in['exam_value'] )

278313

## Divides patients in n partitions

In [44]:
data_out = []
partitions = 3
length = len(data_in['patient'])/partitions

for n in range(partitions):
    patient = data_in['patient'][n*length:(n+1)*length]
    data_out.append(dict(patient=patient))
    
# appends the rest if there is any
patient = data_in['patient'][(n+1)*length:]
data_out[n]['patient']=data_out[n]['patient'].append(patient)

## From here on will separate the data in the other files by patient partitions

In [45]:
data_in.keys()

['exam_measurement',
 'patient',
 'brain_feature',
 'diagnostic_codes',
 'genetic_set',
 'brain_feature_set',
 'brain_atlas',
 'exam_value',
 'brain_scan',
 'brain_region',
 'diagnostic',
 'genetic_snp']

In [46]:
def by_pid(din, dout, name):
    """Creates new dataframe in dout from din[name] with the corresponding patients"""
    ids = list(dout['patient'].id)
    criteria = din[name].patient_id.map(lambda x: x in ids)
    dout[name]= din[name][criteria]

In [47]:
tables = ['brain_feature',         
         'genetic_set',
         'brain_feature_set',         
         'exam_value',                 
         'diagnostic',
         'genetic_snp']

for dout in data_out:
    for t in tables:
        by_pid(data_in, dout, t) 

In [48]:
# these are common tables
tables = ['diagnostic_codes',         
         'brain_atlas',         
         'brain_region']

for dout in data_out:
    for t in tables:
        dout[t] = data_in[t]

In [49]:
dout.keys()

['patient',
 'diagnostic_codes',
 'genetic_set',
 'diagnostic',
 'brain_atlas',
 'exam_value',
 'brain_feature',
 'brain_region',
 'brain_feature_set',
 'genetic_snp']

## Saves all the data

In [50]:
start = time.time()
for n,data in enumerate(data_out):
    partial_time = time.time()
    directory = "adni%d" % n
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for name in data:
        filename = 'adni%d/%s.csv' %( n, name)
        print "file %s with %d rows " % (filename, len(data[name]))
        data[name].to_csv( filename, index=False)
    print "partial time", time.time()-partial_time,"total time", time.time()-start

file adni0/patient.csv with 579 rows 
file adni0/diagnostic_codes.csv with 13413 rows 
file adni0/genetic_set.csv with 131 rows 
file adni0/diagnostic.csv with 3984 rows 
file adni0/brain_atlas.csv with 1 rows 
file adni0/exam_value.csv with 93194 rows 
file adni0/brain_feature.csv with 15196 rows 
file adni0/brain_region.csv with 116 rows 
file adni0/brain_feature_set.csv with 131 rows 
file adni0/genetic_snp.csv with 77000 rows 
partial time 0.639266014099 total time 0.639656066895
file adni1/patient.csv with 579 rows 
file adni1/diagnostic_codes.csv with 13413 rows 
file adni1/genetic_set.csv with 112 rows 
file adni1/diagnostic.csv with 3790 rows 
file adni1/brain_atlas.csv with 1 rows 
file adni1/exam_value.csv with 92278 rows 
file adni1/brain_feature.csv with 12992 rows 
file adni1/brain_region.csv with 116 rows 
file adni1/brain_feature_set.csv with 112 rows 
file adni1/genetic_snp.csv with 73000 rows 
partial time 0.610282897949 total time 1.24999213219
file adni2/patient.csv 

In [51]:
print "Total number of subjects", len(data_in['patient'])
for n,d in enumerate(data_out):
    print "daset %d with %d subjects" %(n, len(d['patient']))

Total number of subjects 1737
daset 0 with 579 subjects
daset 1 with 579 subjects
daset 2 with 579 subjects


In [59]:
for v in data_out[0]['exam_value'].variable_name.unique():
    print v


ADAS11
ADAS13
APOE
CDR
DX
Entorhinal
FAQ
FDG
Fusiform
Hippocampus
ICV
MMSE
MidTemp
PIB
RAVLT_forgetting
RAVLT_immediate
RAVLT_learning
RAVLT_perc_forgetting
Ventricles
WholeBrain
AV45
EcogPtDivatt
EcogPtLang
EcogPtMem
EcogPtOrgan
EcogPtPlan
EcogPtTotal
EcogPtVisspat
EcogSPDivatt
EcogSPLang
EcogSPMem
EcogSPOrgan
EcogSPPlan
EcogSPTotal
EcogSPVisspat
MOCA
DX_bl
EXAMDATE_bl
CDRSB_bl
ADAS11_bl
ADAS13_bl
MMSE_bl
RAVLT_immediate_bl
RAVLT_learning_bl
RAVLT_forgetting_bl
RAVLT_perc_forgetting_bl
FAQ_bl
Ventricles_bl
Hippocampus_bl
WholeBrain_bl
Entorhinal_bl
Fusiform_bl
MidTemp_bl
ICV_bl
MOCA_bl
EcogPtMem_bl
EcogPtLang_bl
EcogPtVisspat_bl
EcogPtPlan_bl
EcogPtOrgan_bl
EcogPtDivatt_bl
EcogPtTotal_bl
EcogSPMem_bl
EcogSPLang_bl
EcogSPVisspat_bl
EcogSPPlan_bl
EcogSPOrgan_bl
EcogSPDivatt_bl
EcogSPTotal_bl
FDG_bl
PIB_bl
AV45_bl
Years_bl
Month_bl
APOE4_bl
AGE
PTGENDER
PTEDUCAT
