##### This notebook generates the metadata and gene-counts files for VDAART patients

In [1]:
## Load libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pyreadr
import os

In [2]:
# Project root is the current working directory; generated files go in <cwd>/Data.
PATH = os.getcwd()
PATH_DATA = os.path.join(PATH, 'Data')

# Create the output folder if needed. exist_ok=True tolerates an existing
# directory but still raises if the path exists and is not a directory.
os.makedirs(PATH_DATA, exist_ok=True)


#### Load counts data

In [3]:
## Read the space-delimited merged counts table
counts_file = os.path.join(PATH, "vdaart", "merged_counts.txt")
counts = pd.read_csv(counts_file, sep=" ")

## Column headers are cut down to their first 6 characters
counts.columns = [header[:6] for header in counts.columns]

## Keep only the rows whose label begins with 'ENSG'
ind = [label for label in counts.index if label.startswith('ENSG')]

## Original note: id 110188 is repeated. Dropping the first data column
## leaves 443 patient columns.
counts = counts.loc[ind, :].iloc[:, 1:]

n_probes, n_patients = counts.shape
print('There are a total of {} patients'.format(n_patients))
print('Total probes:', n_probes)
counts.head()

There are a total of 443 patients
Total probes: 65988


Unnamed: 0,110188,111485,111733,112434,113223,113321,113402,114430,114469,114718,...,394540,394765,394806,394913,395879,396641,398087,399652,399773,399891
ENSG00000000003,1,1,2,3,2,4,0,3,6,2,...,1,9,6,2,2,10,2,2,1,6
ENSG00000000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000000419,61,78,182,79,32,49,13,89,42,83,...,38,56,99,30,45,136,33,87,86,79
ENSG00000000457,178,141,295,106,58,91,39,163,64,147,...,90,151,222,87,85,208,77,140,140,167
ENSG00000000460,80,52,148,34,21,47,28,84,51,62,...,38,112,63,30,37,107,42,56,46,101


#### Load metadata of experiments

In [4]:
## Vitamin D columns used from the trial metadata:
##   basevitdng (10-18 weeks) --> enrollment
##   wk32to38vitdng           --> third trimester
##   mayocbloodvitd           --> cord blood (use this instead of cbloodvitdng)
VITD_COLS = ['basevitdng', 'wk32to38vitdng', 'mayocbloodvitd']

## Single cut-off applied to all three vitamin D measurements
VITD_THRESHOLD = 25

meta = pd.read_csv(os.path.join(PATH, "vdaart", "vitdfinalDec2016.csv")).set_index(['vid'])
meta = meta[VITD_COLS]
meta.index = meta.index.astype(str)
## Keep only the patients present in the counts table, in the counts column order
meta = meta.loc[counts.columns]


### Add condition column based on Vitamin D
## np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0
meta['condition'] = np.nan

## If mother's vitamin D at enrollment and in the third trimester, and cord blood
## vitamin D, are all < threshold then low (0); if all >= threshold then high (1).
## Patients with mixed or missing values keep condition = NaN
## (comparisons against NaN evaluate to False).
vitd = meta[VITD_COLS]
is_high = (vitd >= VITD_THRESHOLD).all(axis=1)
is_low = (vitd < VITD_THRESHOLD).all(axis=1)
meta.loc[is_high, 'condition'] = 1  # 'high'
meta.loc[is_low, 'condition'] = 0   # 'low'


# print('Patients without cord blood values: ', meta[meta.condition.isna()].shape[0])
print('Low:', meta[meta.condition==0].shape[0], 'patients')
print('High:', meta[meta.condition==1].shape[0], 'patients')


Low: 95 patients
High: 97 patients


In [5]:
## Save metadata
# meta.to_csv(os.path.join(PATH_DATA, 'metadata.tsv'), sep='\t')
# meta.head()

- ##### Patients whose cord blood vitamin D, mother's baseline vitamin D, and third-trimester vitamin D are all < 25 are assigned condition Low (0)
- ##### Patients whose cord blood vitamin D, mother's baseline vitamin D, and third-trimester vitamin D are all >= 25 are assigned condition High (1)


In [6]:
# Report how many rows (patients) the metadata table holds
print('Number of patients: ', len(meta))

Number of patients:  443


#### Choose annotations of protein coding genes

In [7]:
# Annotations.tsv holds the human annotations saved from R;
# run download_annotations.ipynb first to produce it.
annot_file = os.path.join(PATH_DATA, 'Annotations.tsv')
annot = pd.read_csv(annot_file, sep='\t', low_memory=False)
print(annot.shape)


(68283, 8)


#### Save counts data

In [8]:
## Select the count columns in the same order as the metadata rows
counts = counts.loc[:, meta.index]

## Optional filter (currently disabled): remove genes with less than 10 reads
# counts = counts[counts.sum(axis = 1) > 10] 
gene_counts_file = os.path.join(PATH_DATA, 'gene_counts.tsv')
counts.to_csv(gene_counts_file, sep='\t')
print(counts.shape)


(65988, 443)


In [9]:
# Rows of the counts table are genes, columns are patients
n_genes, n_patients = counts.shape
print('There are a total of {} genes'.format(n_genes))
print('There are a total of {} patients'.format(n_patients))

There are a total of 65988 genes
There are a total of 443 patients


## Load phenotype

In [10]:
## Read the .rda file and pull out the object stored under the key 'asthmaDF'
rda_objects = pyreadr.read_r(os.path.join(PATH, 'vdaart/asthmaDF.rda'))
asthma_pheno = pd.DataFrame(rda_objects['asthmaDF']).set_index(['vid'])
asthma_pheno.index = asthma_pheno.index.astype(str)

## Restrict to the patients that have a column in the counts table
in_counts = asthma_pheno.index.isin(counts.columns)
asthma_pheno = asthma_pheno[in_counts]
# asthma_pheno.to_csv(os.path.join(PATH_DATA, 'phenotype.tsv'), sep='\t')
asthma_pheno.head()

Unnamed: 0_level_0,anyAsthma,recurrWheeze,asthmaWheeze,lowerbAsthma,upperbAsthma,pseudoTime,pseudoEvent
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110188,False,True,True,1.508556,1.7577,1.633128,1.0
111485,False,True,True,0.772074,1.026694,0.899384,1.0
111733,False,False,False,0.596851,,0.596851,0.0
112434,True,False,True,5.114305,5.281314,5.19781,1.0
113223,False,False,False,1.549624,,1.549624,0.0


In [11]:
## Load the per-child delivery / demographic table.
## Read it via PATH_DATA, like the other files in the Data folder, rather than
## a hard-coded 'Data/...' relative path that only works from the original cwd.
pheno = pd.read_csv(os.path.join(PATH_DATA, 'babyracen819.csv'))

## Index by patient id as a string so it lines up with meta.index
pheno['vid'] = pheno['vid'].astype(str)
pheno = pheno.set_index(['vid'])

## Keep only the patients in meta (same order) and the columns of interest
cols = ['deliverydate', 'site', 'bdeath','trmt', 'cgender', 'cethnicity', 'crace', 'craceeth']
pheno = pheno.loc[meta.index, cols]
pheno.head()

Unnamed: 0,deliverydate,site,bdeath,trmt,cgender,cethnicity,crace,craceeth
110188,03/09/2011,1.0,No,B,Female,Not Hispanic or Latino,Asian,Other
111485,01/17/2011,1.0,No,A,Male,Not Hispanic or Latino,"Black, African American",African American (Hispanic and Non-Hispanic)
111733,09/22/2010,1.0,No,B,Male,Not Hispanic or Latino,Asian,Other
112434,09/30/2010,1.0,No,A,Female,Hispanic or Latino,Asian,Other
113223,09/29/2010,1.0,No,B,Female,Not Hispanic or Latino,White,Caucasian (Non-Hispanic)


In [12]:
## Save metadata
## Drop phenotype columns left over from a previous run of this cell so that
## re-running it does not produce duplicated *_x / *_y columns.
meta = meta.drop(columns=pheno.columns, errors='ignore')
meta = pd.merge(meta, pheno, left_index=True, right_index=True)

## Clean the category labels.
## The patterns are regular expressions, so pass regex=True explicitly:
## pandas >= 2.0 treats str.replace patterns as literal text by default.
## Raw strings avoid invalid-escape warnings for '\(' etc.
meta['craceeth'] = meta['craceeth'].str.replace(r'\(|\)|\-| ', '', regex=True)
meta['cethnicity'] = meta['cethnicity'].str.replace(' ', '_', regex=False)
meta['crace'] = meta['crace'].str.replace(r' |\,', '_', regex=True)
# meta = pd.merge(meta, asthma_pheno['pseudoEvent'], left_index =  True, right_index = True)
meta.to_csv(os.path.join(PATH_DATA, 'metadata.tsv'), sep='\t')

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [13]:
## Display the merged metadata table
meta

Unnamed: 0,basevitdng,wk32to38vitdng,mayocbloodvitd,condition,deliverydate,site,bdeath,trmt,cgender,cethnicity,crace,craceeth
110188,33.10,13.3,7.9,,03/09/2011,1.0,No,B,Female,Not_Hispanic_or_Latino,Asian,Other
111485,22.10,44.6,22.0,,01/17/2011,1.0,No,A,Male,Not_Hispanic_or_Latino,Black__African_American,AfricanAmericanHispanicandNonHispanic
111733,16.40,30.3,18.0,,09/22/2010,1.0,No,B,Male,Not_Hispanic_or_Latino,Asian,Other
112434,7.94,54.5,25.0,,09/30/2010,1.0,No,A,Female,Hispanic_or_Latino,Asian,Other
113223,40.30,45.9,47.0,1.0,09/29/2010,1.0,No,B,Female,Not_Hispanic_or_Latino,White,CaucasianNonHispanic
...,...,...,...,...,...,...,...,...,...,...,...,...
396641,25.60,36.6,38.0,1.0,03/04/2011,3.0,No,A,Male,Not_Hispanic_or_Latino,Black__African_American,AfricanAmericanHispanicandNonHispanic
398087,16.70,35.0,26.0,,07/21/2010,3.0,No,B,Female,Not_Hispanic_or_Latino,Black__African_American,AfricanAmericanHispanicandNonHispanic
399652,25.10,28.9,18.0,,07/05/2011,3.0,No,B,Female,Not_Hispanic_or_Latino,White,CaucasianNonHispanic
399773,36.10,28.2,12.0,,02/13/2011,3.0,No,B,Female,Not_Hispanic_or_Latino,Black__African_American,AfricanAmericanHispanicandNonHispanic


In [14]:
## Tally patients per condition (value_counts leaves out NaN by default)
meta['condition'].value_counts()

1.0    97
0.0    95
Name: condition, dtype: int64