In [1]:
import pandas as pd
import numpy as np
import sys
from common import commons
home = commons.home
from features_preprocess import get_winid
import os
import re

In [2]:
def nearest_tss(tss,sites_df):
    merged = pd.merge(sites_df,tss,how='outer',on=['chr','coordinate'])
    merged.sort_values(['chr','coordinate'],inplace=True)
    merged.rename(columns={'strand':'before_tss'},inplace=True)
    merged.ix[merged['before_tss'].isnull()==False, 'before_tss'] = merged.ix[merged['before_tss'].isnull()==False,'coordinate']
    merged['after_tss'] = merged['before_tss']
    merged['before_tss'].fillna(method='ffill', inplace=True)
    merged['after_tss'].fillna(method='bfill',inplace=True)
    merged['dist_to_before_tss'] = np.abs(merged['coordinate']-merged['before_tss'])
    merged['dist_to_after_tss'] = np.abs(merged['coordinate']-merged['after_tss'])
    merged['tss'] = None
    before_ix = (merged['dist_to_before_tss'] < merged['dist_to_after_tss']) | (merged['dist_to_after_tss'].isnull())
    merged.ix[before_ix,'tss'] = merged.ix[before_ix,'before_tss']
    after_ix = (merged['dist_to_before_tss'] >= merged['dist_to_after_tss']) | (merged['dist_to_before_tss'].isnull())
    merged.ix[after_ix,'tss'] = merged.ix[after_ix,'after_tss']
    merged['dist_to_nearest_tss'] = np.abs(merged['coordinate']-merged['tss']) 
    merged.drop(['before_tss','after_tss','tss','dist_to_before_tss','dist_to_after_tss'],axis=1,inplace=True)
    merged.dropna(axis=0,subset=['id'],inplace=True)
    return merged

def rename_features(x):   #rename repetitive features
    features = np.array(x.columns)
    features_count = pd.Series(index=x.columns.unique())
    features_count = features_count.fillna(int(0))
    for i,name in enumerate(x.columns):
        if features_count[name] == 0:
            features_count[name] += 1
        else:
            features[i] = name+str(features_count[name])
            features_count[name] += 1
    x.columns = features
    return 

In [3]:
dataset = 'Cd'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type
with pd.HDFStore(home+'data/'+dataset+'/all_sites_winid','r') as h5s:
    all_sites = h5s['all_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [4]:
all_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,start,winid,end
0,cg23440882,1,875880,0.0238,0.358200,0.030570,0,875801,4380,876000
1,cg24685837,1,982225,0.0459,0.169100,0.604002,0,982201,4912,982400
2,cg02494066,1,983386,0.0421,0.238000,0.587702,0,983201,4917,983400
3,cg20685419,1,1007730,0.0436,0.229800,0.443294,0,1007601,5039,1007800
4,cg00305285,1,1017115,0.0044,0.926000,0.722598,0,1017001,5086,1017200
5,cg15207999,1,1021210,0.0071,0.810100,0.802394,0,1021201,5107,1021400
6,cg05929553,1,1086836,0.0199,0.577400,0.835167,0,1086801,5435,1087000
7,cg07115976,1,1155731,-0.1745,0.000138,0.906688,-1,1155601,5779,1155800
8,cg00211609,1,1178039,0.0233,0.451700,0.402827,0,1178001,5891,1178200
9,cg02136596,1,1384930,0.0116,0.787200,0.776592,0,1384801,6925,1385000


In [5]:
feature_dir = home+'data/features/'+dataset+'/'
files = os.listdir(feature_dir)
pattern = '.*all.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]




In [6]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

303
31
267
317
735
73
80


In [7]:
rename_features(all_sites)

In [8]:
all_sites.shape

(3008, 1816)

In [9]:
columns = all_sites.columns.values

In [10]:
columns = all_sites.columns.values
pattern = re.compile(r'.*WGBS.*')
i=0
for col in columns:
    if len(pattern.findall(col))>0:
        print(col)
        i += 1
print(i)

0


In [11]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [12]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [13]:
all_sites.ix[:,:1950]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,winid,A549-BHLHE40,A549-CEBPB,...,ENCFF723ZMR_RNASeq_counts,ENCFF301ROZ_RNASeq_counts,ENCFF888ZFS_RNASeq_counts,ENCFF105THO_RNASeq_counts,ENCFF760IDU_RNASeq_counts,ENCFF624VBI_RNASeq_counts,ENCFF552FTX_RNASeq_counts,ENCFF623UTC_RNASeq_counts,ENCFF535JQR_RNASeq_counts,ENCFF003JVR_WGBS_counts
0,cg08750554,1,1005100.0,0.460842,6.451141e-01,0.221929,0,5026,2,3,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
1,cg13362546,1,1102960.0,0.582537,5.604699e-01,0.780087,0,5515,3,0,...,0.0,0.0,1.0,0.0,1.0,19.0,0.0,6.0,2.0,0.0
2,cg22373622,1,1489544.0,4.490939,8.830000e-06,0.918745,1,7448,2,0,...,0.0,5.0,5.0,13.0,16.0,26.0,7.0,8.0,15.0,19.0
3,cg16926213,1,1841314.0,5.065649,5.750000e-07,0.372401,1,9207,4,3,...,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,30.0,12.0
4,cg12656307,1,2349734.0,0.861177,3.895565e-01,0.401944,0,11749,5,1,...,0.0,0.0,0.0,2.0,2.0,4.0,2.0,2.0,6.0,6.0
5,cg00373616,1,2434487.0,0.220757,8.253727e-01,0.769612,0,12173,2,0,...,1.0,0.0,0.0,1.0,0.0,163.0,0.0,5.0,0.0,4.0
6,cg25618424,1,2989307.0,0.507053,6.123429e-01,0.509518,0,14947,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
7,cg09119863,1,3129238.0,0.435539,6.633605e-01,0.697770,0,15647,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
8,cg01079872,1,3302477.0,0.678813,4.975729e-01,0.904952,0,16513,0,0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
9,cg00571809,1,3397113.0,0.234530,8.146707e-01,0.947405,0,16986,3,1,...,39.0,1.0,0.0,0.0,0.0,31.0,1.0,4.0,3.0,8.0


In [14]:
#nearest tss distance    
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [15]:
with pd.HDFStore(home+'data/'+dataset+'/all_features','w') as h5s:
    h5s['all_features'] = all_sites

In [9]:
###all 450K sites features, only need to RUN ONCE
dataset = 'Cd'
feature_dir = home+'data/features/'+dataset+'/'
all_450_features = home+'data/'+dataset+'/all_450k_features'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type


In [10]:
with pd.HDFStore(home+'data/'+dataset+'/all_450k_sites_winid','r') as h5s:
    all_sites = h5s['all_450k_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [11]:
all_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,start,winid,end
0,cg13869341,1,15865,0.0426,0.297000,0.862405,15801,80,16000
1,cg24669183,1,534242,-0.0561,0.421300,0.803156,534201,2672,534400
2,cg15560884,1,710097,-0.0058,0.788800,0.662852,710001,3551,710200
3,cg01014490,1,714177,-0.0243,0.375500,0.016537,714001,3571,714200
4,cg17505339,1,720865,-0.0853,0.027480,0.967550,720801,3605,721000
5,cg05898754,1,805102,0.0182,0.659200,0.177819,805001,4026,805200
6,cg03128332,1,805338,0.0092,0.882000,0.045307,805201,4027,805400
7,cg16619049,1,805541,0.0007,0.991300,0.283757,805401,4028,805600
8,cg05475702,1,812248,0.0401,0.038670,0.472167,812201,4062,812400
9,cg18147296,1,812539,-0.0007,0.976800,0.311583,812401,4063,812600


In [12]:
feature_dir = home+'data/features/'+dataset.split('/')[0]+'/'
files = os.listdir(feature_dir)
pattern = '.*all_450k.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]

In [13]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

317
31
735
303
73
267
80


In [14]:
rename_features(all_sites)

In [15]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [16]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'all_450k_addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [18]:
#nearest tss distance 
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [19]:
all_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,winid,A549-ATF3,A549-BCL3,A549-CEBPB,...,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts,dist_to_nearest_tss
0,cg13869341,1,15865,0.0426,0.297000,0.862405,80.0,4.0,11.0,0.0,...,0.0,13.0,0.0,7.0,0.0,0.0,11.0,3.0,0.0,3991
1,cg24669183,1,534242,-0.0561,0.421300,0.803156,2672.0,6.0,11.0,1.0,...,0.0,16.0,14.0,4.0,9.0,2.0,9.0,12.0,12.0,87792
2,cg15560884,1,710097,-0.0058,0.788800,0.662852,3551.0,3.0,3.0,0.0,...,3.0,72.0,74.0,32.0,83.0,38.0,18.0,62.0,21.0,3971
3,cg01014490,1,714177,-0.0243,0.375500,0.016537,3571.0,24.0,24.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109
4,cg17505339,1,720865,-0.0853,0.027480,0.967550,3605.0,2.0,0.0,0.0,...,7.0,5.0,28.0,17.0,35.0,23.0,30.0,21.0,16.0,6797
5,cg05898754,1,805102,0.0182,0.659200,0.177819,4026.0,6.0,5.0,5.0,...,1.0,7.0,5.0,3.0,3.0,0.0,8.0,2.0,2.0,7080
6,cg03128332,1,805338,0.0092,0.882000,0.045307,4027.0,13.0,10.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6844
7,cg16619049,1,805541,0.0007,0.991300,0.283757,4028.0,9.0,6.0,0.0,...,2.0,37.0,6.0,2.0,7.0,0.0,0.0,6.0,2.0,6641
8,cg05475702,1,812248,0.0401,0.038670,0.472167,4062.0,6.0,3.0,0.0,...,1.0,23.0,29.0,21.0,23.0,11.0,3.0,14.0,5.0,66
9,cg18147296,1,812539,-0.0007,0.976800,0.311583,4063.0,5.0,4.0,1.0,...,4.0,61.0,18.0,12.0,12.0,0.0,5.0,15.0,6.0,357


In [20]:
with pd.HDFStore(all_450_features,'w') as h5s:
    h5s['all_450k_features'] = all_sites.drop(['pvalue','beta'],axis=1)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['id', 'dist_to_nearest_tss']]

  exec(code_obj, self.user_global_ns, self.user_ns)
