In [1]:
import pandas as pd
import numpy as np
import sys
from common import commons
home = commons.home
from features_preprocess import get_winid
import os
import re

In [2]:
def nearest_tss(tss,sites_df):
    merged = pd.merge(sites_df,tss,how='outer',on=['chr','coordinate'])
    merged.sort_values(['chr','coordinate'],inplace=True)
    merged.rename(columns={'strand':'before_tss'},inplace=True)
    merged.ix[merged['before_tss'].isnull()==False, 'before_tss'] = merged.ix[merged['before_tss'].isnull()==False,'coordinate']
    merged['after_tss'] = merged['before_tss']
    merged['before_tss'].fillna(method='ffill', inplace=True)
    merged['after_tss'].fillna(method='bfill',inplace=True)
    merged['dist_to_before_tss'] = np.abs(merged['coordinate']-merged['before_tss'])
    merged['dist_to_after_tss'] = np.abs(merged['coordinate']-merged['after_tss'])
    merged['tss'] = None
    before_ix = (merged['dist_to_before_tss'] < merged['dist_to_after_tss']) | (merged['dist_to_after_tss'].isnull())
    merged.ix[before_ix,'tss'] = merged.ix[before_ix,'before_tss']
    after_ix = (merged['dist_to_before_tss'] >= merged['dist_to_after_tss']) | (merged['dist_to_before_tss'].isnull())
    merged.ix[after_ix,'tss'] = merged.ix[after_ix,'after_tss']
    merged['dist_to_nearest_tss'] = np.abs(merged['coordinate']-merged['tss']) 
    merged.drop(['before_tss','after_tss','tss','dist_to_before_tss','dist_to_after_tss'],axis=1,inplace=True)
    merged.dropna(axis=0,subset=['id'],inplace=True)
    return merged

def rename_features(x):   #rename repetitive features
    features = np.array(x.columns)
    features_count = pd.Series(index=x.columns.unique())
    features_count = features_count.fillna(int(0))
    for i,name in enumerate(x.columns):
        if features_count[name] == 0:
            features_count[name] += 1
        else:
            features[i] = name+str(features_count[name])
            features_count[name] += 1
    x.columns = features
    return 

In [3]:
dataset = 'AD_CpG'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type
with pd.HDFStore(home+'data/'+dataset+'/all_sites_winid','r') as h5s:
    all_sites = h5s['all_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [4]:
all_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,start,winid,end
0,cg27573606,1,838379.0,0.685193,0.493542,0.965170,0,838201,4192,838400
1,cg04121631,1,875345.0,-0.680762,0.496340,0.142312,0,875201,4377,875400
2,cg16043700,1,1114523.0,0.154428,0.877335,0.702867,0,1114401,5573,1114600
3,cg24807889,1,1135955.0,0.861507,0.389375,0.744596,0,1135801,5680,1136000
4,cg24470133,1,1483206.0,0.028189,0.977523,0.960068,0,1483201,7417,1483400
5,cg25031205,1,1688976.0,0.389879,0.696794,0.894453,0,1688801,8445,1689000
6,cg01520454,1,2304375.0,0.263077,0.792600,0.781696,0,2304201,11522,2304400
7,cg07141317,1,2327775.0,0.386512,0.699283,0.853852,0,2327601,11639,2327800
8,cg08655953,1,2425888.0,0.891032,0.373344,0.968990,0,2425801,12130,2426000
9,cg07788537,1,2799876.0,0.318171,0.750489,0.836434,0,2799801,14000,2800000


In [5]:
feature_dir = home+'data/features/'+dataset+'/'
files = os.listdir(feature_dir)
pattern = '.*all.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]




In [6]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

31
267
317
73
80
735
303


In [7]:
rename_features(all_sites)

In [8]:
all_sites.shape

(1468, 1816)

In [16]:
columns = all_sites.columns.values

In [17]:
columns = all_sites.columns.values
pattern = re.compile(r'.*WGBS.*')
i=0
for col in columns:
    if len(pattern.findall(col))>0:
        print(col)
        i += 1
print(i)

ENCFF003JVR_WGBS_counts
ENCFF043NUK_WGBS_counts
ENCFF064GJQ_WGBS_counts
ENCFF092FNE_WGBS_counts
ENCFF103DNU_WGBS_counts
ENCFF116DGM_WGBS_counts
ENCFF121VIX_WGBS_counts
ENCFF121ZES_WGBS_counts
ENCFF164EAU_WGBS_counts
ENCFF168HTX_WGBS_counts
ENCFF179VKR_WGBS_counts
ENCFF189WPY_WGBS_counts
ENCFF200MJQ_WGBS_counts
ENCFF210XTE_WGBS_counts
ENCFF219GCQ_WGBS_counts
ENCFF223LJW_WGBS_counts
ENCFF247ILV_WGBS_counts
ENCFF254DBF_WGBS_counts
ENCFF266NGW_WGBS_counts
ENCFF279HCL_WGBS_counts
ENCFF297CJG_WGBS_counts
ENCFF303ZGP_WGBS_counts
ENCFF315ZJB_WGBS_counts
ENCFF318AMC_WGBS_counts
ENCFF331VRY_WGBS_counts
ENCFF333OHK_WGBS_counts
ENCFF355UVU_WGBS_counts
ENCFF366UWF_WGBS_counts
ENCFF428TVT_WGBS_counts
ENCFF459EEM_WGBS_counts
ENCFF477AUC_WGBS_counts
ENCFF477GKI_WGBS_counts
ENCFF479QJK_WGBS_counts
ENCFF487XOB_WGBS_counts
ENCFF489CEV_WGBS_counts
ENCFF500DKA_WGBS_counts
ENCFF510EMT_WGBS_counts
ENCFF511FUP_WGBS_counts
ENCFF513ITC_WGBS_counts
ENCFF536RSX_WGBS_counts
ENCFF545MIY_WGBS_counts
ENCFF550FZT_WGBS

In [9]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [10]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [11]:
all_sites.ix[:,:1950]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,winid,A549,Astrocy,...,ENCFF723ZMR_RNASeq_counts,ENCFF301ROZ_RNASeq_counts,ENCFF888ZFS_RNASeq_counts,ENCFF105THO_RNASeq_counts,ENCFF760IDU_RNASeq_counts,ENCFF624VBI_RNASeq_counts,ENCFF552FTX_RNASeq_counts,ENCFF623UTC_RNASeq_counts,ENCFF535JQR_RNASeq_counts,ENCFF003JVR_WGBS_counts
0,cg27573606,1,838379.0,0.685193,0.493542,0.965170,0,4192,3,2,...,0.0,0.0,1.0,0.0,8.0,0.0,0.0,2.0,15.0,10.0
1,cg04121631,1,875345.0,-0.680762,0.496340,0.142312,0,4377,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0
2,cg16043700,1,1114523.0,0.154428,0.877335,0.702867,0,5573,1,2,...,0.0,0.0,6.0,0.0,4.0,2.0,0.0,0.0,17.0,5.0
3,cg24807889,1,1135955.0,0.861507,0.389375,0.744596,0,5680,3,4,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,1.0
4,cg24470133,1,1483206.0,0.028189,0.977523,0.960068,0,7417,0,3,...,4.0,0.0,9.0,3.0,22.0,6.0,0.0,5.0,18.0,20.0
5,cg25031205,1,1688976.0,0.389879,0.696794,0.894453,0,8445,3,2,...,4.0,3.0,1.0,5.0,2.0,11.0,2.0,13.0,3.0,5.0
6,cg01520454,1,2304375.0,0.263077,0.792600,0.781696,0,11522,10,3,...,0.0,0.0,21.0,0.0,20.0,6.0,0.0,2.0,148.0,1.0
7,cg07141317,1,2327775.0,0.386512,0.699283,0.853852,0,11639,5,6,...,8.0,2.0,0.0,26.0,0.0,42.0,17.0,22.0,0.0,22.0
8,cg08655953,1,2425888.0,0.891032,0.373344,0.968990,0,12130,9,6,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,1.0,0.0
9,cg07788537,1,2799876.0,0.318171,0.750489,0.836434,0,14000,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [12]:
#nearest tss distance    
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [13]:
with pd.HDFStore(home+'data/'+dataset+'/all_features','w') as h5s:
    h5s['all_features'] = all_sites

In [3]:
###all 450K sites features
dataset = 'AD_CpG'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type


In [23]:
with pd.HDFStore(home+'data/'+dataset.split('/')[0]+'/all_450k_sites_winid','r') as h5s:
    all_sites = h5s['all_450k_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [21]:
all_sites

Unnamed: 0,id,chr,coordinate,pvalue,beta,winid,A549-ATF3,A549-BCL3,A549-CEBPB,A549-CREB1,...,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts,dist_to_nearest_tss
0,cg13869341,1,15865.0,0.977843,0.877442,80.0,4.0,11.0,0.0,0.0,...,0.0,13.0,0.0,7.0,0.0,0.0,11.0,3.0,0.0,3991.0
1,cg24669183,1,534242.0,0.818908,0.786274,2672.0,6.0,11.0,1.0,1.0,...,0.0,16.0,14.0,4.0,9.0,2.0,9.0,12.0,12.0,87792.0
2,cg15560884,1,710097.0,0.843290,0.652625,3551.0,3.0,3.0,0.0,2.0,...,3.0,72.0,74.0,32.0,83.0,38.0,18.0,62.0,21.0,3971.0
3,cg01014490,1,714177.0,0.445636,0.008987,3571.0,24.0,24.0,4.0,2371.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0
4,cg17505339,1,720865.0,0.211757,0.859844,3605.0,2.0,0.0,0.0,0.0,...,7.0,5.0,28.0,17.0,35.0,23.0,30.0,21.0,16.0,6797.0
5,cg11954957,1,758829.0,0.139721,0.816452,3795.0,3.0,2.0,0.0,0.0,...,3.0,54.0,26.0,33.0,19.0,31.0,10.0,13.0,7.0,4073.0
6,cg23803172,1,763119.0,0.667176,0.004858,3816.0,13.0,10.0,0.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0
7,cg16736630,1,779995.0,0.338102,0.758187,3900.0,2.0,1.0,0.0,2.0,...,2.0,71.0,23.0,13.0,16.0,26.0,32.0,15.0,9.0,16817.0
8,cg05898754,1,805102.0,0.502358,0.359355,4026.0,6.0,5.0,5.0,10.0,...,1.0,7.0,5.0,3.0,3.0,0.0,8.0,2.0,2.0,7080.0
9,cg03128332,1,805338.0,0.439849,0.142779,4027.0,13.0,10.0,2.0,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6844.0


In [7]:
feature_dir = home+'data/features/'+dataset+'/'
files = os.listdir(feature_dir)
pattern = '.*all_450k.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]

In [8]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

317
267
80
31
735
303
73


In [9]:
rename_features(all_sites)

In [10]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [11]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'all_450k_addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [15]:
#nearest tss distance 
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [16]:
all_sites

Unnamed: 0,id,chr,coordinate,pvalue,beta,winid,A549-ATF3,A549-BCL3,A549-CEBPB,A549-CREB1,...,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts,dist_to_nearest_tss
0,cg13869341,1,15865.0,0.977843,0.877442,80.0,4.0,11.0,0.0,0.0,...,0.0,13.0,0.0,7.0,0.0,0.0,11.0,3.0,0.0,3991.0
1,cg24669183,1,534242.0,0.818908,0.786274,2672.0,6.0,11.0,1.0,1.0,...,0.0,16.0,14.0,4.0,9.0,2.0,9.0,12.0,12.0,87792.0
2,cg15560884,1,710097.0,0.843290,0.652625,3551.0,3.0,3.0,0.0,2.0,...,3.0,72.0,74.0,32.0,83.0,38.0,18.0,62.0,21.0,3971.0
3,cg01014490,1,714177.0,0.445636,0.008987,3571.0,24.0,24.0,4.0,2371.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0
4,cg17505339,1,720865.0,0.211757,0.859844,3605.0,2.0,0.0,0.0,0.0,...,7.0,5.0,28.0,17.0,35.0,23.0,30.0,21.0,16.0,6797.0
5,cg11954957,1,758829.0,0.139721,0.816452,3795.0,3.0,2.0,0.0,0.0,...,3.0,54.0,26.0,33.0,19.0,31.0,10.0,13.0,7.0,4073.0
6,cg23803172,1,763119.0,0.667176,0.004858,3816.0,13.0,10.0,0.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0
7,cg16736630,1,779995.0,0.338102,0.758187,3900.0,2.0,1.0,0.0,2.0,...,2.0,71.0,23.0,13.0,16.0,26.0,32.0,15.0,9.0,16817.0
8,cg05898754,1,805102.0,0.502358,0.359355,4026.0,6.0,5.0,5.0,10.0,...,1.0,7.0,5.0,3.0,3.0,0.0,8.0,2.0,2.0,7080.0
9,cg03128332,1,805338.0,0.439849,0.142779,4027.0,13.0,10.0,2.0,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6844.0


In [19]:
all_sites

Unnamed: 0,id,chr,coordinate,pvalue,beta,winid,A549-ATF3,A549-BCL3,A549-CEBPB,A549-CREB1,...,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts,dist_to_nearest_tss
0,cg13869341,1,15865.0,0.977843,0.877442,80.0,4.0,11.0,0.0,0.0,...,0.0,13.0,0.0,7.0,0.0,0.0,11.0,3.0,0.0,3991.0
1,cg24669183,1,534242.0,0.818908,0.786274,2672.0,6.0,11.0,1.0,1.0,...,0.0,16.0,14.0,4.0,9.0,2.0,9.0,12.0,12.0,87792.0
2,cg15560884,1,710097.0,0.843290,0.652625,3551.0,3.0,3.0,0.0,2.0,...,3.0,72.0,74.0,32.0,83.0,38.0,18.0,62.0,21.0,3971.0
3,cg01014490,1,714177.0,0.445636,0.008987,3571.0,24.0,24.0,4.0,2371.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0
4,cg17505339,1,720865.0,0.211757,0.859844,3605.0,2.0,0.0,0.0,0.0,...,7.0,5.0,28.0,17.0,35.0,23.0,30.0,21.0,16.0,6797.0
5,cg11954957,1,758829.0,0.139721,0.816452,3795.0,3.0,2.0,0.0,0.0,...,3.0,54.0,26.0,33.0,19.0,31.0,10.0,13.0,7.0,4073.0
6,cg23803172,1,763119.0,0.667176,0.004858,3816.0,13.0,10.0,0.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0
7,cg16736630,1,779995.0,0.338102,0.758187,3900.0,2.0,1.0,0.0,2.0,...,2.0,71.0,23.0,13.0,16.0,26.0,32.0,15.0,9.0,16817.0
8,cg05898754,1,805102.0,0.502358,0.359355,4026.0,6.0,5.0,5.0,10.0,...,1.0,7.0,5.0,3.0,3.0,0.0,8.0,2.0,2.0,7080.0
9,cg03128332,1,805338.0,0.439849,0.142779,4027.0,13.0,10.0,2.0,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6844.0


In [20]:
with pd.HDFStore(home+'data/AD_CpG/all_450k_features','w') as h5s:
    h5s['all_450k_features'] = all_sites