In [1]:
import pandas as pd
import numpy as np
import sys
from common import commons
home = commons.home
from features_preprocess import get_winid
import os
import re

In [2]:
def nearest_tss(tss,sites_df):
    merged = pd.merge(sites_df,tss,how='outer',on=['chr','coordinate'])
    merged.sort_values(['chr','coordinate'],inplace=True)
    merged.rename(columns={'strand':'before_tss'},inplace=True)
    merged.ix[merged['before_tss'].isnull()==False, 'before_tss'] = merged.ix[merged['before_tss'].isnull()==False,'coordinate']
    merged['after_tss'] = merged['before_tss']
    merged['before_tss'].fillna(method='ffill', inplace=True)
    merged['after_tss'].fillna(method='bfill',inplace=True)
    merged['dist_to_before_tss'] = np.abs(merged['coordinate']-merged['before_tss'])
    merged['dist_to_after_tss'] = np.abs(merged['coordinate']-merged['after_tss'])
    merged['tss'] = None
    before_ix = (merged['dist_to_before_tss'] < merged['dist_to_after_tss']) | (merged['dist_to_after_tss'].isnull())
    merged.ix[before_ix,'tss'] = merged.ix[before_ix,'before_tss']
    after_ix = (merged['dist_to_before_tss'] >= merged['dist_to_after_tss']) | (merged['dist_to_before_tss'].isnull())
    merged.ix[after_ix,'tss'] = merged.ix[after_ix,'after_tss']
    merged['dist_to_nearest_tss'] = np.abs(merged['coordinate']-merged['tss']) 
    merged.drop(['before_tss','after_tss','tss','dist_to_before_tss','dist_to_after_tss'],axis=1,inplace=True)
    merged.dropna(axis=0,subset=['id'],inplace=True)
    return merged

def rename_features(x):   #rename repetitive features
    features = np.array(x.columns)
    features_count = pd.Series(index=x.columns.unique())
    features_count = features_count.fillna(int(0))
    for i,name in enumerate(x.columns):
        if features_count[name] == 0:
            features_count[name] += 1
        else:
            features[i] = name+str(features_count[name])
            features_count[name] += 1
    x.columns = features
    return 

In [3]:
dataset = 'Cd'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type
with pd.HDFStore(home+'data/'+dataset+'/all_sites_winid','r') as h5s:
    all_sites = h5s['all_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [4]:
all_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,start,winid,end
0,cg08750554,1,1005100.0,0.460842,6.451141e-01,0.221929,0,1005001,5026,1005200
1,cg13362546,1,1102960.0,0.582537,5.604699e-01,0.780087,0,1102801,5515,1103000
2,cg22373622,1,1489544.0,4.490939,8.830000e-06,0.918745,1,1489401,7448,1489600
3,cg16926213,1,1841314.0,5.065649,5.750000e-07,0.372401,1,1841201,9207,1841400
4,cg12656307,1,2349734.0,0.861177,3.895565e-01,0.401944,0,2349601,11749,2349800
5,cg00373616,1,2434487.0,0.220757,8.253727e-01,0.769612,0,2434401,12173,2434600
6,cg25618424,1,2989307.0,0.507053,6.123429e-01,0.509518,0,2989201,14947,2989400
7,cg09119863,1,3129238.0,0.435539,6.633605e-01,0.697770,0,3129201,15647,3129400
8,cg01079872,1,3302477.0,0.678813,4.975729e-01,0.904952,0,3302401,16513,3302600
9,cg00571809,1,3397113.0,0.234530,8.146707e-01,0.947405,0,3397001,16986,3397200


In [5]:
feature_dir = home+'data/features/'+dataset+'/'
files = os.listdir(feature_dir)
pattern = '.*all.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]




In [6]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

303
31
267
317
735
73
80


In [7]:
rename_features(all_sites)

In [8]:
all_sites.shape

(1332, 1816)

In [9]:
columns = all_sites.columns.values

In [10]:
columns = all_sites.columns.values
pattern = re.compile(r'.*WGBS.*')
i=0
for col in columns:
    if len(pattern.findall(col))>0:
        print(col)
        i += 1
print(i)

0


In [11]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [12]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [13]:
all_sites.ix[:,:1950]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,winid,A549-BHLHE40,A549-CEBPB,...,ENCFF723ZMR_RNASeq_counts,ENCFF301ROZ_RNASeq_counts,ENCFF888ZFS_RNASeq_counts,ENCFF105THO_RNASeq_counts,ENCFF760IDU_RNASeq_counts,ENCFF624VBI_RNASeq_counts,ENCFF552FTX_RNASeq_counts,ENCFF623UTC_RNASeq_counts,ENCFF535JQR_RNASeq_counts,ENCFF003JVR_WGBS_counts
0,cg08750554,1,1005100.0,0.460842,6.451141e-01,0.221929,0,5026,2,3,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
1,cg13362546,1,1102960.0,0.582537,5.604699e-01,0.780087,0,5515,3,0,...,0.0,0.0,1.0,0.0,1.0,19.0,0.0,6.0,2.0,0.0
2,cg22373622,1,1489544.0,4.490939,8.830000e-06,0.918745,1,7448,2,0,...,0.0,5.0,5.0,13.0,16.0,26.0,7.0,8.0,15.0,19.0
3,cg16926213,1,1841314.0,5.065649,5.750000e-07,0.372401,1,9207,4,3,...,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,30.0,12.0
4,cg12656307,1,2349734.0,0.861177,3.895565e-01,0.401944,0,11749,5,1,...,0.0,0.0,0.0,2.0,2.0,4.0,2.0,2.0,6.0,6.0
5,cg00373616,1,2434487.0,0.220757,8.253727e-01,0.769612,0,12173,2,0,...,1.0,0.0,0.0,1.0,0.0,163.0,0.0,5.0,0.0,4.0
6,cg25618424,1,2989307.0,0.507053,6.123429e-01,0.509518,0,14947,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
7,cg09119863,1,3129238.0,0.435539,6.633605e-01,0.697770,0,15647,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
8,cg01079872,1,3302477.0,0.678813,4.975729e-01,0.904952,0,16513,0,0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
9,cg00571809,1,3397113.0,0.234530,8.146707e-01,0.947405,0,16986,3,1,...,39.0,1.0,0.0,0.0,0.0,31.0,1.0,4.0,3.0,8.0


In [14]:
#nearest tss distance    
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [15]:
with pd.HDFStore(home+'data/'+dataset+'/all_features','w') as h5s:
    h5s['all_features'] = all_sites

In [3]:
###all 450K sites features, only need to RUN ONCE
dataset = 'AD_CpG'
feature_dir = home+'data/features/'+dataset+'/'
all_450_features = home+'data/'+dataset+'/all_450k_features'
if dataset == 'AD_CpG':
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = dataset+'/'+type_name+with_cell_type


In [76]:
with pd.HDFStore(home+'data/'+dataset+'/all_450k_sites_winid','r') as h5s:
    all_sites = h5s['all_450k_sites_winid']
all_sites.reset_index(drop=True,inplace=True)

In [77]:
all_sites

Unnamed: 0,id,chr,coordinate,pvalue,beta,start,winid,end
0,cg13869341,1,15865.0,0.239378,0.877442,15801,80,16000
1,cg24669183,1,534242.0,0.717648,0.786274,534201,2672,534400
2,cg15560884,1,710097.0,0.716762,0.652625,710001,3551,710200
3,cg01014490,1,714177.0,0.311382,0.008987,714001,3571,714200
4,cg17505339,1,720865.0,0.093356,0.859844,720801,3605,721000
5,cg11954957,1,758829.0,0.822191,0.816452,758801,3795,759000
6,cg23803172,1,763119.0,0.882293,0.004858,763001,3816,763200
7,cg16736630,1,779995.0,0.158965,0.758187,779801,3900,780000
8,cg05898754,1,805102.0,0.525479,0.359355,805001,4026,805200
9,cg03128332,1,805338.0,0.666332,0.142779,805201,4027,805400


In [78]:
feature_dir = home+'data/features/'+dataset.split('/')[0]+'/'
files = os.listdir(feature_dir)
pattern = '.*all_450k.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]

In [80]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    all_sites = pd.concat([all_sites,feature],axis=1)

317
31
735
303
73
267
80


In [81]:
rename_features(all_sites)

In [82]:
all_sites.drop(['start','end'],axis=1,inplace=True)

In [83]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'all_450k_addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([all_sites,feature_frame],axis=1)
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [84]:
#nearest tss distance 
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
all_sites = nearest_tss(tss,all_sites)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [87]:
all_sites

Unnamed: 0,id,chr,coordinate,pvalue,beta,winid,A549-ATF3,A549-BCL3,A549-CEBPB,A549-CREB1,...,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts,dist_to_nearest_tss
0,cg13869341,1,15865.0,0.239378,0.877442,80.0,0.001230,0.003159,0.000000,0.000000,...,0.000000,0.014806,0.000000,0.040698,0.000000,0.000000,0.028351,0.003995,0.000000,0.001643
1,cg24669183,1,534242.0,0.717648,0.786274,2672.0,0.001844,0.003159,0.000420,0.000054,...,0.000000,0.018223,0.035714,0.023256,0.008219,0.008403,0.023196,0.015979,0.072727,0.036137
2,cg15560884,1,710097.0,0.716762,0.652625,3551.0,0.000922,0.000862,0.000000,0.000109,...,0.057692,0.082005,0.188776,0.186047,0.075799,0.159664,0.046392,0.082557,0.127273,0.001635
3,cg01014490,1,714177.0,0.311382,0.008987,3571.0,0.007378,0.006893,0.001679,0.128999,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000045
4,cg17505339,1,720865.0,0.093356,0.859844,3605.0,0.000615,0.000000,0.000000,0.000000,...,0.134615,0.005695,0.071429,0.098837,0.031963,0.096639,0.077320,0.027963,0.096970,0.002798
5,cg11954957,1,758829.0,0.822191,0.816452,3795.0,0.000922,0.000574,0.000000,0.000000,...,0.057692,0.061503,0.066327,0.191860,0.017352,0.130252,0.025773,0.017310,0.042424,0.001677
6,cg23803172,1,763119.0,0.882293,0.004858,3816.0,0.003996,0.002872,0.000000,0.001360,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000024
7,cg16736630,1,779995.0,0.158965,0.758187,3900.0,0.000615,0.000287,0.000000,0.000109,...,0.038462,0.080866,0.058673,0.075581,0.014612,0.109244,0.082474,0.019973,0.054545,0.006922
8,cg05898754,1,805102.0,0.525479,0.359355,4026.0,0.001844,0.001436,0.002099,0.000544,...,0.019231,0.007973,0.012755,0.017442,0.002740,0.000000,0.020619,0.002663,0.012121,0.002914
9,cg03128332,1,805338.0,0.666332,0.142779,4027.0,0.003996,0.002872,0.000840,0.002775,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002817


In [86]:
with pd.HDFStore(all_450_features,'w') as h5s:
    h5s['all_450k_features'] = all_sites.drop(['pvalue','beta'],axis=1)