In [1]:
import os
import sys
from common import commons
home = commons.home
extra_storage = commons.extra_storage
from features_preprocess import BED_binning
from features_preprocess import BED_Preprocess, CADD_Preprocess,DANN_Preprocess,Eigen_Preprocess,GenoCanyon_Preprocess
import subprocess
import pandas as pd
from features_preprocess import get_winid
import prediction_commons
import numpy as np

In [2]:
def wgbs_sites_selection(tss,allsites):
    tss = tss.sort_values(['chr','coordinate'])
    i = 0
    selected_sites = []
    #selected_sites = pd.DataFrame(columns=['chr','coordinate','tss_coordinate'])
    tss['before'] = tss['coordinate']-100000
    tss['after'] = tss['coordinate']+100000
    for row in allsites.iterrows():
        if i >= len(tss):
            break
        chr = row[1]['chr']
        coordinate = row[1]['coordinate']
        winid = row[1]['winid']
        if chr==tss.ix[i,'chr'] and coordinate>=tss.ix[i,'before'] and coordinate<=tss.ix[i,'after']:
            selected_sites.extend([[winid,chr,coordinate,tss.ix[i,'chr'],tss.ix[i,'coordinate']]])
        else:
            while  i<len(tss) and (chr>tss.ix[i,'chr'] or (chr==tss.ix[i,'chr'] and coordinate>tss.ix[i,'after'])):
                i += 1
            if i<len(tss) and chr==tss.ix[i,'chr'] and coordinate>=tss.ix[i,'before'] and coordinate<=tss.ix[i,'after']:
                selected_sites.extend([[winid,chr,coordinate,tss.ix[i,'chr'],tss.ix[i,'coordinate']]])
    return pd.DataFrame(selected_sites,columns=['winid','chr','coordinate','tss_chr','tss_coordinate'])


def nearest_tss(tss,sites_df):
    merged = pd.merge(sites_df,tss,how='outer',on=['chr','coordinate'])
    merged.sort_values(['chr','coordinate'],inplace=True)
    merged.rename(columns={'strand':'before_tss'},inplace=True)
    merged.ix[merged['before_tss'].isnull()==False, 'before_tss'] = merged.ix[merged['before_tss'].isnull()==False,'coordinate']
    merged['after_tss'] = merged['before_tss']
    merged['before_tss'].fillna(method='ffill', inplace=True)
    merged['after_tss'].fillna(method='bfill',inplace=True)
    merged['dist_to_before_tss'] = np.abs(merged['coordinate']-merged['before_tss'])
    merged['dist_to_after_tss'] = np.abs(merged['coordinate']-merged['after_tss'])
    merged['tss'] = None
    before_ix = (merged['dist_to_before_tss'] < merged['dist_to_after_tss']) | (merged['dist_to_after_tss'].isnull())
    merged.ix[before_ix,'tss'] = merged.ix[before_ix,'before_tss']
    after_ix = (merged['dist_to_before_tss'] >= merged['dist_to_after_tss']) | (merged['dist_to_before_tss'].isnull())
    merged.ix[after_ix,'tss'] = merged.ix[after_ix,'after_tss']
    merged['dist_to_nearest_tss'] = np.abs(merged['coordinate']-merged['tss']) 
    merged.dropna(axis=0,subset=['id'],inplace=True)
    return merged

def rename_features(x):   #rename repetitive features
    features = np.array(x.columns)
    features_count = pd.Series(index=x.columns.unique())
    features_count = features_count.fillna(int(0))
    for i,name in enumerate(x.columns):
        if features_count[name] == 0:
            features_count[name] += 1
        else:
            features[i] = name+str(features_count[name])
            features_count[name] += 1
    x.columns = features
    return 

def read_WGBS(file):
    bed = pd.read_csv(file,usecols=[0,1,2,5,9,10],header=None,names=['chr','pos1','pos2','strand','total','percent'],sep='\s+')
    bed['coordinate'] = np.where(bed['strand']=='+',bed['pos1'],bed['pos2'])
    bed.drop(['pos1','pos2'],axis=1,inplace=True)
    bed['count'] = np.round(bed['total']*bed['percent']/100.0)
    bed.drop(['total','percent'],axis=1,inplace=True) 
    #    bed_counts = bed.groupby(['chr','coordinate']).aggregate({'count':sum})
    return bed

In [5]:
###get all WGBS sites
dataset = 'WGBS'
data_dir = extra_storage+'WGBS/'
file = data_dir+'ENCFF844EFX.bed'
chrs=np.arange(1,22,dtype='int64')
wgbs_file = home+'data/'+dataset+'/WGBS.csv'
bed = read_WGBS(file)
bed = get_winid.convert_chr_to_num(bed,chrs).sort_values(['chr','coordinate'])
bed.to_csv(wgbs_file,columns=['chr','coordinate','count'],index=False)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21']


In [6]:
bed

Unnamed: 0,chr,strand,coordinate,count
0,1,+,10468,1.0
1,1,-,10470,0.0
2,1,+,10470,3.0
3,1,-,10472,0.0
4,1,+,10483,1.0
5,1,-,10485,0.0
6,1,+,10488,1.0
7,1,-,10490,0.0
8,1,+,10492,3.0
9,1,-,10494,0.0


In [6]:
#using WGBS(hg19) sites 

win_path='/home/ec2-user/CpGPython/data/wins.txt'
wins = get_winid.read_wins(win_path,chrs)
hg19_wgbs_file = home+'data/'+dataset+'/hg19_WGBS.csv'
hg19_wgbs = pd.read_csv(hg19_wgbs_file).sort_values(['chr','coordinate'])
hg19_wgbs['start'] = (hg19_wgbs['coordinate']/200.0).apply(lambda x: int(np.ceil(x-1))*200+1)
all_sites = pd.merge(wgbs,wins, on=['chr','start'],how='left')
all_wgbs_sites_file = home+'data/'+dataset+'/all_wgbs_sites_winid.csv'
all_sites.to_csv(all_wgbs_sites_file)
all_sites['winid'] = all_sites['winid'].astype('i8')

In [36]:
all_sites

Unnamed: 0,chr,coordinate,winid,start,end
0,1,10468,53,10401,10600.0
1,1,10470,53,10401,10600.0
2,1,10470,53,10401,10600.0
3,1,10472,53,10401,10600.0
4,1,10483,53,10401,10600.0
5,1,10485,53,10401,10600.0
6,1,10488,53,10401,10600.0
7,1,10490,53,10401,10600.0
8,1,10492,53,10401,10600.0
9,1,10494,53,10401,10600.0


In [7]:
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


In [32]:
selected_wgbs_tss = wgbs_sites_selection(tss,all_sites)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [25]:
with pd.HDFStore(home+'data/'+dataset+'/all_selected_wgbs_sites','w') as h5s:
    h5s['all_wgbs'] = selected_wgbs_tss

Unnamed: 0,chr,coordinate,tss_chr,tss_coordinate
0,1.0,10468.0,1,11874
1,1.0,10608.0,1,11874
2,1.0,10810.0,1,11874
3,1.0,13078.0,1,11874
4,1.0,13215.0,1,11874
5,1.0,13416.0,1,11874
6,1.0,13643.0,1,11874
7,1.0,13822.0,1,11874
8,1.0,14348.0,1,11874
9,1.0,14434.0,1,11874


In [None]:
###split into batchs of CpG around tss sites
start = prediction_commons.tss_start
end = prediction_commons.tss_end
selected_wgbs = selected_wgbs_tss[start:end]
sites_file = home+'data/'+dataset+'/all_sites_winid.csv'
selected_wgbs.to_csv(sites_file,index=False)
selected_wgbs.to_csv(home+'data/'+dataset+'/selected_pos_winid.csv',columns=['winid'],index=False,header=None)
additional_feature_file = home+'data/features/'+dataset+'/addtional_features_'+str(start)+'_'+str(end)

In [35]:
selected_wgbs_tss

Unnamed: 0,winid,chr,coordinate,tss_chr,tss_coordinate
0,53.0,1.0,10468.0,1,11874
1,54.0,1.0,10608.0,1,11874
2,55.0,1.0,10810.0,1,11874
3,66.0,1.0,13078.0,1,11874
4,67.0,1.0,13215.0,1,11874
5,68.0,1.0,13416.0,1,11874
6,69.0,1.0,13643.0,1,11874
7,70.0,1.0,13822.0,1,11874
8,72.0,1.0,14348.0,1,11874
9,73.0,1.0,14434.0,1,11874


In [5]:
subprocess.call([home+'code/features_preprocess/Feature_export.R',dataset])

In [None]:

if os.path.exists(WGBS_h5s):
    WGBS_process = BED_Preprocess.BED_Preprocessing(h5s_file=WGBS_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file,data_type='WGBS')
    WGBS_process.process()
else:
    WGBS_binning = BED_binning.BED_binning(data_type='WGBS',data_dir=extra_storage+'WGBS/',output=WGBS_h5s,sorted=True)
    WGBS_binning.binning()
    WGBS_process = BED_Preprocess.BED_Preprocessing(h5s_file=WGBS_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file,data_type='WGBS')
    WGBS_process.process()

In [None]:
ATAC_h5s = home+'data/ATAC_H5S'
if os.path.exists(ATAC_h5s):
    atac_process = BED_Preprocess.BED_Preprocessing(h5s_file=ATAC_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file,data_type='ATAC')
    atac_process.process()
else:
    atac_binning = BED_binning.BED_binning(data_type='ATAC',data_dir=extra_storage+'ATAC/',output=ATAC_h5s,sorted=True)
    atac_binning.binning()
    atac_process = BED_Preprocess.BED_Preprocessing(h5s_file=ATAC_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file,data_type='ATAC')
    atac_process.process()  

In [None]:
RNASeq_h5s = home+'data/RNASeq/'
if len(os.listdir(RNASeq_h5s))>0:
    rnaseq_process = BED_Preprocess.BED_Preprocessing(h5s_file=RNASeq_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file, data_type='RNASeq')
    rnaseq_process.process()
else:
    subprocess.call(['python',home+'code/feature_preprocess/RNASeq_binning.py'])
    rnaseq_process = BED_Preprocess.BED_Preprocessing(h5s_file=RNASeq_h5s,sites_file=sites_file,additional_feature_file=additional_feature_file, data_type='RNASeq')
    rnaseq_process.process()

In [None]:
cadd_preprocess = CADD_Preprocess.CADD_Preprocess(sites_file=sites_file,additional_feature_file=additional_feature_file)
cadd_preprocess.process()

In [None]:
dann_preprocess = DANN_Preprocess.DANN_Preprocess(sites_file=sites_file,additional_feature_file=additional_feature_file)
dann_preprocess.process()

In [None]:
eigen_preprocess = Eigen_Preprocess.Eigen_Preprocess(sites_file=sites_file,additional_feature_file=additional_feature_file)
eigen_preprocess.process()

In [None]:
genocanyon_scores = extra_storage+'GenoCanyon/Results/'+dataset+'/selected_site_scores.txt'
data_dir=extra_storage+'GenoCanyon/Results/'+dataset+'/'
if os.path.exists(genocanyon_scores):
    genocanyon_preprocess = GenoCanyon_Preprocess.GenoCanyon_Preprocess(data_dir=data_dir,sites_file=sites_file,additional_feature_file=additional_feature_file)
    genocanyon_preprocess.process()
else:
    print('Run GenoCanyon R script first')

In [11]:
feature_dir = home+'data/features/'+dataset+'/'
files = os.listdir(feature_dir)
pattern = '.*all.csv$'
reg = re.compile(pattern)
files = [name for name in files if len(reg.findall(name))>0]

Unnamed: 0,chr,coordinate,winid,start,end
0,1,10468,53,10401,10600.0
1,1,10470,53,10401,10600.0
2,1,10470,53,10401,10600.0
3,1,10472,53,10401,10600.0
4,1,10483,53,10401,10600.0
5,1,10485,53,10401,10600.0
6,1,10488,53,10401,10600.0
7,1,10490,53,10401,10600.0
8,1,10492,53,10401,10600.0
9,1,10494,53,10401,10600.0


In [None]:
for file in files:    
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    selected_wgbs = pd.concat([selected_wgbs,feature],axis=1)

In [None]:
rename_features(selected_wgbs)
selected_wgbs.shape

In [None]:
selected_wgbs.drop(['start','end'],axis=1,inplace=True)

In [None]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
with pd.HDFStore(feature_dir+'addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        selected_wgbs = pd.concat([selected_wgbs,feature_frame],axis=1)
selected_wgbs = selected_wgbs.loc[:,~selected_wgbs.columns.duplicated()]
selected_wgbs['chr'] = selected_wgbs['chr'].astype('i8')

In [None]:
#nearest tss distance    
chrs = selected_wgbs['chr'].unique()
cols=['chr', 'coordinate','strand']
tss =  pd.read_csv(home+'tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
tss = get_winid.convert_chr_to_num(tss,chrs)
tss.sort_values(['chr','coordinate'],inplace=True)
selected_wgbs = nearest_tss(tss,selected_wgbs)
selected_wgbs.drop(['before_tss','after_tss','tss','dist_to_before_tss','dist_to_after_tss'],axis=1,inplace=True)

In [None]:
with pd.HDFStore(home+'data/'+dataset+'/all_features_'+str(start)+'_'+str(end),'w') as h5s:
    h5s['all_features'] = selected_wgbs