In [1]:
import sys
import pandas as pd
import numpy as np
from common import commons 
home = commons.home
from log import Logger
import os

In [2]:
def cal_beta(beta_file,pos_file):
    """Compute the mean beta value of every probe and attach its genomic position.

    Parameters
    ----------
    beta_file : str
        Path to a whitespace-delimited table whose header contains a
        ``TargetID`` column; all remaining columns are averaged row-wise.
    pos_file : str
        Path to a whitespace-delimited metadata table. Its first row is
        skipped, and columns 0, 2 and 3 are read as the probe id, ``chr``
        and ``coordinate`` respectively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``TargetID`` with columns ``beta``, ``chr`` and
        ``coordinate``. The join is a left join on the probe id, so probes
        missing from ``pos_file`` keep NaN positions.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    betas = pd.read_csv(beta_file, sep=r'\s+', index_col=['TargetID'])
    # to_frame names the column directly and keeps the probe index.
    mean_betas = betas.mean(axis=1).to_frame('beta')
    pos = pd.read_csv(
        pos_file,
        sep=r'\s+',
        usecols=[0, 2, 3],
        index_col=0,
        header=None,
        skiprows=1,
        names=['id', 'chr', 'coordinate'],
    )
    return mean_betas.join(pos)

In [4]:
# Input locations, all relative to the project `home` directory.
beta_file = home+'data/AD_CpG/ROSMAP_arrayMethylation_imputed.tsv'
pos_file = home+'data/AD_CpG/ROSMAP_arrayMethylation_metaData.tsv'
log_dir = home+'logs/'

# Project logger writing under `log_dir`.
logger = Logger.Logger(log_dir).get_logger()

# Mean beta per probe joined with its chromosome / coordinate.
all_sites_betas = cal_beta(beta_file=beta_file, pos_file=pos_file)

In [5]:
# Preview only: show the first rows instead of dumping the whole table.
all_sites_betas.head(10)

Unnamed: 0_level_0,beta,chr,coordinate
TargetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg00000165,0.154822,1,91194674
cg00000363,0.131544,1,230560793
cg00000957,0.779174,1,5937253
cg00001349,0.893716,1,166958439
cg00001364,0.727746,1,214170376
cg00001446,0.843326,1,43831041
cg00001534,0.882839,1,51034865
cg00001583,0.073302,1,200011786
cg00001593,0.927707,1,170490434
cg00002028,0.023826,1,20960010


In [6]:
# Upper p-value bound for "positive" sites (selected later with pvalue <= pos_pvalue).
# Set by hand per trait; values noted by the author:
# 0.001 for amyloid, 0.0001 for cerad, 0.00001 for tangles, 0.002 for cogdec, 0.0002 for gpath, 0.0002 for braak
pos_pvalue = 0.0002
# Sites with pvalue > neg_pvalue form the pool of negative candidates.
neg_pvalue = 0.4
# Number of beta-matched negatives targeted per positive site (half taken on each side of the match).
sample_ratio_neg_to_pos = 10

In [7]:
type_name = commons.type_name  ## amyloid, cerad, tangles
with_cell_type = commons.with_cell_type ## with or without
all_sites_file = home+'data/AD_CpG/Rosmap_'+type_name+'_ewas_'+with_cell_type+'celltype.csv'

# EWAS results: file columns 1-3 are read as probe id (index), beta_sign and pvalue.
all_sites = pd.read_csv(all_sites_file,usecols=[1,2,3],header=None,skiprows=1,index_col=0,names=['id','beta_sign','pvalue'])

# Attach mean beta / position, drop probes missing any value, and turn the
# probe id back into a regular column.
all_sites = all_sites.join(all_sites_betas).dropna().reset_index()

# Fix the column order in one step; .copy() yields an independent frame.
all_sites = all_sites[['id','chr','coordinate','beta_sign','pvalue','beta']].copy()
all_sites['chr'] = all_sites['chr'].astype('i8')
all_sites = all_sites.sort_values(['pvalue'],ascending=True)

# query() returns a selection of `all_sites`; copy before adding the label
# column so the assignment does not raise SettingWithCopyWarning.
positive_sites = all_sites.query('pvalue<=@pos_pvalue').copy()
# Label positives by the sign of beta_sign: +1 when > 0, otherwise -1.
positive_sites['label'] = np.where(positive_sites['beta_sign']>0,1,-1)

negative_sites = all_sites.query('pvalue>@neg_pvalue').copy()
negative_sites['label'] = 0

# Sorted by beta so negatives can later be matched to positives with a binary search.
negatives_sort_by_beta = negative_sites.sort_values(['beta'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# Preview only: the most significant positive sites (frame is sorted by pvalue).
positive_sites.head(10)

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label
78298,cg01824948,14,102057263.0,25.162988,7.327283e-07,0.962675,1
94226,cg11629889,17,40839022.0,25.127508,7.456763e-07,0.547185,1
87588,cg05507697,16,89421700.0,21.750419,3.988525e-06,0.841004,1
97986,cg25018458,17,980014.0,21.616622,4.264255e-06,0.960097,1
6919,cg17029193,1,8510607.0,21.327362,4.927823e-06,0.676125,1
35305,cg02578944,6,28227392.0,21.088274,5.554223e-06,0.557166,1
30991,cg09542997,5,179621502.0,21.054452,5.649092e-06,0.938446,1
49010,cg25594100,7,4786943.0,21.034811,5.704932e-06,0.780079,1
83040,cg12067522,15,52405000.0,19.925303,9.955961e-06,0.219051,1
17683,cg22883290,2,127800646.0,19.633284,1.153207e-05,0.966171,1


In [9]:
# Preview only: first rows of the negative candidate pool.
negative_sites.head(10)

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label
154697,cg13030331,2,241497599.0,7.095353e-01,0.400000,0.170406,0
295305,cg15953187,11,1256378.0,7.095239e-01,0.400004,0.853170,0
102660,cg08610201,19,1491082.0,7.095121e-01,0.400008,0.022900,0
65472,cg13324568,11,61197484.0,7.095093e-01,0.400009,0.003710,0
160687,cg21116021,2,131422514.0,7.094990e-01,0.400012,0.800794,0
177546,cg19981243,3,114478256.0,7.094933e-01,0.400014,0.476370,0
251106,cg01715818,8,54507532.0,7.094910e-01,0.400015,0.895642,0
392875,cg06362543,20,42135794.0,7.094831e-01,0.400017,0.494833,0
353726,cg10511186,16,801394.0,7.094804e-01,0.400018,0.662781,0
217006,cg08321824,6,165831916.0,7.094800e-01,0.400018,0.927070,0


In [10]:
# For every positive site, pick negatives with the closest mean beta and the
# same beta_sign direction. The same negative may be picked for several
# positives, so `select_negs` can contain duplicate ids.
select_negs_list = []
# Number of negatives taken on each side of the insertion point.
half_window = int(sample_ratio_neg_to_pos/2)
# Both subsets stay sorted by beta because query() preserves row order.
hyper_sites = negatives_sort_by_beta.query('beta_sign>=0')
hypo_sites = negatives_sort_by_beta.query('beta_sign<0')
for beta,beta_sign in positive_sites[['beta','beta_sign']].values:
    tmp_sites = hyper_sites if beta_sign >=0 else hypo_sites
    # np.searchsorted on the raw values returns a scalar for a scalar needle,
    # independent of the pandas version (Series.searchsorted changed from
    # returning an array to returning a scalar).
    neg_ix = int(np.searchsorted(tmp_sites['beta'].values, beta))
    # Clamp both ends: a negative start would wrap around to the end of the
    # frame and yield an empty or wrong window.
    start = max(neg_ix-half_window, 0)
    stop = min(neg_ix+half_window, len(tmp_sites))
    negs = tmp_sites.iloc[start:stop,:]
    select_negs_list.extend(negs.values)
select_negs = pd.DataFrame(select_negs_list,columns=['id','chr','coordinate','beta_sign','pvalue','beta','label'])

In [11]:
# Map positive and selected negative sites onto feature windows, then combine them.
win_path = home+'data/commons/wins.txt'
pos_sites_with_winid, neg_sites_with_winid = commons.merge_with_feature_windows(win_path,positive_sites,select_negs)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_sites_with_winid = pd.concat([pos_sites_with_winid, neg_sites_with_winid], ignore_index=True)
# Positives come first, so with the default keep='first' a positive row wins
# over a negative row sharing the same id.
all_sites_with_winid = all_sites_with_winid.drop_duplicates(['id'])
all_sites_with_winid = all_sites_with_winid.sort_values(['chr','coordinate'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['chr'] = data['chr'].astype('i8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  feature_wins.sort_values(['chr','start'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  pos_sites.sort_values(['chr','coordinate'],inplace=Tru

In [12]:
#export all features to file
out_dir = home+'data/AD_CpG/'+type_name+with_cell_type
# makedirs(exist_ok=True) creates missing parents and is safe to re-run,
# unlike a separate exists() check followed by mkdir().
os.makedirs(out_dir, exist_ok=True)
with pd.HDFStore(out_dir+'/all_sites_winid','w') as h5s:
    h5s['all_sites_winid'] = all_sites_with_winid
all_sites_with_winid.to_csv(out_dir+'/all_sites_winid.csv',index=False)
# NOTE(review): whether a header line is written for a Series depends on the
# pandas version (the default changed across releases); confirm what the
# consumer of this file expects.
all_sites_with_winid['winid'].to_csv(out_dir+'/selected_pos_winid.csv',index=False)

In [13]:
# Preview only: first rows of the combined, window-annotated site table.
all_sites_with_winid.head(10)

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,start,winid,end
113,cg25605174,1,1106175.0,0.179798,0.671729,0.828180,0,1106001,5531,1106200
114,cg12286462,1,1358109.0,0.260215,0.610197,0.794203,0,1358001,6791,1358200
115,cg24470133,1,1483206.0,0.002705,0.958542,0.960068,0,1483201,7417,1483400
0,cg22373622,1,1489544.0,16.545792,0.000055,0.918745,1,1489401,7448,1489600
116,cg00007800,1,1838518.0,0.280518,0.596597,0.794176,0,1838401,9193,1838600
117,cg06801943,1,2101479.0,0.538160,0.463540,0.645085,0,2101401,10508,2101600
1,cg18934822,1,2191402.0,15.202460,0.000110,0.378193,1,2191401,10958,2191600
118,cg19694770,1,2407304.0,0.668186,0.414073,0.923682,0,2407201,12037,2407400
119,cg18123385,1,2938122.0,0.033714,0.854392,0.575633,0,2938001,14691,2938200
120,cg14008571,1,3038564.0,0.368172,0.544277,0.702007,0,3038401,15193,3038600


In [14]:
# Calls the project helper with the full site table, its `label` column and factor=1.5.
# NOTE(review): the result is bound to `positives` but is not used again in the
# visible part of this notebook; confirm what sample_weights returns and whether
# the name is accurate.
positives = commons.sample_weights(all_sites_with_winid,all_sites_with_winid['label'],factor=1.5)

In [29]:
# Development aid: re-execute common/commons.py so edits made on disk are picked up.
# Note that `home` was bound from commons.home in the first cell and is NOT
# refreshed by this reload; attributes read again afterwards (e.g.
# commons.type_name in the next cell) are.
from importlib import reload
reload(commons)

<module 'common.commons' from '/home/ec2-user/volume/git/EnsembleCpG/code/common/commons.py'>

In [30]:
##export winid with all 450k sites
type_name = commons.type_name  ## amyloid, cerad, tangles
with_cell_type = commons.with_cell_type ## with or without
all_sites_file = home+'data/AD_CpG/Rosmap_'+type_name+'_ewas_'+with_cell_type+'celltype.csv'

# Read the EWAS table, attach mean beta / position, drop incomplete probes and
# expose the probe id as an ordinary column.
all_sites = (
    pd.read_csv(
        all_sites_file,
        usecols=[1,2,3],
        header=None,
        skiprows=1,
        index_col=0,
        names=['id','beta_sign','pvalue'],
    )
    .join(all_sites_betas)
    .dropna()
    .reset_index()
)

# Keep the six working columns in a fixed order, on an independent frame.
temp = all_sites.loc[:, ['id','chr','coordinate','beta_sign','pvalue','beta']].copy()
temp['chr'] = temp['chr'].astype('i8')
temp.sort_values(['pvalue'],inplace=True,ascending=True)
all_sites = temp

In [31]:
# Preview only: the most significant sites (frame is sorted by pvalue).
all_sites.head(10)

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta
49010,cg25594100,7,4786943.0,5.615169,3.258135e-08,0.780079
129485,cg13390284,1,65531864.0,5.032855,6.748768e-07,0.128000
51592,cg11823178,8,41519399.0,5.000947,7.904325e-07,0.919399
50455,cg05066959,8,41519308.0,4.962907,9.532788e-07,0.865159
124441,cg08706567,1,43814983.0,4.784900,2.254117e-06,0.121573
75416,cg00931644,13,77461368.0,4.743859,2.738557e-06,0.232159
48365,cg22962123,7,27153605.0,4.712061,3.181293e-06,0.264542
4774,cg11839415,1,43814764.0,4.678513,3.722806e-06,0.183050
61024,cg23968456,10,73521631.0,4.610318,5.109414e-06,0.968990
44393,cg07180538,7,4786899.0,4.585698,5.722642e-06,0.868481


In [32]:
# Map every site onto its feature window; only the first returned frame is needed.
win_path = home+'data/commons/wins.txt'
# Use a regular throwaway name: assigning to `__` overrides IPython's
# output-history variable of the same name.
all_450k_sites_with_winid, _unused_sites = commons.merge_with_feature_windows(win_path,all_sites)
# Reassign instead of dropping in place, so the frame returned by the helper
# is not mutated.
all_450k_sites_with_winid = all_450k_sites_with_winid.drop(['beta_sign'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['chr'] = data['chr'].astype('i8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  feature_wins.sort_values(['chr','start'],inplace=True)


In [33]:
# Preview only: first rows of the window-annotated table of all sites.
all_450k_sites_with_winid.head(10)

Unnamed: 0,id,chr,coordinate,pvalue,beta,start,winid,end
0,cg13869341,1,15865.0,0.926372,0.877442,15801,80,16000
1,cg24669183,1,534242.0,0.943379,0.786274,534201,2672,534400
2,cg15560884,1,710097.0,0.465516,0.652625,710001,3551,710200
3,cg01014490,1,714177.0,0.853347,0.008987,714001,3571,714200
4,cg17505339,1,720865.0,0.052664,0.859844,720801,3605,721000
5,cg11954957,1,758829.0,0.160539,0.816452,758801,3795,759000
6,cg23803172,1,763119.0,0.452868,0.004858,763001,3816,763200
7,cg16736630,1,779995.0,0.128003,0.758187,779801,3900,780000
8,cg05898754,1,805102.0,0.671161,0.359355,805001,4026,805200
9,cg03128332,1,805338.0,0.112397,0.142779,805201,4027,805400


In [34]:
# Export the window-annotated table of all sites.
out_dir = home+'data/AD_CpG/'+type_name+with_cell_type
# Create the output directory here as well, so this cell does not depend on an
# earlier export cell having been run first.
os.makedirs(out_dir, exist_ok=True)
with pd.HDFStore(out_dir+'/all_450k_sites_winid','w') as h5s:
    h5s['all_450k_sites_winid'] = all_450k_sites_with_winid
all_450k_sites_with_winid.to_csv(out_dir+'/all_450k_sites_winid.csv',index=False)
# NOTE(review): unlike the two files above, this one is written directly under
# data/AD_CpG/ rather than the per-trait sub-directory, so runs for different
# traits overwrite each other. Path kept unchanged because downstream readers
# may rely on it; confirm whether this is intentional.
all_450k_sites_with_winid['winid'].to_csv(home+'data/AD_CpG/selected_450k_pos_winid.csv',index=False)