In [1]:
import sys
import pandas as pd
import numpy as np
from common import commons 
home = commons.home
from log import Logger

In [2]:
def cal_beta(beta_file,pos_file):
    betas = pd.read_csv(beta_file,sep='\s+',index_col=['TargetID'])
    mean_betas = pd.DataFrame(betas.mean(axis=1),columns=['beta'])
    mean_betas.index = betas.index
    pos = pd.read_csv(pos_file,sep='\s+',usecols=[0,2,3],index_col=0, header=None,skiprows=1,names=['id','chr','coordinate'])
    beta_pos = mean_betas.join(pos)
    return beta_pos

In [3]:

log_dir = home+'logs/'
logger = Logger.Logger(log_dir).get_logger()
beta_file = home+'data/AD_CpG/ROSMAP_arrayMethylation_imputed.tsv'
pos_file = home+'data/AD_CpG/ROSMAP_arrayMethylation_metaData.tsv'
all_sites_betas = cal_beta(beta_file,pos_file)

In [4]:
all_sites_betas

Unnamed: 0_level_0,beta,chr,coordinate
TargetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg00000165,0.154822,1,91194674
cg00000363,0.131544,1,230560793
cg00000957,0.779174,1,5937253
cg00001349,0.893716,1,166958439
cg00001364,0.727746,1,214170376
cg00001446,0.843326,1,43831041
cg00001534,0.882839,1,51034865
cg00001583,0.073302,1,200011786
cg00001593,0.927707,1,170490434
cg00002028,0.023826,1,20960010


In [5]:
pos_pvalue = 0.0001 ### 0.001 for amyloid, 0.0001 for cerad, 0.00001 for tangles
neg_pvalue = 0.3
sample_ratio_neg_to_pos = 10

In [6]:
type_name = commons.type_name  ## amyloid, cerad, tangles
with_cell_type = commons.with_cell_type ## with or without
all_sites_file = home+'data/AD_CpG/Rosmap_'+type_name+'_ewas_'+with_cell_type+'celltype.csv'
all_sites = pd.read_csv(all_sites_file,usecols=[1,2,3],header=None,skiprows=1,index_col=0,names=['id','beta_sign','pvalue'])
all_sites = all_sites.join(all_sites_betas).dropna()
all_sites.reset_index(inplace=True)
temp = pd.DataFrame()
temp['id'],temp['chr'],temp['coordinate'],temp['beta_sign'],temp['pvalue'],temp['beta'] = all_sites['id'],all_sites['chr'],all_sites['coordinate'],all_sites['beta_sign'],all_sites['pvalue'],all_sites['beta']
all_sites = temp
all_sites['chr'] = all_sites['chr'].astype('i8')
all_sites.sort_values(['pvalue'],inplace=True,ascending=True)
positive_sites = all_sites.query('pvalue<=@pos_pvalue')
positive_sites['label'] = np.where(positive_sites['beta_sign']>0,1,-1)
negative_sites = all_sites.query('pvalue>@neg_pvalue')
negative_sites['label'] = 0
negatives_sort_by_beta = negative_sites.sort_values(['beta'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
positive_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label
0,cg11823178,8,41519399.0,-6.164209,1.470000e-09,0.919399,-1
1,cg05810363,17,74475270.0,-5.673507,2.380000e-08,0.935675,-1
2,cg23968456,10,73521631.0,-5.634362,2.950000e-08,0.968990,-1
3,cg13390284,1,65531864.0,-5.218001,2.660000e-07,0.128000,-1
4,cg14622549,12,132549292.0,-5.217150,2.670000e-07,0.890136,-1
5,cg12309456,17,74475402.0,-5.209013,2.790000e-07,0.938982,-1
6,cg05066959,8,41519308.0,-5.205684,2.840000e-07,0.865159,-1
7,cg13076843,17,74475294.0,-5.195748,2.980000e-07,0.884712,-1
8,cg25018458,17,980014.0,-5.088175,5.140000e-07,0.960097,-1
9,cg25285237,1,43296491.0,-5.057133,6.000000e-07,0.897580,-1


In [8]:
negative_sites

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label
146360,cg23470227,17,73824760.0,-1.037507,0.300005,0.770308,0
146361,cg12291059,18,19997924.0,-1.037507,0.300005,0.644726,0
146362,cg19641404,2,232826272.0,-1.037497,0.300010,0.016544,0
146363,cg04790761,1,206858062.0,-1.037496,0.300010,0.049845,0
146364,cg14777601,19,51612175.0,-1.037487,0.300014,0.041893,0
146365,cg08031955,1,109655406.0,-1.037477,0.300019,0.058474,0
146366,cg21035755,4,26585721.0,-1.037471,0.300022,0.060220,0
146367,cg12157788,7,4389134.0,-1.037470,0.300023,0.792992,0
146368,cg18156192,1,112320833.0,-1.037468,0.300023,0.799977,0
146369,cg15689180,1,23897764.0,1.037468,0.300024,0.931430,0


In [12]:
select_negs_list = []
hyper_sites = negatives_sort_by_beta.query('beta_sign>=0')
hypo_sites = negatives_sort_by_beta.query('beta_sign<0')
for beta,beta_sign in positive_sites[['beta','beta_sign']].values:
    tmp_sites = hyper_sites if beta_sign >=0 else hypo_sites
    neg_ix = tmp_sites['beta'].searchsorted(beta)[0]    
    negs = tmp_sites.iloc[neg_ix-int(sample_ratio_neg_to_pos/2):np.minimum(neg_ix+int(sample_ratio_neg_to_pos/2),len(negatives_sort_by_beta)),:]
    select_negs_list.extend(negs.values)
select_negs = pd.DataFrame(select_negs_list,columns=['id','chr','coordinate','beta_sign','pvalue','beta','label'])

In [13]:
win_path = home+'wins.txt'
pos_sites_with_winid, neg_sites_with_winid = commons.merge_with_feature_windows(win_path,positive_sites,select_negs)
all_sites_with_winid = pos_sites_with_winid.append(neg_sites_with_winid,ignore_index=True)
all_sites_with_winid.drop_duplicates(['id'],inplace=True)
all_sites_with_winid.sort_values(['chr','coordinate'],inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['chr'] = data['chr'].astype('i8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  feature_wins.sort_values(['chr','start'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  pos_sites.sort_values(['chr','coordinate'],inplace=Tru

In [16]:
#export all features to file
with pd.HDFStore(home+'data/AD_CpG/'+type_name+with_cell_type+'/all_sites_winid','w') as h5s:
    h5s['all_sites_winid'] = all_sites_with_winid       
all_sites_with_winid.to_csv(home+'data/AD_CpG/'+type_name+with_cell_type+'/all_sites_winid.csv',index=False)  
all_sites_with_winid['winid'].to_csv(home+'data/AD_CpG/'+type_name+with_cell_type+'/selected_pos_winid.csv',index=False)

In [14]:
all_sites_with_winid

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,start,winid,end
123,cg08750554,1,1005100.0,0.460842,6.451141e-01,0.221929,0,1005001,5026,1005200
124,cg13362546,1,1102960.0,0.582537,5.604699e-01,0.780087,0,1102801,5515,1103000
0,cg22373622,1,1489544.0,4.490939,8.830000e-06,0.918745,1,1489401,7448,1489600
1,cg16926213,1,1841314.0,5.065649,5.750000e-07,0.372401,1,1841201,9207,1841400
125,cg12656307,1,2349734.0,0.861177,3.895565e-01,0.401944,0,2349601,11749,2349800
126,cg00373616,1,2434487.0,0.220757,8.253727e-01,0.769612,0,2434401,12173,2434600
127,cg25618424,1,2989307.0,0.507053,6.123429e-01,0.509518,0,2989201,14947,2989400
128,cg09119863,1,3129238.0,0.435539,6.633605e-01,0.697770,0,3129201,15647,3129400
129,cg01079872,1,3302477.0,0.678813,4.975729e-01,0.904952,0,3302401,16513,3302600
130,cg00571809,1,3397113.0,0.234530,8.146707e-01,0.947405,0,3397001,16986,3397200


In [15]:
positives = commons.sample_weights(all_sites_with_winid,all_sites_with_winid['label'],factor=1.5)

91.71966811108618