In [1]:
import os
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global random number generator so any random draws are repeatable.
np.random.seed(100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

import matplotlib as mpl
# Figure background as an RGBA tuple: (1,1,1,1) is fully opaque white.
mpl.rcParams['figure.facecolor'] = (1,1,1,1)
# Font type 42 selects TrueType output for PDF/PS exports
# (matplotlib's documented alternative is Type 3).
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

# Show up to 100 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 100)

In [2]:
# Load the reference atlas: a 'CpGs' column of array probe IDs (cg...) followed by
# one numeric column per cell type / tissue.
# NOTE(review): values look like methylation fractions in [0, 1] — confirm.
df_ref = pd.read_csv('reference_atlas.csv')
df_ref.head()

Unnamed: 0,CpGs,Monocytes_EPIC,B-cells_EPIC,CD4T-cells_EPIC,NK-cells_EPIC,CD8T-cells_EPIC,Neutrophils_EPIC,Erythrocyte_progenitors,Adipocytes,Cortical_neurons,Hepatocytes,Lung_cells,Pancreatic_beta_cells,Pancreatic_acinar_cells,Pancreatic_duct_cells,Vascular_endothelial_cells,Colon_epithelial_cells,Left_atrium,Bladder,Breast,Head_and_neck_larynx,Kidney,Prostate,Thyroid,Upper_GI,Uterus_cervix
0,cg08169020,0.8866,0.2615,0.0149,0.0777,0.0164,0.868,0.9509,0.0336,0.0168,0.034,0.0416,0.038875,0.0209,0.013,0.0323,0.0163,0.0386,0.0462,0.0264,0.047,0.0269,0.0353,0.0553,0.0701,0.0344
1,cg25913761,0.8363,0.221,0.2816,0.4705,0.3961,0.8293,0.2385,0.3578,0.3104,0.2389,0.225,0.132,0.2249,0.1996,0.3654,0.2037,0.2446,0.2054,0.1922,0.2045,0.1596,0.1557,0.1848,0.168,0.2026
2,cg26955540,0.7658,0.0222,0.1492,0.4005,0.3474,0.7915,0.1374,0.1965,0.0978,0.0338,0.0768,0.041725,0.0314,0.0139,0.2382,0.0193,0.1134,0.1269,0.1651,0.1523,0.1034,0.0686,0.0943,0.1298,0.1075
3,cg25170017,0.8861,0.5116,0.1021,0.4363,0.0875,0.7042,0.9447,0.0842,0.2832,0.2259,0.0544,0.11175,0.0309,0.0217,0.0972,0.0187,0.0674,0.0769,0.0691,0.0704,0.0604,0.0369,0.0412,0.0924,0.0697
4,cg12827637,0.5212,0.3614,0.0227,0.212,0.0225,0.5368,0.4667,0.0287,0.1368,0.0307,0.1607,0.065975,0.037,0.023,0.0798,0.0193,0.0432,0.0459,0.0228,0.0687,0.0234,0.0508,0.0726,0.0759,0.0196


In [3]:
# Load the example samples: a 'CpGs' column of array probe IDs (cg...) followed by
# one numeric column per sample.
# NOTE(review): sample names suggest synthetic mixtures (e.g. 90% monocytes /
# 10% breast) — confirm against the data source.
df_example = pd.read_csv('examples.csv')
df_example.head()

Unnamed: 0,CpGs,Monocytes90_Breast10,Adipo60Kidney30Prost10,Eryth50Lung25GI25
0,cg08169020,0.80058,0.03176,0.503375
1,cg25913761,0.77189,0.27813,0.2175
2,cg26955540,0.70573,0.15578,0.12035
3,cg25170017,0.8044,0.07233,0.50905
4,cg12827637,0.47136,0.02932,0.2925


In [4]:
# Build the probe-ID -> genomic-position table from the EPIC hg38 manifest in a
# single chain: download, drop probes without CpG coordinates, then derive
#   pos   : integer midpoint of CpG_beg / CpG_end, as a string
#   posID : "<chromosome>:<pos>"
#   CpGs  : copy of probeID, so the table shares a key name with the atlas/sample frames
df_EPIC = (
    pd.read_csv(
        'http://zwdzwd.io/InfiniumAnnotation/current/EPIC/EPIC.hg38.manifest.tsv.gz',
        sep='\t',
    )
    .dropna(subset=['CpG_beg', 'CpG_end'])
    .assign(
        pos=lambda manifest: (
            manifest[['CpG_beg', 'CpG_end']].mean(axis=1).astype(int).astype(str)
        ),
        # Keyword arguments to assign() are evaluated in order, so 'pos' is
        # already available when 'posID' is computed.
        posID=lambda manifest: manifest['CpG_chrm'] + ':' + manifest['pos'],
        CpGs=lambda manifest: manifest['probeID'],
    )
)
df_EPIC.head()

Unnamed: 0,CpG_chrm,CpG_beg,CpG_end,probe_strand,probeID,address_A,address_B,channel,designType,nextBase,nextBaseRef,probeType,orientation,probeCpGcnt,context35,probeBeg,probeEnd,ProbeSeq_A,ProbeSeq_B,gene,gene_HGNC,chrm_A,beg_A,flag_A,mapQ_A,cigar_A,NM_A,chrm_B,beg_B,flag_B,mapQ_B,cigar_B,NM_B,wDecoy_chrm_A,wDecoy_beg_A,wDecoy_flag_A,wDecoy_mapQ_A,wDecoy_cigar_A,wDecoy_NM_A,wDecoy_chrm_B,wDecoy_beg_B,wDecoy_flag_B,wDecoy_mapQ_B,wDecoy_cigar_B,wDecoy_NM_B,posMatch,MASK_mapping,MASK_typeINextBaseSwitch,MASK_rmsk15,MASK_sub40_copy,MASK_sub35_copy,MASK_sub30_copy,MASK_sub25_copy,MASK_snp5_common,MASK_snp5_GMAF1p,MASK_extBase,MASK_general,pos,posID,CpGs
0,chr1,10524.0,10526.0,-,cg14817997,21611527,,Both,II,G/A,C,cg,down,3.0,4.0,10526,10575.0,AAACRAAACTACRTTATCCTCTACACAAATTTCRATAATACTCTAA...,,,,chr1,10526.0,16,0,50M,0.0,,,,,,,chr1,10526.0,16,0,50M,0.0,,,,,,,,True,False,True,True,True,True,True,False,False,False,True,10525,chr1:10525,cg14817997
1,chr1,10847.0,10849.0,+,cg26928153,91693541,47784201.0,Grn,I,C,G,cg,up,7.0,12.0,10800,10849.0,ACACATACTAACACATCAAAATAAAAACATAACACAAACACAAAAA...,ACACATACTAACGCGTCGAAATAAAAACGTAACGCAAACGCAAAAA...,DDX11L1,DDX11L1,chr1,10800.0,0,22,50M,0.0,chr1,10800.0,0.0,22.0,50M,0.0,chr1,10800.0,0,20,50M,0.0,chr1,10800.0,0.0,20.0,50M,0.0,,True,False,True,False,False,True,True,False,False,False,True,10848,chr1:10848,cg26928153
2,chr1,10849.0,10851.0,+,cg16269199,82663207,3701821.0,Grn,I,C,G,cg,up,8.0,12.0,10802,10851.0,ACATACTAACACATCAAAATAAAAACATAACACAAACACAAAAAAA...,ACATACTAACGCGTCGAAATAAAAACGTAACGCAAACGCAAAAAAA...,DDX11L1,DDX11L1,chr1,10802.0,0,22,50M,0.0,chr1,10802.0,0.0,22.0,50M,0.0,chr1,10802.0,0,20,50M,0.0,chr1,10802.0,0.0,20.0,50M,0.0,,True,False,True,False,True,True,True,False,False,False,True,10850,chr1:10850,cg16269199
3,chr1,15864.0,15866.0,-,cg13869341,2665852,39757192.0,Red,I,A,C,cg,down,2.0,4.0,15865,15914.0,CCAATAACTAACCACTCTACTAAAATCCATCCACCAAACTAAAAAC...,CCGATAACTAACCACTCTACTAAAATCCATCCGCCAAACTAAAAAC...,WASH7P,WASH7P,chr1,15865.0,16,17,50M,0.0,chr1,15865.0,16.0,17.0,50M,0.0,chr1,15865.0,16,15,50M,0.0,chr1,15865.0,16.0,15.0,50M,0.0,,True,False,False,True,True,True,True,False,False,False,True,15865,chr1:15865,cg13869341
4,chr1,18826.0,18828.0,-,cg14008030,84794291,,Both,II,G/A,C,cg,down,2.0,3.0,18828,18877.0,ACTCRAAATTTACTCAATAAACCRTTCAATATATACAAAAACAATT...,,MIR6859-3;WASH7P,MIR6859-3;WASH7P,chr1,18828.0,16,1,50M,0.0,,,,,,,chr1,18828.0,16,0,50M,0.0,,,,,,,,True,False,False,False,False,False,False,False,False,False,True,18827,chr1:18827,cg14008030


In [5]:
# Convert the reference atlas from array probe IDs (cg...) to position IDs (chr:pos).
df_ref_pos = df_ref.copy()
print("original shape : ", df_ref_pos.shape)

# Probe ID -> position ID lookup taken from the manifest. Identical repeated
# (probe, position) rows are collapsed; a probe that maps to more than one
# position is ambiguous and is reported instead of being picked arbitrarily.
ref_probe_to_pos = (
    df_EPIC[['CpGs', 'posID']]
    .drop_duplicates()
    .set_index('CpGs')['posID']
)
if not ref_probe_to_pos.index.is_unique:
    raise ValueError("EPIC manifest maps at least one probe ID to multiple positions")

# Series.map looks each probe up by value, so every row receives its own
# position. (Assigning a column taken from a separate merge result relies on the
# two frames sharing an index, and silently shifts labels onto the wrong rows if
# the merge ever returns a different number of rows.)
df_ref_pos['CpGs'] = df_ref_pos['CpGs'].map(ref_probe_to_pos)

# Probes absent from the manifest are NaN at this point and are removed here;
# as before, dropna() also removes rows with a missing value in any other column.
df_ref_pos = df_ref_pos.dropna()
print("after conversion shape : ", df_ref_pos.shape)

# index=False: do not write the row index as an extra CSV column.
df_ref_pos.to_csv('reference_atlas.pos.csv', index=False)

original shape :  (7890, 26)
after conversion shape :  (7887, 26)


In [6]:
# Preview the converted atlas: 'CpGs' should now hold chr:pos IDs instead of cg probe IDs.
df_ref_pos.head()

Unnamed: 0,CpGs,Monocytes_EPIC,B-cells_EPIC,CD4T-cells_EPIC,NK-cells_EPIC,CD8T-cells_EPIC,Neutrophils_EPIC,Erythrocyte_progenitors,Adipocytes,Cortical_neurons,Hepatocytes,Lung_cells,Pancreatic_beta_cells,Pancreatic_acinar_cells,Pancreatic_duct_cells,Vascular_endothelial_cells,Colon_epithelial_cells,Left_atrium,Bladder,Breast,Head_and_neck_larynx,Kidney,Prostate,Thyroid,Upper_GI,Uterus_cervix
0,chr14:68790171,0.8866,0.2615,0.0149,0.0777,0.0164,0.868,0.9509,0.0336,0.0168,0.034,0.0416,0.038875,0.0209,0.013,0.0323,0.0163,0.0386,0.0462,0.0264,0.047,0.0269,0.0353,0.0553,0.0701,0.0344
1,chr15:90184328,0.8363,0.221,0.2816,0.4705,0.3961,0.8293,0.2385,0.3578,0.3104,0.2389,0.225,0.132,0.2249,0.1996,0.3654,0.2037,0.2446,0.2054,0.1922,0.2045,0.1596,0.1557,0.1848,0.168,0.2026
2,chr15:90184338,0.7658,0.0222,0.1492,0.4005,0.3474,0.7915,0.1374,0.1965,0.0978,0.0338,0.0768,0.041725,0.0314,0.0139,0.2382,0.0193,0.1134,0.1269,0.1651,0.1523,0.1034,0.0686,0.0943,0.1298,0.1075
3,chr11:64877015,0.8861,0.5116,0.1021,0.4363,0.0875,0.7042,0.9447,0.0842,0.2832,0.2259,0.0544,0.11175,0.0309,0.0217,0.0972,0.0187,0.0674,0.0769,0.0691,0.0704,0.0604,0.0369,0.0412,0.0924,0.0697
4,chr14:68790074,0.5212,0.3614,0.0227,0.212,0.0225,0.5368,0.4667,0.0287,0.1368,0.0307,0.1607,0.065975,0.037,0.023,0.0798,0.0193,0.0432,0.0459,0.0228,0.0687,0.0234,0.0508,0.0726,0.0759,0.0196


In [7]:
# Convert the example samples from array probe IDs (cg...) to position IDs (chr:pos).
df_example_pos = df_example.copy()
print("original shape : ", df_example_pos.shape)

# Probe ID -> position ID lookup taken from the manifest. Identical repeated
# (probe, position) rows are collapsed; a probe that maps to more than one
# position is ambiguous and is reported instead of being picked arbitrarily.
example_probe_to_pos = (
    df_EPIC[['CpGs', 'posID']]
    .drop_duplicates()
    .set_index('CpGs')['posID']
)
if not example_probe_to_pos.index.is_unique:
    raise ValueError("EPIC manifest maps at least one probe ID to multiple positions")

# Series.map looks each probe up by value, so every row receives its own
# position. (Assigning a column taken from a separate merge result relies on the
# two frames sharing an index, and silently shifts labels onto the wrong rows if
# the merge ever returns a different number of rows.)
df_example_pos['CpGs'] = df_example_pos['CpGs'].map(example_probe_to_pos)

# Probes absent from the manifest are NaN at this point and are removed here;
# as before, dropna() also removes rows with a missing value in any other column.
df_example_pos = df_example_pos.dropna()
print("after conversion shape : ", df_example_pos.shape)

# index=False: do not write the row index as an extra CSV column.
df_example_pos.to_csv('examples.pos.csv', index=False)

original shape :  (7890, 4)
after conversion shape :  (7887, 4)


In [8]:
# Preview the converted samples: 'CpGs' should now hold chr:pos IDs instead of cg probe IDs.
df_example_pos.head()

Unnamed: 0,CpGs,Monocytes90_Breast10,Adipo60Kidney30Prost10,Eryth50Lung25GI25
0,chr14:68790171,0.80058,0.03176,0.503375
1,chr15:90184328,0.77189,0.27813,0.2175
2,chr15:90184338,0.70573,0.15578,0.12035
3,chr11:64877015,0.8044,0.07233,0.50905
4,chr14:68790074,0.47136,0.02932,0.2925


In [9]:
# Run deconvolve.py (expected in the working directory) on the position-keyed
# files written by the earlier cells: the atlas is passed with -a and the
# samples file as the positional argument.
# NOTE(review): --wgbs presumably tells the script that IDs are genomic positions
# rather than array probe IDs — confirm against deconvolve.py.
# NOTE(review): the recorded output reports 6102 sites per sample although 7887
# rows were written — confirm which sites the script discards.
! python deconvolve.py --wgbs -a reference_atlas.pos.csv examples.pos.csv

Monocytes90_Breast10: 6102 sites
Adipo60Kidney30Prost10: 6102 sites
Eryth50Lung25GI25: 6102 sites
