In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from genomegenie.io import to_df, to_arrow, to_dask_df

In [19]:
# Widen the column display limit so the merged frames below are shown in full.
pd.options.display.max_columns = 100

In [2]:
# Input VCF paths for Sample_1, one per variant-caller output file.
vfname1 = "variant_calls/Sample_1_MuSE.vcf"
vfname2 = "variant_calls/Sample_1_Mutect2.vcf"
# Strelka output comes as two files: indels (3a) and SNVs (3b).
vfname3a, vfname3b = (
    "variant_calls/Sample_1_Strelka.somatic.indels.vcf",
    "variant_calls/Sample_1_Strelka.somatic.snvs.vcf",
)

In [5]:
# Load the two Strelka VCFs (indels, then SNVs) through the same
# to_df -> to_dask_df pipeline with identical options.
# NOTE(review): fields="*" presumably requests every available field and
# samples=[0, 1] the first two sample columns -- confirm against genomegenie.io.
ddf3a, ddf3b = (
    to_dask_df(to_df(vcf_path, fields="*", samples=[0, 1]))
    for vcf_path in (vfname3a, vfname3b)
)

In [6]:
# Show the column index of the indel frame and the SNV frame side by side.
tuple(frame.columns for frame in (ddf3a, ddf3b))

(Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'numalt', 'altlen', 'is_snp',
        'QUAL', 'FILTER_HighDepth', 'FILTER_LowDepth', 'FILTER_LowEVS',
        'FILTER_PASS', 'IC', 'IHP', 'MQ', 'MQ0', 'NT', 'OVERLAP', 'QSI',
        'QSI_NT', 'RC', 'RU', 'SGT', 'SOMATIC', 'SomaticEVS', 'TQSI', 'TQSI_NT',
        'NORMAL_BCN50', 'TUMOR_BCN50', 'NORMAL_DP', 'TUMOR_DP', 'NORMAL_DP2',
        'TUMOR_DP2', 'NORMAL_DP50', 'TUMOR_DP50', 'NORMAL_FDP50', 'TUMOR_FDP50',
        'NORMAL_SUBDP50', 'TUMOR_SUBDP50', 'NORMAL_TAR', 'TUMOR_TAR',
        'NORMAL_TIR', 'TUMOR_TIR', 'NORMAL_TOR', 'TUMOR_TOR'],
       dtype='object'),
 Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'numalt', 'altlen', 'is_snp',
        'QUAL', 'FILTER_LowDepth', 'FILTER_LowEVS', 'FILTER_PASS', 'DP', 'MQ',
        'MQ0', 'NT', 'PNOISE', 'PNOISE2', 'QSS', 'QSS_NT', 'ReadPosRankSum',
        'SGT', 'SNVSB', 'SOMATIC', 'SomaticEVS', 'TQSS', 'TQSS_NT', 'NORMAL_AU',
        'TUMOR_AU', 'NORMAL_CU', 'TUMOR_CU', 'NORMAL_DP', 'TUMOR_DP',
   

In [7]:
# Column names that appear in both the indel and the SNV frame.
set(ddf3a.columns) & set(ddf3b.columns)

{'ALT',
 'CHROM',
 'FILTER_LowDepth',
 'FILTER_LowEVS',
 'FILTER_PASS',
 'ID',
 'MQ',
 'MQ0',
 'NORMAL_DP',
 'NT',
 'POS',
 'QUAL',
 'REF',
 'SGT',
 'SOMATIC',
 'SomaticEVS',
 'TUMOR_DP',
 'altlen',
 'is_snp',
 'numalt'}

In [14]:
# Key columns used to join the indel and SNV frames. This is a subset of the
# shared columns; the remaining shared columns (ALT, altlen, FILTER_*, MQ, ...)
# are left out so they are kept per-source by the merge.
# NOTE(review): is_snp is part of the key; in the rows displayed below it is
# False for indel rows and True for SNV rows, so key matches between the two
# frames may never occur -- confirm this is intended.
common = [
    'CHROM',
    'POS',
    'ID',
    'REF',
    'numalt',
    'QUAL',
    'is_snp',
]

In [25]:
# Outer-join the indel frame (left) with the SNV frame (right) on the key
# columns in `common`. Shared non-key columns are disambiguated with the
# "_indel" / "_snv" suffixes, and indicator=True adds a `_merge` column that
# records which side each row came from.
ddf3_1 = ddf3a.merge(
    ddf3b,
    how="outer",
    on=common,
    suffixes=("_indel", "_snv"),
    indicator=True,
)

In [26]:
# Evaluate the lazy dask merge into an in-memory result, then preview the
# first ten rows.
df3_1 = ddf3_1.compute()
df3_1.iloc[:10]

Unnamed: 0,CHROM,POS,ID,REF,ALT_indel,numalt,altlen_indel,is_snp,QUAL,FILTER_HighDepth,FILTER_LowDepth_indel,FILTER_LowEVS_indel,FILTER_PASS_indel,IC,IHP,MQ_indel,MQ0_indel,NT_indel,OVERLAP,QSI,QSI_NT,RC,RU,SGT_indel,SOMATIC_indel,SomaticEVS_indel,TQSI,TQSI_NT,NORMAL_BCN50,TUMOR_BCN50,NORMAL_DP_indel,TUMOR_DP_indel,NORMAL_DP2,TUMOR_DP2,NORMAL_DP50,TUMOR_DP50,NORMAL_FDP50,TUMOR_FDP50,NORMAL_SUBDP50,TUMOR_SUBDP50,NORMAL_TAR,TUMOR_TAR,NORMAL_TIR,TUMOR_TIR,NORMAL_TOR,TUMOR_TOR,ALT_snv,altlen_snv,FILTER_LowDepth_snv,FILTER_LowEVS_snv,FILTER_PASS_snv,DP,MQ_snv,MQ0_snv,NT_snv,PNOISE,PNOISE2,QSS,QSS_NT,ReadPosRankSum,SGT_snv,SNVSB,SOMATIC_snv,SomaticEVS_snv,TQSS,TQSS_NT,NORMAL_AU,TUMOR_AU,NORMAL_CU,TUMOR_CU,NORMAL_DP_snv,TUMOR_DP_snv,NORMAL_FDP,TUMOR_FDP,NORMAL_GU,TUMOR_GU,NORMAL_SDP,TUMOR_SDP,NORMAL_SUBDP,TUMOR_SUBDP,NORMAL_TU,TUMOR_TU,_merge
0,chr1,1326471,.,GA,"(G, , )",1,"(-1, 0, 0)",False,,False,False,False,True,10.0,12.0,60.0,0.0,ref,False,31.0,31.0,11.0,A,ref->het,True,9.07,1.0,1.0,0.0,0.0,17.0,19.0,17.0,19.0,20.639999,22.469999,0.0,0.0,0.0,0.0,"(16, 16)","(14, 14)","(0, 0)","(4, 4)","(1, 1)","(1, 1)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,chr1,12853972,.,A,"(ACATG, , )",1,"(4, 0, 0)",False,,True,False,True,False,1.0,2.0,50.470001,0.0,ref,False,27.0,27.0,0.0,CATG,ref->het,True,0.71,2.0,2.0,0.0,0.0,37.0,60.0,37.0,60.0,38.25,55.830002,0.22,0.31,0.0,0.0,"(32, 33)","(41, 41)","(2, 2)","(9, 11)","(1, 0)","(10, 8)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
2,chr1,12891096,.,AG,"(A, , )",1,"(-1, 0, 0)",False,,True,False,True,False,0.0,10.0,57.380001,0.0,ref,False,1.0,1.0,1.0,G,ref->ref,True,0.05,1.0,1.0,0.0,0.0,56.0,42.0,56.0,42.0,51.580002,40.849998,0.27,0.0,0.0,0.0,"(50, 50)","(36, 36)","(6, 6)","(4, 4)","(0, 0)","(3, 3)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
3,chr1,16891120,.,AATG,"(A, , )",1,"(-3, 0, 0)",False,,True,False,False,False,0.0,3.0,42.0,0.0,ref,False,44.0,44.0,1.0,ATG,ref->het,True,11.4,1.0,1.0,0.0,0.0,40.0,42.0,40.0,42.0,43.400002,44.389999,0.0,0.0,0.0,0.0,"(42, 42)","(37, 37)","(0, 0)","(4, 4)","(0, 0)","(3, 3)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
4,chr1,16944723,.,C,"(CA, , )",1,"(1, 0, 0)",False,,False,False,True,False,3.0,2.0,60.0,0.0,ref,False,1.0,1.0,2.0,A,het->het,True,0.1,1.0,1.0,0.0,0.0,18.0,26.0,18.0,26.0,22.02,30.35,0.0,0.0,0.0,0.0,"(15, 15)","(19, 19)","(3, 3)","(4, 4)","(0, 0)","(4, 4)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
5,chr1,16975912,.,CCCTGTT,"(C, , )",1,"(-6, 0, 0)",False,,False,False,True,False,0.0,4.0,41.77,0.0,ref,False,1.0,1.0,1.0,CCTGTT,het->het,True,0.05,1.0,1.0,0.0,0.0,10.0,40.0,10.0,40.0,11.96,42.209999,0.0,0.0,0.0,0.0,"(9, 9)","(33, 33)","(2, 2)","(7, 7)","(0, 0)","(4, 4)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
6,chr1,17354373,.,GGAA,"(G, , )",1,"(-3, 0, 0)",False,,True,False,True,False,7.0,4.0,60.0,0.0,ref,False,33.0,33.0,8.0,GAA,ref->het,True,3.94,1.0,1.0,0.0,0.0,50.0,43.0,50.0,43.0,49.189999,39.099998,0.0,0.14,0.0,0.0,"(32, 32)","(25, 25)","(1, 1)","(6, 6)","(15, 15)","(12, 12)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
7,chr1,21133496,.,CT,"(C, , )",1,"(-1, 0, 0)",False,,True,False,True,False,12.0,13.0,60.130001,0.0,ref,False,4.0,4.0,13.0,T,ref->het,True,0.22,1.0,1.0,0.0,0.0,55.0,31.0,55.0,31.0,58.310001,31.360001,0.0,0.0,0.0,0.0,"(41, 41)","(20, 20)","(6, 6)","(4, 4)","(8, 8)","(6, 6)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
8,chr1,28149666,.,CTTG,"(C, , )",1,"(-3, 0, 0)",False,,True,False,True,False,6.0,4.0,60.0,0.0,ref,False,24.0,24.0,7.0,TTG,ref->het,True,3.32,1.0,1.0,0.0,0.0,46.0,36.0,46.0,36.0,46.48,32.02,0.0,0.0,0.0,0.0,"(31, 31)","(19, 19)","(0, 0)","(3, 3)","(16, 16)","(16, 16)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
9,chr1,31732677,.,GA,"(G, , )",1,"(-1, 0, 0)",False,,False,False,True,False,10.0,16.0,60.0,0.0,ref,False,1.0,1.0,11.0,A,het->het,True,1.11,1.0,1.0,0.0,0.0,15.0,20.0,15.0,20.0,15.75,18.27,0.0,0.21,0.0,0.0,"(12, 12)","(14, 14)","(1, 1)","(5, 5)","(2, 2)","(1, 1)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only


In [27]:
# Draw 50 random rows from the merged frame to spot-check rows beyond the head.
# A fixed random_state makes the draw repeatable, so a Restart-and-Run-All
# shows the same rows every time instead of a fresh random subset.
df3_1.sample(50, random_state=0)

Unnamed: 0,CHROM,POS,ID,REF,ALT_indel,numalt,altlen_indel,is_snp,QUAL,FILTER_HighDepth,FILTER_LowDepth_indel,FILTER_LowEVS_indel,FILTER_PASS_indel,IC,IHP,MQ_indel,MQ0_indel,NT_indel,OVERLAP,QSI,QSI_NT,RC,RU,SGT_indel,SOMATIC_indel,SomaticEVS_indel,TQSI,TQSI_NT,NORMAL_BCN50,TUMOR_BCN50,NORMAL_DP_indel,TUMOR_DP_indel,NORMAL_DP2,TUMOR_DP2,NORMAL_DP50,TUMOR_DP50,NORMAL_FDP50,TUMOR_FDP50,NORMAL_SUBDP50,TUMOR_SUBDP50,NORMAL_TAR,TUMOR_TAR,NORMAL_TIR,TUMOR_TIR,NORMAL_TOR,TUMOR_TOR,ALT_snv,altlen_snv,FILTER_LowDepth_snv,FILTER_LowEVS_snv,FILTER_PASS_snv,DP,MQ_snv,MQ0_snv,NT_snv,PNOISE,PNOISE2,QSS,QSS_NT,ReadPosRankSum,SGT_snv,SNVSB,SOMATIC_snv,SomaticEVS_snv,TQSS,TQSS_NT,NORMAL_AU,TUMOR_AU,NORMAL_CU,TUMOR_CU,NORMAL_DP_snv,TUMOR_DP_snv,NORMAL_FDP,TUMOR_FDP,NORMAL_GU,TUMOR_GU,NORMAL_SDP,TUMOR_SDP,NORMAL_SUBDP,TUMOR_SUBDP,NORMAL_TU,TUMOR_TU,_merge
14301,chr9,67302979,.,C,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(A, , )","(0, 0, 0)",False,True,False,58.0,50.639999,0.0,het,,,1.0,1.0,0.22,AC->AC,0.0,True,0.0,1.0,1.0,"(9, 9)","(3, 3)","(21, 21)","(25, 25)",30.0,28.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
17105,chr11,64323539,.,C,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(A, , )","(0, 0, 0)",False,True,False,30.0,60.0,0.0,ref,,,2.0,2.0,0.13,CC->CC,0.0,True,0.98,1.0,1.0,"(0, 0)","(2, 2)","(10, 10)","(18, 18)",10.0,20.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
24316,chr17,40556145,.,C,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(A, , )","(0, 0, 0)",False,True,False,67.0,60.0,0.0,ref,,,1.0,1.0,-0.29,CC->CC,0.0,True,0.94,1.0,1.0,"(0, 0)","(2, 2)","(33, 33)","(32, 32)",33.0,34.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
28484,chr22,32113099,.,C,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(A, , )","(0, 0, 0)",False,True,False,64.0,60.0,0.0,ref,,,1.0,1.0,1.2,CC->CC,0.93,True,0.81,1.0,1.0,"(0, 0)","(2, 2)","(35, 35)","(27, 27)",35.0,29.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
9109,chr5,68716255,.,T,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(C, , )","(0, 0, 0)",False,True,False,95.0,60.0,0.0,ref,,,1.0,1.0,-0.51,TT->TT,0.0,True,1.07,1.0,1.0,"(0, 0)","(0, 0)","(0, 0)","(2, 2)",61.0,34.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(61, 61)","(32, 32)",right_only
929,chr1,16894612,.,T,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(G, , )","(0, 0, 0)",False,True,False,317.0,50.400002,0.0,ref,,,60.0,60.0,0.1,TT->GT,0.0,True,3.92,1.0,1.0,"(0, 0)","(0, 0)","(0, 0)","(0, 0)",154.0,163.0,0.0,0.0,"(1, 1)","(8, 8)",0.0,0.0,0.0,0.0,"(153, 153)","(155, 155)",right_only
5684,chr2,223183575,.,G,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(T, , )","(0, 0, 0)",False,True,False,4.0,60.0,0.0,ref,,,2.0,2.0,0.0,GG->GG,0.0,True,1.38,1.0,1.0,"(0, 0)","(0, 0)","(0, 0)","(0, 0)",2.0,2.0,0.0,0.0,"(2, 2)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(2, 2)",right_only
25284,chr18,31663104,.,A,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(G, , )","(0, 0, 0)",True,True,False,2.0,60.0,0.0,ref,,,1.0,1.0,0.0,AA->AA,0.0,True,0.79,1.0,1.0,"(1, 1)","(0, 0)","(0, 0)","(0, 0)",1.0,1.0,0.0,0.0,"(0, 0)","(1, 1)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
10716,chr6,54636234,.,C,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(A, , )","(0, 0, 0)",False,True,False,44.0,60.0,0.0,ref,,,2.0,2.0,0.38,CC->CC,0.0,True,0.92,1.0,1.0,"(0, 0)","(2, 2)","(21, 21)","(21, 21)",21.0,23.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(0, 0)","(0, 0)",right_only
19532,chr13,19756073,.,T,,1,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(C, , )","(0, 0, 0)",False,True,False,8.0,60.0,0.0,ref,,,2.0,2.0,-0.5,TT->TT,0.0,True,1.5,1.0,1.0,"(0, 0)","(0, 0)","(0, 0)","(6, 6)",1.0,7.0,0.0,0.0,"(0, 0)","(0, 0)",0.0,0.0,0.0,0.0,"(1, 1)","(1, 1)",right_only
