# Creating a Functional Annotation Back-end Database from a Truncated Chromosome 22 File

In order to save memory whilst using Python and Pandas, the following code uses a smaller version of the Chr22 CADD file. The purpose of this code is primarily to prepare for when I use the full Chr22 dataset and finally create the database for the entire genome.

The "Truncated" chromosome 22 file has been shortened to 3 million rows and 4 columns. All other functional annotation files merged have not been compromised in length or size.

In [1]:
# Check Working Directory.

import os
import sys
print("wd"
     , os.getcwd())

wd /Users/patrickhallaert


In [2]:
# Set Working Directory to Hard Drive, which contains all the needed files.

os.chdir('/Volumes/HZU/CADD/hg19')
print("wd"
     , os.getcwd())

wd /Volumes/HZU/CADD/hg19


In [3]:
# Import Pandas and NumPy

import pandas as pd
import numpy as np

In [4]:
# We run the separator as ‘\t’ because in this file the tab character separates the fields.

ch22 = pd.read_csv("chr22F1-43m.txt",sep='\t', dtype = 'str')

In [5]:
# We can quickly view the data to make sure there are no issues and everything looks normal.

ch22.head()

In [6]:
print(ch22.columns.tolist())

# This first row will cause a KeyError as left_join will be unable to recognize the correct headers.

In [7]:
# Thus, we remove the "Title" Row (through the command line) to Prevent a future KeyError while Left_Joining CADD chr22 with a functional annotation dataset.

ch22nh = pd.read_csv("/Volumes/HZU/CADD/hg19/chr22F1-43mnh.txt",sep='\t', dtype ={"#Chrom": "int8"})# iterator=True, chunksize=1000)
ch22nh['Pos'] = ch22nh.Pos.astype('category')
ch22nh['Ref'] = ch22nh.Ref.astype('category')
ch22nh['Alt'] = ch22nh.Alt.astype('category')

In [8]:
# Now, we can import the ClinVar data.

clinvar = pd.read_csv('clinvar_20220528.txt', comment='#', sep='\t', dtype={0: "str", "#CHROM": "int8"})

# Additional bits (comment='#', sep='\t') needed to make the vcf file "readable" to Pandas left_join.

In [9]:
# Let's take a look:

clinvar.head()

Unnamed: 0,1,861332,1019397,G,A,.,..1,"ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.861332G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=1640863258"
0,1,861336,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
1,1,861349,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...
2,1,861356,1362713,T,C,.,.,ALLELEID=1396033;CLNDISDB=MedGen:CN517202;CLND...
3,1,861366,1568423,C,T,.,.,ALLELEID=1570515;CLNDISDB=MedGen:CN517202;CLND...
4,1,861383,1365270,C,T,.,.,ALLELEID=1502313;CLNDISDB=MedGen:CN517202;CLND...


In [10]:
# Let's add headers.

clinvar.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
clinvar['ALT'] = clinvar.ALT.astype('category')
clinvar['QUAL'] = clinvar.QUAL.astype('category')
clinvar['FILTER'] = clinvar.FILTER.astype('category')

In [11]:
# Let's take another look:

clinvar.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,861336,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
1,1,861349,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...
2,1,861356,1362713,T,C,.,.,ALLELEID=1396033;CLNDISDB=MedGen:CN517202;CLND...
3,1,861366,1568423,C,T,.,.,ALLELEID=1570515;CLNDISDB=MedGen:CN517202;CLND...
4,1,861383,1365270,C,T,.,.,ALLELEID=1502313;CLNDISDB=MedGen:CN517202;CLND...


In [12]:
clinvar.memory_usage(deep=True).sum()

875637202

In [13]:
clinvar.memory_usage(deep=True)

Index           128
#CHROM     85466112
POS        11694024
ID         11694024
REF        90355155
ALT         5899642
QUAL        1461919
FILTER      1461919
INFO      667604279
dtype: int64

In [14]:
# Now, we repeat the process for the Eigen dataset.

eigen = pd.read_csv("/Volumes/HZU/everyheaders_hg19_Eigen22.txt",sep='\t', on_bad_lines='skip', dtype={11: "str", 12: "str", 13: "str", 14: "str", 15: "str", 16: "str", "#CHROM": "int8"})
eigen.head()

Unnamed: 0,#chr,position,position.1,ref,alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,...,PhyloPla,PhyloVer,PhastPri,PhastPla,PhastVe,Consequence,Eigen-raw,Eigen-phred,Eigen-PC-raw,Eigen-PC-phred
0,22,16157306,16157306,T,A,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507,0.773534,-0.954353,0.547572
1,22,16157306,16157306,T,C,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507,0.773534,-0.954353,0.547572
2,22,16157306,16157306,T,G,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507,0.773534,-0.954353,0.547572
3,22,16157307,16157307,C,A,.,.,.,.,0.511,...,0.569,0.68,0.016,0.022,0.049,"intron_variant,non_coding_transcript_variant",-0.690256,0.829126,-0.919775,0.593548
4,22,16157307,16157307,C,G,.,.,.,.,0.511,...,0.569,0.68,0.016,0.022,0.049,"intron_variant,non_coding_transcript_variant",-0.690256,0.829126,-0.919775,0.593548


In [15]:
clinvar.memory_usage(deep=True)

Index           128
#CHROM     85466112
POS        11694024
ID         11694024
REF        90355155
ALT         5899642
QUAL        1461919
FILTER      1461919
INFO      667604279
dtype: int64

In [16]:
# Finally, the dbnsfp33a annotations:
    
dbnsfp33a = pd.read_csv("/Volumes/HZU/humandb/everyheader_hg19_dbnsfp33a22.txt",sep='\t', on_bad_lines='skip' , dtype='str')
dbnsfp33a.head()

Unnamed: 0,#chr,start,end,ref,alt,SIFT_score,SIFT_converted_rankscore,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_rankscore,...,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V6_gene,GTEx_V6_tissue
0,22,16287549,16287549,G,A,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
1,22,16287549,16287549,G,C,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
2,22,16287549,16287549,G,T,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
3,22,16287550,16287550,C,A,.,.,.,.,.,...,0.001,0.0,0.063,0.0,0.016,.,.,.,.,.
4,22,16287550,16287550,C,G,.,.,.,.,.,...,0.001,0.0,0.063,0.0,0.016,.,.,.,.,.


In [17]:
ch22eigen = pd.merge(ch22nh, eigen, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'position', 'ref', 'alt'], how="left")
ch22eigen_s = ch22eigen.astype("Sparse[str]")
ch22eigen_s = ch22eigen_s.drop(columns=['#chr', "position", "position.1", "ref", "alt"])
del ch22eigen
del eigen
del ch22nh
import gc
gc.collect()

0

In [18]:
ch22_eigen_dbn = pd.merge(ch22eigen_s, dbnsfp33a, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22eigen_dbn_s = ch22_eigen_dbn.astype("Sparse[str]")
del ch22_eigen_dbn
del dbnsfp33a
gc.collect()

0

In [19]:
ch22_eigen_dbn_clinvar = pd.merge(ch22eigen_dbn_s, clinvar, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#CHROM', 'POS', 'REF', 'ALT'], how="left")
ch22_eigen_dbn_clinvar_s = ch22_eigen_dbn_clinvar.astype("Sparse[str]")
ch22_eigen_dbn_clinvar_s = ch22_eigen_dbn_clinvar_s.drop(columns=['#CHROM', "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
del ch22_eigen_dbn_clinvar
del clinvar
gc.collect()

0

In [20]:
c_fathmm = pd.read_csv("/Volumes/HZU/humandb/hg19_fathmm_xf_codingch22headers.txt",sep='\t', dtype = 'str')
c_fathmm = c_fathmm.drop(columns="position")
c_fathmm

Unnamed: 0,#Chrom,Pos,Ref,Alt,FATHMM_XF_coding
0,22,16258189,G,A,0.006646
1,22,16258189,G,C,0.066183
2,22,16258189,G,T,0.078170
3,22,16258190,A,C,0.039113
4,22,16258190,A,G,0.040414
...,...,...,...,...,...
2232697,22,51220721,A,G,0.023020
2232698,22,51220721,A,T,0.034400
2232699,22,51220722,T,A,0.034870
2232700,22,51220722,T,C,0.033790


In [21]:
ch22_eigen_dbn_clinvar_cfathmm = pd.merge(ch22_eigen_dbn_clinvar_s, c_fathmm, on =['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_s = ch22_eigen_dbn_clinvar_cfathmm.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm
del c_fathmm
gc.collect()

0

In [22]:
exomegnomad = pd.read_csv("/Volumes/HZU/humandb/headers_hg19_gnomad211_exomech22.txt",sep='\t', dtype = 'str')
exomegnomad.head()

Unnamed: 0,#chr,start,end,ref,alt,AF,AF_popmax,AF_male,AF_female,AF_raw,...,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
0,22,16157263,16157263,C,T,0.0054,0.0132,0.0043,0.0068,0.0043,...,0.0,0.0081,0.0,0.0,0,0.0294,0.0132,0.0132,0.0132,0.0172
1,22,16157264,16157264,G,A,0.0703,0.1233,0.0721,0.068,0.0242,...,0.1233,0.0,0.1042,0.0714,0,0.1154,0.1244,0.1293,0.1233,0.1272
2,22,16157277,16157277,G,A,0.0014,0.0035,0.0019,0.0008,0.0006,...,0.0035,0.0026,0.0,0.0,0,0.0,0.0035,0.0037,0.0035,0.0024
3,22,16157293,16157293,G,C,0.0014,0.0041,0.0017,0.001,0.0009,...,0.0041,0.0,0.0008,0.0,0,0.0,0.0042,0.0045,0.0041,0.0031
4,22,16157302,16157302,T,C,0.0,.,0.0,0.0,2.028e-05,...,0.0,0.0,0.0,0.0,0,0.0,.,.,.,.


In [23]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_s, exomegnomad, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad
del exomegnomad
gc.collect()

0

In [24]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [25]:
abraom = pd.read_csv("/Volumes/HZU/humandb/headers_hg19_abraom_ch22.txt",sep='\t', dtype = 'str')
abraom

Unnamed: 0,#chr,start,end,ref,alt,abraom_freq,abraom_filter,abraom_cegh_filter
0,22,16256078,16256078,G,A,0.018072,LowQual,FAB
1,22,16256352,16256352,T,C,0.404762,VQSRTrancheSNP99.90to100.00,WK-LowCall
2,22,16256430,16256430,A,G,0.325581,VQSRTrancheSNP99.90to100.00,WK-LowCall
3,22,16256484,16256484,T,C,0.078431,VQSRTrancheSNP99.90to100.00,FAB
4,22,16256512,16256512,T,C,0.391081,VQSRTrancheSNP99.00to99.90,WK-LowCall
...,...,...,...,...,...,...,...,...
60421,22,51237712,51237712,G,A,0.131250,VQSRTrancheSNP99.00to99.90,FAB
60422,22,51237766,51237766,T,C,0.016393,VQSRTrancheSNP99.00to99.90,FAB
60423,22,51238130,51238130,G,A,0.272727,VQSRTrancheSNP99.00to99.90,FDP
60424,22,51238249,51238249,A,C,0.063492,VQSRTrancheSNP99.00to99.90,FDP


In [26]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_s, abraom, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.astype("Sparse[str]")
del abraom
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom
gc.collect()

0

In [27]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_s.drop(columns=['alt_x', "#chr_x", "#chr", "start", "start_x", "ref_x", "ref", "#chr", "alt"])

In [28]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,end,abraom_freq,abraom_filter,abraom_cegh_filter
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [29]:
AFR = pd.read_csv('/Volumes/HZU/humandb/hg19_AFR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
ALL = pd.read_csv('/Volumes/HZU/humandb/hg19_ALL.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AMR = pd.read_csv('/Volumes/HZU/humandb/hg19_AMR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AMR = pd.read_csv('/Volumes/HZU/humandb/hg19_AMR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
EAS = pd.read_csv('/Volumes/HZU/humandb/hg19_EAS.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AFR.columns = ["#chr", "start", "ref", "alt", "AFR_exome_allele_frequency", "rsID"]
ALL.columns = ["#chr", "start", "ref", "alt", "ALL_exome_allele_frequency", "rsID"]
AMR.columns = ["#chr", "start", "ref", "alt", "AMR_exome_allele_frequency", "rsID"]
EAS.columns = ["#chr", "start", "ref", "alt", "EAS_exome_allele_frequency", "rsID"]

In [30]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_s, ALL, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL
del ALL
gc.collect()

0

In [31]:
gc.collect()
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_s, AFR, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR
del AFR
gc.collect()

0

In [32]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_s, EAS, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS
del EAS
gc.collect()

0

In [33]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_s, AMR, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR_s = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR.astype("Sparse[str]")
del AMR
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR
gc.collect()

0

In [34]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,non_cancer_AF_popmax,controls_AF_popmax,end,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency,AMR_exome_allele_frequency
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [35]:
import psutil
process = psutil.Process(os.getpid())
print(process.memory_info().rss)  # in bytes 

# Thank god for https://pythonspeed.com/articles/pandas-load-less-data/


6181896192


In [36]:
cg46 = pd.read_csv("/Volumes/HZU/humandb/hg19_cg46.txt", sep='\t', dtype = 'str')
cg46.head()

Unnamed: 0,1,38232,38232.1,A,G,0.065
0,1,41218,41218,T,A,0.022
1,1,41256,41256,C,T,0.022
2,1,41981,41981,A,G,0.783
3,1,42577,42577,C,T,0.022
4,1,43586,43586,C,A,0.022


In [37]:
cg46.columns = ["#Chrom", "position", "Position2", "Ref", "Alt", "cg46"]

In [38]:
cg46_added = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR_s, cg46, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chrom', 'position', 'Ref', 'Alt'], how="left")
cg46_added = cg46_added.drop(columns=['Position2', 'position', 'end'])
cg46_added_s = cg46_added.astype("Sparse[str]")
del ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR_s
del cg46_added
del cg46
gc.collect()

0

In [39]:
cg46_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,non_cancer_AF_popmax,controls_AF_popmax,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency,AMR_exome_allele_frequency,cg46
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [None]:
cg46_added_s['cg46'].isnull().sum()

#There are a few matches! 

In [41]:
cosmic70 = pd.read_csv("/Volumes/HZU/humandb/hg19_cosmic70.txt", sep='\t', dtype = 'str')
cosmic70.columns = ["#Chrom", "Position", "Position2", "Ref", "Alt", "Cosmic70"]

In [42]:
cosmic70

Unnamed: 0,#Chrom,Position,Position2,Ref,Alt,Cosmic70
0,1,69523,69523,G,T,ID=COSM426644;OCCURENCE=1(breast)
1,1,69538,69538,G,A,ID=COSM75742;OCCURENCE=1(ovary)
2,1,69539,69539,T,C,ID=COSM1343690;OCCURENCE=1(large_intestine)
3,1,69540,69540,G,T,ID=COSM1560546;OCCURENCE=1(large_intestine)
4,1,69569,69569,T,C,ID=COSM1599955;OCCURENCE=2(central_nervous_sys...
...,...,...,...,...,...,...
1303997,Y,58890978,58890978,T,C,ID=COSN168846;OCCURENCE=1(large_intestine)
1303998,Y,58892738,58892738,C,T,ID=COSN145817;OCCURENCE=2(haematopoietic_and_l...
1303999,Y,58913094,58913094,C,A,ID=COSN150113;OCCURENCE=1(stomach)
1304000,Y,58917586,58917586,C,T,ID=COSN400223;OCCURENCE=1(lung)


In [43]:
cosmic_added = pd.merge(cg46_added_s, cosmic70, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chrom', 'Position', 'Ref', 'Alt'], how="left")
cosmic_added = cosmic_added.drop(columns=['Position2', 'Position'])
cosmic_added_s = cosmic_added.astype("Sparse[str]")
del cg46_added_s
del cosmic_added
del cosmic70
gc.collect()

0

In [44]:
cosmic_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,controls_AF_popmax,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency,AMR_exome_allele_frequency,cg46,Cosmic70
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [45]:
col_list1 = ["#Chr", "Start", "Ref", "Alt", "dbscSNV_ADA_SCORE", "dbscSNV_RF_SCORE"]
dbscsnv11 = pd.read_csv("/Volumes/HZU/humandb/hg19_dbscsnv11.txt", usecols=col_list1, sep='\t', dtype = 'str')

# Might have to splice this ^ if we have memory issues down the line

In [46]:
dbscsnv11

Unnamed: 0,#Chr,Start,Ref,Alt,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE
0,1,860326,A,C,0.0076,0.03
1,1,860326,A,G,0.0076,0.032
2,1,860326,A,T,0.0069,0.03
3,1,860327,A,C,0.0043,0.04
4,1,860327,A,G,0.0043,0.04
...,...,...,...,...,...,...
15030430,Y,27198377,G,C,0,0
15030431,Y,27198377,G,T,0,0
15030432,Y,27198378,G,A,0,0
15030433,Y,27198378,G,C,0,0


In [49]:
dbscsnv11_added = pd.merge(cosmic_added_s, dbscsnv11, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', 'Ref', 'Alt'], how="left")
dbscsnv11_added = dbscsnv11_added.drop(columns=['#Chr', 'Start'])
dbscsnv11_added_s = dbscsnv11_added.astype("Sparse[str]")
del cosmic_added_s
del dbscsnv11_added
del dbscsnv11
gc.collect()

845

In [50]:
dbscsnv11_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency,AMR_exome_allele_frequency,cg46,Cosmic70,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [51]:
col_list2 = ["#Chr", "Start", "Ref", "Alt", "ExAC_ALL", "ExAC_AFR", "ExAC_AMR", "ExAC_EAS", "ExAC_FIN", "ExAC_NFE", "ExAC_OTH", "ExAC_SAS"]
exac03 = pd.read_csv("/Volumes/HZU/humandb/hg19_exac03.txt", usecols=col_list2, sep='\t', dtype = 'str')

In [52]:
exac03

Unnamed: 0,#Chr,Start,Ref,Alt,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS
0,1,13372,G,C,0.0002,0,0,0,0,0,0,0.0004
1,1,13380,C,G,0.0033,0.0381,0.0096,0,0,0,0,0
2,1,13382,C,G,0.0002,0,0,0,0,0,0,0.0003
3,1,13402,G,C,0,0,0,0,.,0,0,0
4,1,13404,G,A,0,0,0,0,.,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10883564,Y,27000985,C,T,0,.,.,.,.,.,.,.
10883565,Y,27001073,T,C,0,0,0,.,0,0,0,0
10883566,Y,27001093,T,C,0,0,0,.,0,0,0,0
10883567,Y,27190049,T,G,0,0,0,0,0,0,0,0


In [53]:
exac03_added = pd.merge(dbscsnv11_added_s, exac03, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', 'Ref', 'Alt'], how="left")
exac03_added_s = exac03_added.astype("Sparse[str]")
del exac03_added
del exac03
del dbscsnv11_added_s
gc.collect()

0

In [54]:
exac03_added_s = exac03_added_s.drop(columns=["#Chr", "Start"])

In [55]:
exac03_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [56]:
EUR = pd.read_csv("/Volumes/HZU/humandb/ hg19_EUR.sites.2015_08_ch22.txt", sep='\t', dtype = 'str')

In [57]:
EUR.columns = ["#CHROM", "POS", "REF", "ALT", "EUR_exome_allele_frequency", "rsID"]
EUR

Unnamed: 0,#CHROM,POS,REF,ALT,EUR_exome_allele_frequency,rsID
0,22,16050527,C,A,0.001,rs587769434
1,22,16050607,G,A,0.004,rs587720402
2,22,16050654,A,<CN2>,0.007,esv3647175;esv3647176;esv3647177;esv3647178
3,22,16050654,A,<CN3>,0.0944,esv3647175;esv3647176;esv3647177;esv3647178
4,22,16050654,A,<CN4>,0.003,esv3647175;esv3647176;esv3647177;esv3647178
...,...,...,...,...,...,...
363874,22,51239794,C,A,0.003,rs561893765
363875,22,51240084,G,C,0.005,rs529322970
363876,22,51240820,C,T,0.0179,rs202228854
363877,22,51241386,C,G,0.0099,rs568168135


In [58]:
EUR_added = pd.merge(exac03_added_s, EUR, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#CHROM', 'POS', 'REF', 'ALT'], how="left")
EUR_added_s = EUR_added.astype("Sparse[str]")
del exac03_added_s
del EUR_added
del EUR
gc.collect()

0

In [59]:
EUR_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,#CHROM,POS,REF,ALT,EUR_exome_allele_frequency,rsID
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [60]:
EUR_added_s = EUR_added_s.drop(columns=['#CHROM', "POS", "REF", "ALT"])

In [61]:
GallSNP = pd.read_csv("/Volumes/HZU/humandb/hg19_GeuvadisFDR5allSNP.txt", sep='\t', dtype = 'str')

In [62]:
GallSNP

Unnamed: 0,#chr,pos,pos.1,ref,alt,Geuvadis_eQTL_target_gene
0,chr1,126108,126108,G,A,ENSG00000228794
1,chr1,662622,662622,G,A,ENSG00000228794
2,chr1,693731,693731,A,G,ENSG00000228794
3,chr1,707886,707886,G,C,ENSG00000228794
4,chr1,712871,712871,T,C,ENSG00000177757
...,...,...,...,...,...,...
658811,chr22,51222100,51222100,G,T,ENSG00000079974
658812,chr22,51222728,51222728,C,T,ENSG00000079974
658813,chr22,51228259,51228259,A,G,ENSG00000079974
658814,chr22,51229805,51229805,T,C,ENSG00000079974


In [63]:
GallSNP_added = pd.merge(EUR_added_s, GallSNP, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'pos', 'ref', 'alt'], how="left")
GallSNP_added_s = GallSNP_added.astype("Sparse[str]")
del EUR_added_s
del GallSNP
del GallSNP_added
gc.collect()

0

In [64]:
GallSNP_added_s = GallSNP_added_s.drop(columns=['pos', "pos.1", "ref", "alt", "rsID", "#chr"])

In [65]:
GallSNP_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,EUR_exome_allele_frequency,Geuvadis_eQTL_target_gene
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [66]:
gme = pd.read_csv("/Volumes/HZU/humandb/hg19_gme.txt", sep='\t', dtype = 'str')

In [67]:
gme

Unnamed: 0,#Chr,Start,End,Ref,Alt,GME_AF,GME_NWA,GME_NEA,GME_AP,GME_Israel,GME_SD,GME_TP,GME_CA
0,1,69134,69134,A,G,0.049505,0.000000,0.032787,0.000000,0,0.000000,0.181818,0.133333
1,1,69270,69270,A,G,0.698113,0.642857,0.564246,0.677778,0.666667,0.863636,0.848214,0.868852
2,1,69428,69428,T,G,0.050340,0.044444,0.053097,0.066327,0.000000,0.076923,0.040541,0.037037
3,1,69438,69438,T,C,0.000675,0.000000,0.001462,0.000000,0.000000,0.000000,0.004386,0.000000
4,1,69453,69453,G,A,0.008880,0.000000,0.010234,0.010101,0.000000,0.010000,0.004630,0.009091
...,...,...,...,...,...,...,...,...,...,...,...,...,...
700116,X,155239821,155239824,GCAA,G,0.095160,0.103704,0.070727,0.134921,0.071429,0.135802,0.067873,0.117021
700117,X,155239822,155239824,CAA,-,0.095160,0.103704,0.070727,0.134921,0.071429,0.135802,0.067873,0.117021
700118,X,155239824,155239824,A,G,0.398336,0.311111,0.284615,0.432836,0.285714,0.407407,0.312217,0.405263
700119,X,155239827,155239827,A,G,0.364717,0.279412,0.263056,0.371212,0.285714,0.382716,0.303167,0.393617


In [68]:
gme_added = pd.merge(GallSNP_added_s, gme, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', 'Ref', 'Alt'], how="left")
gme_added_s = gme_added.astype("Sparse[str]")
del GallSNP_added_s
del gme
del gme_added
gc.collect()

0

In [69]:
gme_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,Start,End,GME_AF,GME_NWA,GME_NEA,GME_AP,GME_Israel,GME_SD,GME_TP,GME_CA
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [70]:
hrcr1 = pd.read_csv("/Volumes/HZU/humandb/hg19_hrcr1_ch22.txt", sep='\t', dtype = 'str')
hrcr1.columns = ["#Chr", "Start", "End", "Ref", "Alt", "HRC_AF", "HRC_AC", "HRC_AN", "HRC_non1000G_AF", "HRC_non1000G_AC", "HRC_non1000G_AN"]
hrcr1 = hrcr1.drop(columns=["End"])
hrcr1

Unnamed: 0,#Chr,Start,Ref,Alt,HRC_AF,HRC_AC,HRC_AN,HRC_non1000G_AF,HRC_non1000G_AC,HRC_non1000G_AN
0,22,16050783,A,G,0.000323196,21,64976,0,0,59986
1,22,16050822,G,A,0.137482,8933,64976,0.136665,8198,59986
2,22,16050922,T,G,0.000169293,11,64976,0,0,59986
3,22,16050984,C,G,0.000230854,15,64976,0,0,59986
4,22,16051099,G,T,0.000169293,11,64976,0.000183376,11,59986
...,...,...,...,...,...,...,...,...,...,...
524725,22,51238513,C,G,0.000138513,9,64976,0.000150035,9,59986
524726,22,51238660,A,G,0.000769515,50,64976,1.66706e-05,1,59986
524727,22,51239586,T,G,0.00243167,158,64976,0.00261728,157,59986
524728,22,51239678,G,T,0.00146208,95,64976,0.000150035,9,59986


In [71]:
hrcr1_added = pd.merge(gme_added_s, hrcr1, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', "Ref", 'Alt'], how="left")
hrcr1_added_s = hrcr1_added.astype("Sparse[str]")
del gme_added_s
del hrcr1
del hrcr1_added
gc.collect()

0

In [72]:
hrcr1_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,GME_TP,GME_CA,#Chr_y,Start_y,HRC_AF,HRC_AC,HRC_AN,HRC_non1000G_AF,HRC_non1000G_AC,HRC_non1000G_AN
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [73]:
hrcr1_added_s = hrcr1_added_s.drop(columns=['#Chr_x', '#Chr_y', 'Start_y', 'Start_x'])

In [74]:
hrcr1_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,GME_Israel,GME_SD,GME_TP,GME_CA,HRC_AF,HRC_AC,HRC_AN,HRC_non1000G_AF,HRC_non1000G_AC,HRC_non1000G_AN
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [75]:
icgc21 = pd.read_csv("/Volumes/HZU/humandb/hg19_icgc21_ch22.txt", sep='\t', dtype = 'str')
icgc21.columns = ["#Chr", "Start", "End", "Ref", "Alt", "ICGC_Id", "ICGC_Occurrence"]
icgc21

Unnamed: 0,#Chr,Start,End,Ref,Alt,ICGC_Id,ICGC_Occurrence
0,22,16050159,16050159,C,T,MU44319285,COCA-CN|1|187|0.00535
1,22,16050229,16050229,C,A,MU55653016,LUSC-KR|1|66|0.01515
2,22,16050344,16050344,C,T,MU62645550,MELA-AU|1|183|0.00546
3,22,16050396,16050396,G,-,MU3110893,LIRI-JP|1|260|0.00385
4,22,16050436,16050436,C,T,MU60492919,MELA-AU|1|183|0.00546
...,...,...,...,...,...,...,...
388974,22,51242245,51242245,C,T,MU63355116,LUSC-KR|1|66|0.01515
388975,22,51242573,51242573,T,A,MU55316918,MELA-AU|1|183|0.00546
388976,22,51242620,51242620,A,G,MU39367842,SKCA-BR|1|66|0.01515
388977,22,51242640,51242640,A,G,MU56954816,MELA-AU|1|183|0.00546


In [76]:
icgc21_added = pd.merge(hrcr1_added_s, icgc21, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', "Ref", 'Alt'], how="left")
icgc21_added_s = icgc21_added.astype("Sparse[str]")
del hrcr1_added_s
del icgc21
del icgc21_added
gc.collect()

0

In [77]:
icgc21_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,HRC_AC,HRC_AN,HRC_non1000G_AF,HRC_non1000G_AC,HRC_non1000G_AN,#Chr,Start,End_y,ICGC_Id,ICGC_Occurrence
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [78]:
icgc21_added_s = icgc21_added_s.drop(columns=['#Chr', "Start", "End_y"])

In [79]:
intervar = pd.read_csv("/Volumes/HZU/humandb/hg19_intervar_20180118_ch22.txt", sep='\t', dtype = 'str')
intervar

Unnamed: 0,22,16258186,16258186.1,T,A,Uncertain significance,0,0.1,0.2,0.3,...,0.15,0.16,0.17,0.18,0.19,0.20,0.21,0.22,0.23,0.24
0,22,16258186,16258186,T,C,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22,16258186,16258186,T,G,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,16258187,16258187,C,A,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22,16258187,16258187,C,G,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,22,16258187,16258187,C,T,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2130096,22,51220721,51220721,A,G,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2130097,22,51220721,51220721,A,T,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2130098,22,51220722,51220722,T,A,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2130099,22,51220722,51220722,T,C,Uncertain significance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
intervar.columns = ["#Chr", "Start", "End", "Ref", "Alt", "InterVar_automated", "PVS1", "PS1", "PS2", "PS3", "PS4", "PM1", "PM2", "PM3", "PM4", "PM5", "PM6", "PP1", "PP2", "PP3", "PP4", "PP5", "BA1", "BS1", "BS2", "BS3", "BS4", "BP1", "BP2", "BP3", "BP4", "BP5", "BP6", "BP7"]

In [81]:
intervar_added = pd.merge(icgc21_added_s, intervar, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', "Ref", 'Alt'], how="left")
intervar_added_s = intervar_added.astype("Sparse[str]")
intervar_added_s = intervar_added_s.drop(columns=["End_x"])

del icgc21_added_s
del intervar
del intervar_added
gc.collect()

0

In [82]:
intervar_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [83]:
nci60 = pd.read_csv("/Volumes/HZU/humandb/hg19_nci60.txt", sep='\t', dtype = 'str')
nci60.columns = ["#Chr", "Start", "End", "Ref", "Alt", "nci60"]

In [84]:
nci60_added = pd.merge(intervar_added_s, nci60, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', "Ref", 'Alt'], how="left")
nci60_added = nci60_added.drop(columns=["#Chr_y", "Start_y", "End_y", '#Chr_x', "Start_x"])
nci60_added_s = nci60_added.astype("Sparse[str]")

del intervar_added_s
del nci60
del nci60_added
gc.collect()

0

In [85]:
nci60_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,nci60
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [86]:
kaviar = pd.read_csv("/Volumes/HZU/humandb/hg19_kaviar_20150923_ch22.txt", sep='\t', dtype = 'str')
kaviar.columns = ["#Chr", "Start", "End", "Ref", "Alt", "Kaviar_AF", "Kaviar_AC", "Kaviar_AN"]
kaviar

Unnamed: 0,#Chr,Start,End,Ref,Alt,Kaviar_AF,Kaviar_AC,Kaviar_AN
0,22,16050075,16050075,A,G,3.84e-05,1,26028
1,22,16050086,16050086,G,A,3.84e-05,1,26028
2,22,16050115,16050115,G,A,0.0014215,37,26028
3,22,16050116,16050116,G,C,0.0017673,46,26028
4,22,16050154,16050154,T,C,3.84e-05,1,26028
...,...,...,...,...,...,...,...,...
3586137,22,51244411,51244411,C,G,0,0,26028
3586138,22,51244424,51244424,T,C,0,0,26028
3586139,22,51244443,51244443,C,G,3.84e-05,1,26028
3586140,22,51244515,51244515,C,G,3.84e-05,1,26028


In [87]:
kaviar_added = pd.merge(nci60_added_s, kaviar, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#Chr', 'Start', "Ref", 'Alt'], how="left")
kaviar_added_s = kaviar_added.astype("Sparse[str]")
del nci60_added_s
del kaviar
del kaviar_added
gc.collect()

0

In [88]:
kaviar_added_s = kaviar_added_s.drop(columns=["#Chr", "Start", "End"])
kaviar_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,BP2,BP3,BP4,BP5,BP6,BP7,nci60,Kaviar_AF,Kaviar_AC,Kaviar_AN
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [89]:
lfathmm = pd.read_csv("/Volumes/HZU/humandb/hg19_ljb23_fathmm_ch22.txt", sep='\t', dtype = 'str')
lfathmm.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "LJB23_FATHMM_score", "LJB23_FATHMM_score_converted", "LJB23_FATHMM_pred"]
lfathmm = lfathmm.drop(columns=["Pos2"])


In [90]:
lfathmm_added = pd.merge(kaviar_added_s, lfathmm, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
lfathmm_added_s = lfathmm_added.astype("Sparse[str]")
del kaviar_added_s
del lfathmm
del lfathmm_added
gc.collect()

0

In [91]:
lfathmm_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,BP5,BP6,BP7,nci60,Kaviar_AF,Kaviar_AC,Kaviar_AN,LJB23_FATHMM_score,LJB23_FATHMM_score_converted,LJB23_FATHMM_pred
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [92]:
phylop = pd.read_csv("/Volumes/HZU/humandb/hg19_ljb23_phylop_ch22.txt", sep='\t', dtype = 'str')
phylop.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "LJB23_PhyloP"]
phylop = phylop.drop(columns=["Pos2"])

In [93]:
phylop_added = pd.merge(lfathmm_added_s, phylop, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
phylop_added_s = phylop_added.astype("Sparse[str]")
del lfathmm_added_s
del phylop
del phylop_added
gc.collect()

0

In [94]:
phylop_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,BP6,BP7,nci60,Kaviar_AF,Kaviar_AC,Kaviar_AN,LJB23_FATHMM_score,LJB23_FATHMM_score_converted,LJB23_FATHMM_pred,LJB23_PhyloP
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [95]:
sift = pd.read_csv("/Volumes/HZU/humandb/hg19_ljb23_sift_ch22.txt", sep='\t', dtype = 'str')
sift.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "LJB23_SIFT_score", "LJB23_SIFT_score_converted", "LJB23_SIFT_pred"]
sift = sift.drop(columns=["Pos2"])

In [96]:
sift_added = pd.merge(phylop_added_s, sift, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
sift_added_s = sift_added.astype("Sparse[str]")
del phylop_added_s
del sift
del sift_added
gc.collect()

0

In [97]:
sift_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,Kaviar_AF,Kaviar_AC,Kaviar_AN,LJB23_FATHMM_score,LJB23_FATHMM_score_converted,LJB23_FATHMM_pred,LJB23_PhyloP,LJB23_SIFT_score,LJB23_SIFT_score_converted,LJB23_SIFT_pred
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [98]:
siphy = pd.read_csv("/Volumes/HZU/humandb/hg19_ljb23_siphy_ch22.txt", sep='\t', dtype = 'str')
siphy.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "LJB23_SiPhy"]
siphy = siphy.drop(columns=["Pos2"])

In [99]:
siphy_added = pd.merge(sift_added_s, siphy, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
siphy_added_s = siphy_added.astype("Sparse[str]")
del sift_added_s
del siphy
del siphy_added
gc.collect()

0

In [100]:
siphy_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,Kaviar_AC,Kaviar_AN,LJB23_FATHMM_score,LJB23_FATHMM_score_converted,LJB23_FATHMM_pred,LJB23_PhyloP,LJB23_SIFT_score,LJB23_SIFT_score_converted,LJB23_SIFT_pred,LJB23_SiPhy
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [101]:
mcap = pd.read_csv("/Volumes/HZU/humandb/hg19_mcap_ch22.txt", sep='\t', dtype = 'str')
mcap.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "MCAP_score"]
mcap = mcap.drop(columns=["Pos2"])

In [102]:
mcap_added = pd.merge(siphy_added_s, mcap, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
mcap_added_s = mcap_added.astype("Sparse[str]")
del siphy_added_s
del mcap
del mcap_added
gc.collect()

0

In [103]:
mcap_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,Kaviar_AN,LJB23_FATHMM_score,LJB23_FATHMM_score_converted,LJB23_FATHMM_pred,LJB23_PhyloP,LJB23_SIFT_score,LJB23_SIFT_score_converted,LJB23_SIFT_pred,LJB23_SiPhy,MCAP_score
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [104]:
popfreqall = pd.read_csv("/Volumes/HZU/humandb/hg19_popfreq_all_20150413_ch22.txt", sep='\t', dtype = 'str')
popfreqall.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "PopFreqMax", "1000G_ALL", "1000G_AFR", "1000G_AMR", "1000G_EAS", "1000G_EUR", "1000G_SAS", "ExAC_AL", "ExAC_AFR", "ExAC_AMR", "ExAC_EAS", "ExAC_FIN", "ExAC_NFE", "ExAC_OTE", "ExAC_SAS", "ESP6500siv2_ALL", "ESP6500siv2_AA", "ESP6500siv2_EA", "CG46"]
popfreqall = popfreqall.drop(columns=["Pos2"])

In [105]:
popfreqall_added = pd.merge(mcap_added_s, popfreqall, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
popfreqall_added_s = popfreqall_added.astype("Sparse[str]")
del mcap_added_s
del popfreqall
del popfreqall_added
gc.collect()

0

In [106]:
popfreqall_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_AMR_y,ExAC_EAS_y,ExAC_FIN_y,ExAC_NFE_y,ExAC_OTE,ExAC_SAS_y,ESP6500siv2_ALL,ESP6500siv2_AA,ESP6500siv2_EA,CG46
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [107]:
regsnpintron = pd.read_csv("/Volumes/HZU/humandb/hg19_regsnpintron_ch22.txt", sep='\t', dtype = 'str')
regsnpintron.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "regsnp_fpr", "regsnp_disease", "regsnp_splicing_site"]
regsnpintron = regsnpintron.drop(columns=["Pos2"])

In [108]:
regsnpintron_added = pd.merge(popfreqall_added_s, regsnpintron, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
regsnpintron_added_s = regsnpintron_added.astype("Sparse[str]")
del popfreqall_added_s
del regsnpintron
del regsnpintron_added
gc.collect()

0

In [109]:
regsnpintron_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_NFE_y,ExAC_OTE,ExAC_SAS_y,ESP6500siv2_ALL,ESP6500siv2_AA,ESP6500siv2_EA,CG46,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [110]:
revel = pd.read_csv("/Volumes/HZU/humandb/hg19_revel_ch22.txt", sep='\t', dtype = 'str')
revel.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "REVEL"]
revel = revel.drop(columns=["Pos2"])

In [111]:
revel_added = pd.merge(regsnpintron_added_s, revel, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
revel_added_s = revel_added.astype("Sparse[str]")
del regsnpintron_added_s
del revel
del revel_added
gc.collect()

0

In [112]:
revel_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_OTE,ExAC_SAS_y,ESP6500siv2_ALL,ESP6500siv2_AA,ESP6500siv2_EA,CG46,regsnp_fpr,regsnp_disease,regsnp_splicing_site,REVEL
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [113]:
sas = pd.read_csv("/Volumes/HZU/humandb/hg19_SAS.sites.2015_08_ch22.txt", sep='\t', dtype = 'str')
sas.columns = ["#Chrom", "Pos", "Ref", "Alt", "SAS", "rsID"]
sas = sas.drop(columns=["rsID"])

In [114]:
sas_added = pd.merge(revel_added_s, sas, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
sas_added_s = sas_added.astype("Sparse[str]")
del revel_added_s
del sas
del sas_added
gc.collect()

0

In [115]:
sas_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ExAC_SAS_y,ESP6500siv2_ALL,ESP6500siv2_AA,ESP6500siv2_EA,CG46,regsnp_fpr,regsnp_disease,regsnp_splicing_site,REVEL,SAS
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [116]:
interpro = pd.read_csv("/Volumes/HZU/humandb/hg19_dbnsfp31a_interpro_ch22.txt", sep='\t', dtype = 'str')
interpro.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "Interpro_domain"]
interpro = interpro.drop(columns=["Pos2"])

In [118]:
interpro_added = pd.merge(sas_added_s, interpro, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
interpro_added_s = interpro_added.astype("Sparse[str]")
del sas_added_s
del interpro
del interpro_added
gc.collect()

821

In [119]:
interpro_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,ESP6500siv2_ALL,ESP6500siv2_AA,ESP6500siv2_EA,CG46,regsnp_fpr,regsnp_disease,regsnp_splicing_site,REVEL,SAS,Interpro_domain_y
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [120]:
aa = pd.read_csv("/Volumes/HZU/humandb/hg19_esp6500siv2_aa_ch22.txt", sep='\t', dtype = 'str')
aa.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "esp6500_aa", "rsID"]
aa = aa.drop(columns=["Pos2", "rsID"])

espall = pd.read_csv("/Volumes/HZU/humandb/hg19_esp6500siv2_all_ch22.txt", sep='\t', dtype = 'str')
espall.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "esp6500_all", "rsID"]
espall = espall.drop(columns=["Pos2", "rsID"])

ea = pd.read_csv("/Volumes/HZU/humandb/hg19_esp6500siv2_ea_ch22.txt", sep='\t', dtype = 'str')
ea.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "esp6500_ea", "rsID"]
ea = ea.drop(columns=["Pos2", "rsID"])

In [121]:
aa_added = pd.merge(interpro_added_s, aa, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
aa_added_s = aa_added.astype("Sparse[str]")
del interpro_added_s
del aa
del aa_added
gc.collect()

0

In [122]:
espall_added = pd.merge(aa_added_s, espall, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
espall_added_s = espall_added.astype("Sparse[str]")
del aa_added_s
del espall
del espall_added
gc.collect()

0

In [123]:
ea_added = pd.merge(espall_added_s, ea, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
ea_added_s = ea_added.astype("Sparse[str]")
del espall_added_s
del ea
del ea_added
gc.collect()

0

In [124]:
ea_added_s

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,CG46,regsnp_fpr,regsnp_disease,regsnp_splicing_site,REVEL,SAS,Interpro_domain_y,esp6500_aa,esp6500_all,esp6500_ea
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [125]:
av = pd.read_csv("/Volumes/HZU/humandb/hg19_avsnp150_ch22.txt", sep='\t', dtype = 'str')
av.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "avsnp150"]
av = av.drop(columns=["Pos2"])

In [126]:
av_added = pd.merge(ea_added_s, av, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
av_added_s = av_added.astype("Sparse[str]")
del ea_added_s
del av
del av_added
gc.collect()

0

In [None]:
dann = pd.read_csv("/Volumes/HZU/humandb/hg19_dann_ch22.txt", sep='\t', dtype = 'str')
dann.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "DANN"]
dann = dann.drop(columns=["Pos2"])

In [None]:
dann_added = pd.merge(av_added_s, dann, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
dann_added_s = dann_added.astype("Sparse[str]")
del av_added_s
del dann
del dann_added
gc.collect()

In [None]:
ncfathmm = pd.read_csv("/Volumes/HZU/humandb/hg19_fathmm_xf_noncoding_ch22.txt", sep='\t', dtype = 'str')
ncfathmm.columns = ["#Chrom", "Pos", "Pos2", "Ref", "Alt", "fathmm_xf_noncoding"]
ncfathmm = ncfathmm.drop(columns=["Pos2"])

In [None]:
ncfathmm_added = pd.merge(dann_added_s, ncfathmm, on=['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
Final_Chr22_Dataframe = ncfathmm_added.astype("Sparse[str]")
del dann_added_s
del ncfathmm
del ncfathmm_added
gc.collect()

In [None]:
Final_Chr22_Dataframe