# Convert VCFs to pandas DataFrames and save them in .pkl format, converting snpAD-called heterozygotes to homozygous calls

In [1]:
import utils.VcfHandler as VcfHandler

# Input VCFs, one per mapping (hg19, A0b, T2T).
hg19_vcf = 'data/input_data/hg19_mapped/data_full_merged.hg19_mapped.map35_100.GCcov.noSimpleRepeat.capture_full.vcf.gz'
a0b_vcf = 'data/input_data/A0b_mapped/data_full_merged.A0b_mapped.map35_100.GCcov.noSimpleRepeat.vcf.gz'
t2t_vcf = 'data/input_data/T2T_mapped/data_full_merged.T2T_mapped.map35_100.GCcov.noSimpleRepeat.vcf.gz'

# Each VCF is loaded with convert_hets=True and pickled straight away, so an
# earlier pickle is already on disk before the next VCF is read.
hg19_mapped = VcfHandler.convert_vcf_to_dataframe(hg19_vcf, convert_hets=True)
hg19_mapped.to_pickle('data/input_data/data_full_merged.hg19_mapped.vcf.gz.pkl')

a0b_mapped = VcfHandler.convert_vcf_to_dataframe(a0b_vcf, convert_hets=True)
a0b_mapped.to_pickle('data/input_data/data_full_merged.A0b_mapped.vcf.gz.pkl')

t2t_mapped = VcfHandler.convert_vcf_to_dataframe(t2t_vcf, convert_hets=True)
t2t_mapped.to_pickle('data/input_data/data_full_merged.T2T_mapped.vcf.gz.pkl')

chimp
HG02666
HG02982
HGDP00056
HGDP00057
HGDP00103
HGDP00127
HGDP00136
HGDP00213
HGDP00218
HGDP00453
HGDP00475
HGDP01200
HGDP01214
HGDP01225
HGDP01298
HGDP01406
HGDP00478
HGDP00549
HGDP00757
HGDP00908
HGDP00931
HGDP00984
HGDP01009
HGDP01029
HGDP01031
HGDP01190
HGDP01192
A00
A00_I10871
Mezmaiskaya2
Chagyrskaya2
Ust_Ishim
Yana1
Loschbour
HG02666
HG02982
HGDP00056
HGDP00057
HGDP00103
HGDP00127
HGDP00136
HGDP00213
HGDP00218
HGDP00453
HGDP00475
HGDP01200
HGDP01214
HGDP01225
HGDP01298
HGDP01406
HGDP00478
HGDP00549
HGDP00757
HGDP00908
HGDP00931
HGDP00984
HGDP01009
HGDP01029
HGDP01031
HGDP01190
HGDP01192
A00
A00_I10871
Mezmaiskaya2
Chagyrskaya2
Ust_Ishim
Yana1
Loschbour
chimp
A00_I10871
HG02666
HG02982
HGDP00056
HGDP00057
HGDP00103
HGDP00127
HGDP00136
HGDP00213
HGDP00218
HGDP00453
HGDP00475
HGDP00478
HGDP00549
HGDP00757
HGDP00908
HGDP00931
HGDP00984
HGDP01009
HGDP01029
HGDP01031
HGDP01190
HGDP01192
HGDP01200
HGDP01214
HGDP01225
HGDP01298
HGDP01406
A00_I10871
HG02666
HG02982
HGDP00056
HGDP0005

# Generate coverage table (based on hg19_mapped VCF)

In [1]:
import pandas as pd
import utils.VcfHandler as VcfHandler
import utils.YChrDataset as YChrDataset

hg19_mapped = pd.read_pickle('data/input_data/data_full_merged.hg19_mapped.vcf.gz.pkl')

# One [sample, mean of its 'DP' column rounded to one decimal] row for every
# sample listed in YChrDataset.samples.
coverage_rows = [
    [sample, round(hg19_mapped[sample, 'DP'].mean(), 1)]
    for sample in YChrDataset.samples
]

df = pd.DataFrame(coverage_rows, columns=['Individual', 'Coverage'])

# Show the table with one decimal place without changing the global pandas
# display settings.
with pd.option_context('display.precision', 1):
    display(df)

Unnamed: 0,Individual,Coverage
0,Chagyrskaya2,46.8
1,Mezmaiskaya2,15.8
2,A00_I10871,8.5
3,A00,17.2
4,HG02982,14.3
5,HG02666,15.0
6,HGDP01029,15.2
7,HGDP01406,14.4
8,HGDP00931,15.8
9,HGDP00478,16.5


# Create fasta files from VCFs using only variable sites

In [None]:
import pandas as pd
import utils.VcfHandler as VcfHandler
import utils.YChrDataset as YChrDataset
import os

# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('data/output_data', exist_ok=True)

# For each mapping: reload the pickled VCF DataFrame, pass it through
# VcfHandler.get_genotypes_from_vcf_dataframe, and write a FASTA file with
# variant_sites_only=True.
hg19_mapped = pd.read_pickle('data/input_data/data_full_merged.hg19_mapped.vcf.gz.pkl')
hg19_mapped = VcfHandler.get_genotypes_from_vcf_dataframe(hg19_mapped)

# NOTE(review): only the hg19 export passes an explicit `order` ('chimp'
# followed by YChrDataset.samples); the A0b and T2T exports below rely on the
# function's default ordering -- confirm this difference is intentional.
VcfHandler.convert_vcf_dataframe_to_fasta_file(hg19_mapped, 'data/output_data/data_full_merged.hg19_mapped.fa', variant_sites_only=True, order=['chimp'] + YChrDataset.samples)

a0b_mapped = pd.read_pickle('data/input_data/data_full_merged.A0b_mapped.vcf.gz.pkl')
a0b_mapped = VcfHandler.get_genotypes_from_vcf_dataframe(a0b_mapped)

VcfHandler.convert_vcf_dataframe_to_fasta_file(a0b_mapped, 'data/output_data/data_full_merged.A0b_mapped.fa', variant_sites_only=True)

t2t_mapped = pd.read_pickle('data/input_data/data_full_merged.T2T_mapped.vcf.gz.pkl')
t2t_mapped = VcfHandler.get_genotypes_from_vcf_dataframe(t2t_mapped)

VcfHandler.convert_vcf_dataframe_to_fasta_file(t2t_mapped, 'data/output_data/data_full_merged.T2T_mapped.fa', variant_sites_only=True)