# Introduction

Convert a VCF file to a genotype matrix with one row per variant (rsID) and one column per individual.
 
1. Remove variants on chromosomes X, Y and MT, and filter alleles with a minor allele frequency (MAF) threshold of 0.05.
2. Read the genotypes and replace them with NaN, 0, 1 or 2.
3. Remove individuals whose missingness is 20% or more.

In [21]:
import os
import json
import shlex, subprocess
import pandas as pd
import numpy as np

# Filter by chromosome and allele frequency

In [23]:
# filter CHROM X, Y, MT
# filter MAF 0.05
# Further vcftools site filters used below: --max-missing 0.8, --max-alleles 2.
vcf_file = os.path.join('user_data', 'lactose_intolerant.vcf.gz')
# Write the recoded VCF into user_data/, because that is where the loading
# step reads 'lactose_intolerant.recode.vcf' from.
out_prefix = os.path.join('user_data', 'lactose_intolerant')
# Every option and every value is its own list element so the command can be
# executed directly, without going through a shell.
args = [
    'vcftools', '--gzvcf', vcf_file,
    '--not-chr', 'X', '--not-chr', 'Y', '--not-chr', 'MT',
    '--maf', '0.05', '--max-missing', '0.8', '--max-alleles', '2',
    '--recode', '--out', out_prefix,
]
# Shell-quoted rendering, for display only.
command_line = ' '.join(shlex.quote(arg) for arg in args)
print(command_line)
# takes long time about 10min
# check=True raises CalledProcessError if vcftools exits with a non-zero
# status, instead of silently continuing with a missing or partial output.
subprocess.run(args, check=True)

vcftools --gzvcf user_data/lactose_intolerant.vcf.gz --not-chr X --not-chr Y --not-chr MT --maf 0.05 --max-missing 0.8 --max-alleles 2 --recode --out lactose_intolerant


# Load VCF

In [24]:
# read genotype header
# The column names are on the line that starts with '#CHROM'. Search for that
# line instead of skipping a fixed number of lines, so the cell keeps working
# when the number of preceding header lines changes.
with open(os.path.join('user_data', 'lactose_intolerant.recode.vcf'),
          'r') as f:
    for line in f:
        if line.startswith('#CHROM'):
            break
    else:
        # Reached end of file (or the file is empty) without a header line.
        raise ValueError('no #CHROM header line found in the VCF file')

    # Drop the leading '#' and split the header on whitespace.
    col_names = line[1:].split()
print(col_names)

['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', '1005', '1024', '1028', '1029', '1036', '1039', '1040', '1060', '1071', '1075', '1077', '1079', '13:1079', '1081', '1091', '10', '17:10', '1103', '1114', '1123', '1147', '1165', '1173', '1190', '1193', '11', '123', '125', '1269', '1288', '1319', '1341', '1379', '139', '13', '1409', '1412', '1449', '1459', '1465', '146', '1499', '14', '1502', '1537', '1543', '1548', '1566', '159', '160', '161', '52:161', '1626', '1627', '1634', '1641', '16', '1712', '1718', '1742', '1759', '1764', '1777', '64:1777', '1778', '17', '1848', '187', '1887', '1952', '71:1952', '1975', '1987', '1996', '1', '76:1', '2008', '2023', '2024', '202', '2035', '2050', '2052', '2055', '2056', '2067', '2075', '2076', '207', '2090', '2155', '2159', '2164', '2215', '2237', '2238', '2249', '2288', '99:2288', '100:2288', '2293', '2297', '22', '2307', '2356', '2362', '2385', '2455', '2456', '2498', '2557', '2575', '262', '2635', '2645', '2660', '26', '2

In [25]:
# Load the tab-separated records of the recoded VCF into a DataFrame.
# The file's own header is not used (header=None); the columns are labelled
# with col_names, and comment='#' is passed so pandas treats '#' as the
# comment character.
recode_vcf_path = os.path.join('user_data', 'lactose_intolerant.recode.vcf')
read_options = dict(sep='\t', header=None, names=col_names, comment='#')
vcf_genotype = pd.read_csv(recode_vcf_path, **read_options)

vcf_genotype

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,1005,...,916,918,920,924,926,943,954,972,990,99
0,1,752721,rs3131972,A,G,.,.,.,GT,./.,...,0/1,0/1,1/1,0/1,1/1,0/0,1/1,1/1,1/1,1/1
1,1,776546,rs12124819,A,G,.,.,.,GT,./.,...,0/1,0/1,0/0,0/0,0/1,0/0,0/1,0/0,1/1,0/1
2,1,798959,rs11240777,G,A,.,.,.,GT,./.,...,1/0,1/0,0/0,0/0,1/0,1/0,1/0,1/0,0/0,0/0
3,1,838555,rs4970383,C,A,.,.,.,GT,./.,...,0/0,0/0,1/0,0/0,0/0,1/0,0/0,0/0,0/0,1/0
4,1,846808,rs4475691,C,T,.,.,.,GT,./.,...,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454810,22,51171693,rs756638,G,A,.,.,.,GT,./.,...,1/0,0/0,1/0,0/0,0/0,./.,1/0,0/0,1/0,1/0
454811,22,51175626,rs3810648,A,G,.,.,.,GT,./.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1
454812,22,51178090,rs2285395,G,A,.,.,.,GT,./.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/0,1/0
454813,22,51181759,rs13056621;exm2268446,A,G,.,.,.,GT,./.,...,1/1,0/1,1/1,0/1,0/1,0/0,0/1,1/1,1/1,1/1


# Replace genotypes and filter individuals

In [26]:
# replace genotype with integers
# Each genotype string becomes the number of '1' alleles it contains (0, 1 or
# 2); missing calls become NaN. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
genotype_codes = {
    # unphased calls
    './.': np.nan,
    '0/0': 0,
    '0/1': 1,
    '1/0': 1,
    '1/1': 2,
    # phased calls ('|' separator) get the same codes
    '.|.': np.nan,
    '0|0': 0,
    '0|1': 1,
    '1|0': 1,
    '1|1': 2,
}
vcf_genotype = vcf_genotype.replace(genotype_codes)

vcf_genotype

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,1005,...,916,918,920,924,926,943,954,972,990,99
0,1,752721,rs3131972,A,G,.,.,.,GT,,...,1.0,1.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0
1,1,776546,rs12124819,A,G,.,.,.,GT,,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0
2,1,798959,rs11240777,G,A,.,.,.,GT,,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,1,838555,rs4970383,C,A,.,.,.,GT,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1,846808,rs4475691,C,T,.,.,.,GT,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454810,22,51171693,rs756638,G,A,.,.,.,GT,,...,1.0,0.0,1.0,0.0,0.0,,1.0,0.0,1.0,1.0
454811,22,51175626,rs3810648,A,G,.,.,.,GT,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
454812,22,51178090,rs2285395,G,A,.,.,.,GT,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
454813,22,51181759,rs13056621;exm2268446,A,G,.,.,.,GT,,...,2.0,1.0,2.0,1.0,1.0,0.0,1.0,2.0,2.0,2.0


In [27]:
# Fraction of missing (NaN) entries in every column: the NaN count of the
# column divided by the number of rows. The result is a fraction in [0, 1],
# not a percentage, and covers all columns, not only the individuals.
n_rows = len(vcf_genotype)
Num_NaN_indivudual = vcf_genotype.isnull().sum(axis=0)
percentage_NaN_indivudual = Num_NaN_indivudual.div(n_rows)
percentage_NaN_indivudual

CHROM    0.000000
POS      0.000000
ID       0.000000
REF      0.000000
ALT      0.000000
           ...   
943      0.012887
954      0.009107
972      0.002718
990      0.004842
99       0.000130
Length: 392, dtype: float64

In [31]:
# Keep only the columns whose missing fraction is strictly below 0.2.
# The test is applied to every column; columns without any NaN pass it and
# are therefore kept as well.
low_missing = percentage_NaN_indivudual.lt(0.2)
vcf_genotype = vcf_genotype.loc[:, low_missing]
vcf_genotype

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,1024,...,916,918,920,924,926,943,954,972,990,99
0,1,752721,rs3131972,A,G,.,.,.,GT,2.0,...,1.0,1.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0
1,1,776546,rs12124819,A,G,.,.,.,GT,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0
2,1,798959,rs11240777,G,A,.,.,.,GT,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,1,838555,rs4970383,C,A,.,.,.,GT,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1,846808,rs4475691,C,T,.,.,.,GT,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454810,22,51171693,rs756638,G,A,.,.,.,GT,0.0,...,1.0,0.0,1.0,0.0,0.0,,1.0,0.0,1.0,1.0
454811,22,51175626,rs3810648,A,G,.,.,.,GT,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
454812,22,51178090,rs2285395,G,A,.,.,.,GT,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
454813,22,51181759,rs13056621;exm2268446,A,G,.,.,.,GT,2.0,...,2.0,1.0,2.0,1.0,1.0,0.0,1.0,2.0,2.0,2.0


# Save to file

In [32]:
# save to file
# The output is tab-separated despite the .csv extension. The row index is
# not written (index=False) and missing genotypes are stored as the literal
# string 'NaN'.
vcf_genotype.to_csv(os.path.join('user_data', 'lactose_intolerant.csv'),
                    sep='\t',
                    index=False,
                    na_rep='NaN')