In [98]:
''' Manipulate Multi-Sample VCF files in Pandas (Python3). 
'''
from collections import OrderedDict
import pandas as pd
import numpy as np
import re
import sys
import os
import collections

# path to this %%file
if sys.platform == "win32":
    file_path = os.path.dirname(os.path.abspath("__file__"))+"\\"
else:
    file_path = os.path.dirname(os.path.abspath("__file__"))+"/"

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)


def vcf2dataframe(filename, genotype_level=True, info_level=True, UID=False):
    '''Open a VCF file and returns a MultiIndex pandas.DataFrame.
    Args:
        filename: vcf file to be converted to a dataframe
        genotype_level: place the genotype information into a second level column index
        info_level: place the info IDs into a second level column index
        UID: rename index to a unique variant identifier
    Notes:
        having any of these variables set to True will result
        in the DataFrame being generated very slowly. This is
        especially true for the UID variable.
    '''
    if filename.endswith(".gz"):
        raise IOError("pdVCF does not support compressed VCF files.")

    # get INFO fields and Headers as lists
    VCF_HEADER = get_vcf_header(filename)
    INFO_FIELDS = get_info_fields(filename)
    print(INFO_FIELDS)

    # Count how many comment lines should be skipped.
    comments = count_comments(filename)

    # Return a simple dataframe representative of the VCF data.
    df = pd.read_table(filename, skiprows=comments,
                       names=VCF_HEADER, usecols=range(len(VCF_HEADER)))

    if genotype_level:
        df = get_genotype_data(df)

    if info_level:
        df = get_info_data(df, INFO_FIELDS)

    if UID:
        df = index2UID(df)
    
    # replace empty cells
    df = df.replace("", np.nan)
    
    return df


def get_vcf_header(filename):
    ''' Get all header names from a given VCF file and return as a list.
    '''
    with open(filename) as input_file:
        row = [x for x in input_file if not x.startswith('##')] # skip unwanted headers
        head = next(iter(row))    # generator to deal with the header line only.
        split_head = [re.sub(r'#|\n', '', x) for x in head.split("\t")]
        return split_head


def get_info_fields(filename):
    ''' Get all ID names in the given VCFs INFO field and return as a list.
    '''
    with open(filename) as input_file:
        row = [x for x in input_file if x.startswith('##INFO')]
        info_fields = [x[11:].split(',')[0] for x in row]
        return info_fields



def count_comments(filename):
    ''' Count all lines in a given VCF file starting with #.
    '''
    comments = 0
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                comments += 1

            else:
                break

    return comments



def replace_series_strings(df, col, dic, substring):
    ''' Replace the the keys with the items of the given
        dictionary for all strings or substrings in a
        given column
    Args:
        col: column name to replace strings
        dic: dictionary where the key is the string to replace with the item
        substrings: search and replace for either substrings (True) or exact strings (False)
    Returns:
        dataframe with the given column having all the
        entries identified as the key in the given dict
        replaced with the item in said dict
    '''
    if not isinstance(substring, bool):
        raise TypeError("substring argument must equal True or False")

    for string, correction in dic.items():
        if substring is True:
            df[col] = df[col].str.replace(string, correction)
        elif substring is False:
            df[col] = df[col].replace(string, correction, regex=True)

    return df



def get_genotype_data(df):
    ''' Give each sample column a second level column for every field
        detailed in the FORMAT column and return as a MultIndex dataframe.
    Args:
        df: DataFrame deriving from a VCF via vcf2dataframe()
    '''
    # contain the variant columns and the sample names in seperate lists
    normal = list(df.iloc[:, :9].columns)
    samples = list(df.iloc[:, 9:].columns)
    form = df['FORMAT'].str.split(":")[0]

    # These columns remain the same
    remain = pd.DataFrame(data=df[normal].values,
                          columns=pd.MultiIndex.from_tuples(
                            [(x, '') for x in normal] ))

    # list of dataframes where every sample has sub columns for each genotype info
    sams = [pd.DataFrame(data=list(df[col].str.split(':').dropna()),
                         columns=pd.MultiIndex.from_product([ [col], form ]))
            for col in samples]
    
    # add allele balance to sample genotype information
    sams = [calc_AB(sam) for sam in sams]
    
    # concat all dfs in the list
    df2 = pd.concat([remain] + sams, axis=1)

    return df2



def calc_AB(vcf):
    ''' Calculate allele balance for all samples in a given 
        pdVCF. Also converts DP & GQ to numeric type.
    
    Args:
        vcf: pdVCF with genotype information extracted
        
    Notes:
        ONLY WORKS FOR BIALLELIC VARIANTS
    '''
    sam = vcf.columns.levels[0][0]
    vcf[sam,'DP'] = pd.to_numeric(vcf[sam,'DP'])
    vcf[sam,'GQ'] = pd.to_numeric(vcf[sam,'GQ'])
    AD = vcf.xs('AD', level=1, axis=1).unstack().str.split(",", n=2)
    DP = vcf.xs('DP', level=1, axis=1).unstack()
    AB = round(pd.to_numeric(AD.str[1]) / pd.to_numeric(DP), 2)
    vcf[sam, 'AB'] = AB.tolist()
        
    return vcf



def get_info_data(df, info_fields):
    ''' Transform the INFO IDs into second level column indexes and return
        the df as a MultiIndex dataframe.
    Args:
        df: DataFrame deriving from a VCF via vcf2dataframe()
        info_fields: a list of all the INFO IDs in the given df
    '''
    # Alter Info field for some variables that don't work well
    df['INFO'] = df['INFO'].str.replace(";DB",";DB=1")
    df['INFO'] = df['INFO'].str.replace(";STR",";STR=1")

    # identify Info fields not present in each row and fill them with a 0
    for name in info_fields:
        if name == info_fields[0]:
            name = "{}=".format(name)
        else:
            name = ";{}=".format(name)

        not_present = df['INFO'][~df.INFO.str.contains(name)].add("{}0".format(name))
        present = df['INFO'][df.INFO.str.contains(name)]
        df['INFO'] = not_present.append(present).sort_index()

    # reorder INFO fields so they are are all in the same order
    df['INFO'] = df['INFO'].apply(lambda x: ';'.join(elem for elem in sorted(x.split(";"))))

    # remove all info_field names from the info values, starting with the info field with the longest name first
    unwanted = info_fields + ['=']
    unwanted.sort(key=len, reverse=True)
    remove = collections.OrderedDict([(x, '') for x in unwanted])
    df = replace_series_strings(df, col='INFO', dic=remove, substring=True)

    # create a new multi-index df containing only the info fields with the IDs as the second level
    info = pd.DataFrame(data=list(df['INFO'].str.split(';')),
                        columns=pd.MultiIndex.from_product([ ['INFO'], info_fields]))

    if not isinstance(df.columns, pd.MultiIndex):
        # create another multi-index df without the info fields where the second level is nothing
        df = pd.DataFrame(data=df.drop('INFO', axis=1).values,
                              columns=pd.MultiIndex.from_tuples(
                                [(x, '') for x in list(df.drop('INFO', axis=1).columns)] ))

    else:
        df = df.drop('INFO', axis=1)

    variant = df.iloc[:, :8]
    samples = df.iloc[:, 8:]

    # replace the info fields in the original df with the multi-index df created above
    final_df = pd.concat([variant] + [info] + [samples], axis=1)

    # MQ and MQ0 are in the wrong order so name swapping is required
    if 'MQ0' in info_fields and 'MQ' in info_fields:
        final_df = final_df.rename(columns={'MQ0': 'TEMP', 'MQ': 'MQ0'})
        final_df = final_df.rename(columns={'TEMP': 'MQ'})

    return final_df



def index2UID(df):
    ''' Replace the index with a unique variant identifier.
    '''
    if isinstance(df.columns, pd.MultiIndex):
        UID = df.apply(lambda x: "{}:{}-{}/{}".format(x['CHROM'][0], x['POS'][0],
                                                      x['REF'][0], x['ALT'][0]), axis=1)
    else:
         UID = df.apply(lambda x: "{}:{}-{}/{}".format(x['CHROM'], x['POS'],
                                                  x['REF'], x['ALT']), axis=1)

    df['UID'] = UID

    if df['UID'].value_counts()[0] > 1:
        raise ValueError("The UID is not unique.")
        
    # remove UID column (UID now only accessible via index)  
    df = df.drop('UID', axis=1)

    return df.rename(UID)


In [2]:
j = "var.both.taadUkJan2017.filters.vcf"
o = "var.both.taadUkOctoberRepeatLibrariesDec2016.filters.vcf"
t = "26PL1207.oct.noInDels.recode.vcf"

In [6]:
import time
start = time.time()

vcf2dataframe(j, genotype_level=False, 
             info_level=False, UID=False)

done = time.time() - start
print(done)

1.4514596462249756


In [137]:
vcf = vcf2dataframe(o, genotype_level=True, 
             info_level=True, UID=True)

#vcf[(vcf['21AI1224']['AB'] > 0.3) & (vcf['21AI1224']['DP'] > 49) & (vcf['21AI1224']['GQ'] > 29)]['21AI1224'].head()
vcf['INFO'].apply(pd.to_numeric, errors='ignore')['AF'].head()

#vcf['INFO']['AF'] > 12

['AC', 'AF', 'AN', 'BaseQRankSum', 'DB', 'DP', 'DS', 'Dels', 'ExcessHet', 'FS', 'HaplotypeScore', 'InbreedingCoeff', 'MLEAC', 'MLEAF', 'MQ', 'MQ0', 'MQRankSum', 'QD', 'RPA', 'RU', 'ReadPosRankSum', 'SOR', 'STR', 'set']




1:2160444-A/G    0.341
1:2160448-G/A    0.032
1:2160449-C/T    0.010
1:2160451-G/A    0.021
1:2160455-C/T    0.010
Name: AF, dtype: object

In [97]:

    

vcf = vcf2dataframe(t, genotype_level=True, 
              info_level=True, UID=True)
vcf



Unnamed: 0_level_0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,FORMAT,INFO,INFO,26PL1207,26PL1207,26PL1207,26PL1207,26PL1207,26PL1207,22MI1099,22MI1099,22MI1099,22MI1099,22MI1099,22MI1099
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,AC,LOLZ,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB
1:1000-A/G,1,1000,.,A,G,24.89,LowQual,GT:AD:DP:GQ:PL,2,9,0/1,101,11,3,30197,0.09,./.,,,,,
1:2234385-C/T,1,2234385,.,C,T,9953.67,PASS,GT:AD:DP:GQ:PL,1,90,0/1,511,16,99,2890103,0.69,0/1,101.0,11.0,3.0,30197.0,0.09
1:2235243-C/T,1,2235243,.,C,T,82.06,PASS,GT:AD:DP:GQ:PL,0,0,0/1,41,5,16,160102,0.2,0/1,40100.0,140.0,50.0,15001888.0,0.71
1:2235792-A/G,1,2235792,.,A,G,436.66,QDfilterSNV,GT:AD:DP:GQ:PL,0,0,0/1,9317,110,99,14501894,0.15,0/0,5050.0,100.0,89.0,1000100.0,0.5
1:2235901-A/G,1,2235901,.,A,G,24.89,LowQual,GT:AD:DP:GQ:PL,0,0,0/1,101,11,3,30197,0.09,1/1,200200.0,400.0,90.0,1000100.0,0.5
"1:2239999-A/G,T",1,2239999,.,A,"G,T",24.89,LowQual,GT:AD:DP:GQ:PL,0,0,0/1,1014,15,3,30197,0.07,0/1,10020100.0,220.0,90.0,10005001000.0,0.09
1:2235501-A/GGT,1,2235501,.,A,GGT,24.89,LowQual,GT:AD:DP:GQ:PL,0,0,0/1,101,11,3,30197,0.09,1/1,200200.0,400.0,90.0,1000100.0,0.5
1:2239901-GA/G,1,2239901,.,GA,G,24.89,LowQual,GT:AD:DP:GQ:PL,0,0,0/1,101,11,3,30197,0.09,1/1,200200.0,400.0,90.0,1000100.0,0.5


In [150]:
import numpy as np


class VCF(object):
    ''' A VCF file stored as a Pandas DataFrame
    
    Atrributes:
        vcf: vcf file to be converted to a Pandas DataFrame or a VCF object
        convert: if True convert to pandas DataFrame, False assumes vcf is already a VCF object
        genotype_level: place the genotype information into a second level column index
        info_level: place the info IDs into a second level column index
        UID: rename index to a unique variant identifier
    '''
    def __init__(self, vcf, convert=True, genotype_level=True, info_level=True, UID=True):
        
        if convert:
            self.vcf = vcf2dataframe(vcf, genotype_level=genotype_level,
                                     info_level=info_level, UID=UID)
        else:
            self.vcf = vcf
        
        
    def get_genotype(self, gen):
        ''' Access specific genotype information across samples
            in the vcf.
        Args:
            gen: genotype attribute of interest in string format e.g 'DP'
        '''
        num_info = self.vcf['INFO'].columns.shape[0]
        genotype = self.vcf.ix[:,9+num_info:]
        return genotype.xs(gen, level=1, axis=1)

    
    def get_info(self, info):
        ''' Return INFO field of interest e.g. 'AC'
        '''
        return self.vcf['INFO'][info]



class FilterVCF(VCF):
    ''' A VCF file which can be readily filtered
    
    Attributes:
        vcf: Pandas DataFrame VCF 
    '''
    def __init__(self, vcf):
        #self.vcf = vcf
        VCF.__init__(self, vcf, convert=False)


    def subset(self, sams, exclude_ref=False, remove_uncalled=True):
        ''' Subset a multisample VCF by a given samples.
        Args:
            vcf: Pandas DataFrame VCF
            sams: list of samples to subset the vcf for
            exlude_ref: remove variant if all GT values for subset are 0/0
            remove_uncalled: remove variant if all GT values for subset are ./.

        Returns:
            subsetted Pandas DataFrame VCF
        '''
        # split variant and genotype information 
        genotype = self.vcf[sams]
        num_info = self.vcf['INFO'].columns.shape[0]
        variant = self.vcf.ix[:,:8+num_info]

        GT = genotype.xs('GT', level=1, axis=1)
        uncalled= []

        if remove_uncalled:
            uncalled = GT[GT[sams] == './.'].dropna().index.tolist() 

        if exclude_ref:
            uncalled += GT[GT[sams] == '0/0'].dropna().index.tolist() 

        sub = pd.concat([variant, genotype], axis=1)
        self.vcf = sub.drop(uncalled)
        return self.vcf

    
    def filter_genotype(self, minDP=None, minGQ=None, minAB=None):
        ''' Filter for variants in which all the samples in the given vcf 
            meet the minimum genotype values given.
        
        Args:
            minDP: minimum variant depth
            minGQ: minimum genotype quality
            minAB: minimum allele balance
        
        Notes:
            Doesn't handle multiallelic information properly and
            will filter for this first ALT value e.g. if DP = 12,1,100
            it will be filtered out even if minDP=30.
        '''
        # split variant and genotype information
        num_info = self.vcf['INFO'].columns.shape[0]
        variant = self.vcf.ix[:,:8+num_info]
        genotype = self.vcf.ix[:,9+num_info:]

        # store all variants that don't meet the minimum value given for the args here
        below_min = []
        
        if minDP:
            DP = genotype.xs('DP', level=1, axis=1).fillna(0)
            above_min = DP[DP >= minDP] 
            below_min += DP[above_min.isnull().any(axis=1)].index.tolist()
            
        if minGQ:
            GQ = genotype.xs('GQ', level=1, axis=1).fillna(0)
            above_min = GQ[GQ >= minGQ]
            below_min +=  GQ[above_min.isnull().any(axis=1)].index.tolist()
        
        if minAB:
            AB = genotype.xs('AB', level=1, axis=1).fillna(0)
            above_min = AB[AB >= minAB]
            below_min +=  AB[above_min.isnull().any(axis=1)].index.tolist()
        
        # remove variants that don't meet the requirements from the vcf
        self.vcf = self.vcf.drop(below_min)
        return self.vcf
    
    
    def filter_info(self, field, value):
        ''' Filter for variants that are above the given value
            (if value is number) or are equal to the given value
            (if value is string).
            
        Args:
            field: INFO field of interest
            value: string or int value to test the field with
            
        Notes:
            Doesn't handle multiallelic information properly and
            will filter any thing that has this e.g. if AC = 12,34
            it will be filtered out even if value=1.
        '''
        if isinstance(value, int) or isinstance(value, float):
            mask = pd.to_numeric(self.vcf['INFO'][field], errors='coerrce') >= value
            self.vcf = self.vcf[mask]
            return self.vcf
        
        elif isinstance(value, str):
            mask= self.vcf['INFO'][field] == value
            self.vcf = self.vcf[mask]
            return self.vcf
        
    
    def remove_indels(self):
        ''' Remove indels from vcf.
        '''
        alt_mask = (self.vcf.ALT.str.len() == 1) | (self.vcf.ALT.str.contains(','))
        ref_mask = (self.vcf.REF.str.len() == 1) | (self.vcf.REF.str.contains(','))
        return(self.vcf[alt_mask & ref_mask])
    
    
    def biallelic(self):
        ''' Filter for biallelic variants only.
        '''
        self.vcf = self.vcf[self.vcf.ALT.str.split(',').str.len() == 1]
        return self.vcf
    
    
    def multiallelic(self):
        ''' Filter for multiallelic variants only.
        '''
        self.vcf = self.vcf[self.vcf.ALT.str.split(',').str.len() > 1]
        return self.vcf
        
    


In [156]:
v = VCF(o, genotype_level=True, 
    info_level=True, UID=True)


test_vcf = FilterVCF(v.vcf)

#mask = (test_vcf.vcf.REF.str.len() == 1) & (test_vcf.vcf.ALT.str.len() == 1)
#test_vcf.vcf[mask]
#test_vcf.vcf[test_vcf.vcf.ALT.str.split(',').str.len() == 1]
#test_vcf.remove_indels()
#test_vcf.vcf['INFO'].apply(pd.to_numeric)
#test_vcf.get_genotype('DP')['22MI1099']
test_vcf.filter_info('MLEAF', 0.1).head()
#test_vcf.vcf

['AC', 'AF', 'AN', 'BaseQRankSum', 'DB', 'DP', 'DS', 'Dels', 'ExcessHet', 'FS', 'HaplotypeScore', 'InbreedingCoeff', 'MLEAC', 'MLEAF', 'MQ', 'MQ0', 'MQRankSum', 'QD', 'RPA', 'RU', 'ReadPosRankSum', 'SOR', 'STR', 'set']




Unnamed: 0_level_0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,FORMAT,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,INFO,21AI1224,21AI1224,21AI1224,21AI1224,21AI1224,21AI1224,21IB1281,21IB1281,21IB1281,21IB1281,21IB1281,21IB1281,21IL1279,21IL1279,21IL1279,21IL1279,21IL1279,21IL1279,21JH1219,21JH1219,21JH1219,21JH1219,21JH1219,21JH1219,21JL1278,21JL1278,21JL1278,21JL1278,21JL1278,21JL1278,21KC1280,21KC1280,21KC1280,21KC1280,21KC1280,21KC1280,21PB1303,21PB1303,21PB1303,21PB1303,21PB1303,21PB1303,21PS1217,21PS1217,21PS1217,21PS1217,21PS1217,21PS1217,21QA1105,21QA1105,21QA1105,21QA1105,21QA1105,21QA1105,21WG1262,21WG1262,21WG1262,21WG1262,21WG1262,21WG1262,22AW1185,22AW1185,22AW1185,22AW1185,22AW1185,22AW1185,22BB1282,22BB1282,22BB1282,22BB1282,22BB1282,22BB1282,22BW1284,22BW1284,22BW1284,22BW1284,22BW1284,22BW1284,22GK1188,22GK1188,22GK1188,22GK1188,22GK1188,22GK1188,22KF1283,22KF1283,22KF1283,22KF1283,22KF1283,22KF1283,22VW1151,22VW1151,22VW1151,22VW1151,22VW1151,22VW1151,26BH1161,26BH1161,26BH1161,26BH1161,26BH1161,26BH1161,26BS1176,26BS1176,26BS1176,26BS1176,26BS1176,26BS1176,26BW1201,26BW1201,26BW1201,26BW1201,26BW1201,26BW1201,26CB1202,26CB1202,26CB1202,26CB1202,26CB1202,26CB1202,26CS1154,26CS1154,26CS1154,26CS1154,26CS1154,26CS1154,26CS1165,26CS1165,26CS1165,26CS1165,26CS1165,26CS1165,26DA1193,26DA1193,26DA1193,26DA1193,26DA1193,26DA1193,26DL1132,26DL1132,26DL1132,26DL1132,26DL1132,26DL1132,26ED1139,26ED1139,26ED1139,26ED1139,26ED1139,26ED1139,26EE1157,26EE1157,26EE1157,26EE1157,26EE1157,26EE1157,26ES1140,26ES1140,26ES1140,26ES1140,26ES1140,26ES1140,26GH1181,26GH1181,26GH1181,26GH1181,26GH1181,26GH1181,26GW1196,26GW1196,26GW1196,26GW1196,26GW1196,26GW1196,26ID1179,26ID1179,26ID1179,26ID1179,26ID1179,26ID1179,26IT1169,26IT1169,26IT1169,26IT1169,26IT1169,26IT1169,26JB1205,26JB1205,26JB1205,26JB1205,26JB1205,26JB1205,26JG1215,26JG1215,26JG1215,26JG1215,26JG1215,26JG1215,26JM1155,26JM1155,26JM1155,26JM1155,26JM1155,26JM1155,26JW1182,26JW1182,26JW1182,26JW1182,26JW1182,26JW1182,26KE1137,26KE1137,26KE1137,26KE1137,26KE1137,26KE1137,26LG1175,26LG1175,26LG1175,26LG1175,26LG1175,26LG1175,26LM1174,26LM1174,26LM1174,26LM1174,26LM1174,26LM1174,26MH1213,26MH1213,26MH1213,26MH1213,26MH1213,26MH1213,26PH1135,26PH1135,26PH1135,26PH1135,26PH1135,26PH1135,26PL1207,26PL1207,26PL1207,26PL1207,26PL1207,26PL1207,26RG1162,26RG1162,26RG1162,26RG1162,26RG1162,26RG1162,26RN1152,26RN1152,26RN1152,26RN1152,26RN1152,26RN1152,26SH1138,26SH1138,26SH1138,26SH1138,26SH1138,26SH1138,26SH1192,26SH1192,26SH1192,26SH1192,26SH1192,26SH1192,26SS1216,26SS1216,26SS1216,26SS1216,26SS1216,26SS1216,26TS1203,26TS1203,26TS1203,26TS1203,26TS1203,26TS1203,Blank-0-161011,Blank-0-161011,Blank-0-161011,Blank-0-161011,Blank-0-161011,Blank-0-161011
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,AC,AF,AN,BaseQRankSum,DB,DP,DS,Dels,ExcessHet,FS,HaplotypeScore,InbreedingCoeff,MLEAC,MLEAF,MQ0,MQ,MQRankSum,QD,RPA,RU,ReadPosRankSum,SOR,STR,set,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB,GT,AD,DP,GQ,PL,AB
1:2160444-A/G,1,2160444,.,A,G,391.52,PASS,GT:AD:DP:GQ:PL,15,0.341,44,-2.391,0,53,0,0.0,0.4046,0.0,1.2037,-0.1143,12,0.273,0,59.44,0.0,27.96,0,0,-3.524,0.237,0,variant,0/0,10.0,1.0,3.0,335.0,0.0,0/1,12.0,3.0,25.0,25028.0,0.67,0/1,11,2.0,24.0,25024,0.5,./.,,,,,,0/0,31,5.0,9.0,998.0,0.2,./.,,,,,,0/0,30,4.0,6.0,669,0.0,0/0,10,1.0,3.0,330,0.0,./.,,,,,,./.,,,,,,./.,,,,,,./.,.,1.0,,,,0/0,10.0,1.0,3.0,334.0,0.0,0/1,21.0,3.0,23.0,23056.0,0.33,./.,,,,,,0/0,10,1.0,3.0,334,0.0,./.,,,,,,1/1,1,1.0,3.0,3230,1.0,0/1,11,2.0,26.0,26029,0.5,./.,,,,,,0/0,20.0,2.0,3.0,331.0,0.0,./.,.,2.0,,,,./.,,,,,,./.,,,,,,./.,,,,,,1/1,1,1.0,3.0,3130,1.0,0/1,11.0,2.0,25.0,25029.0,0.5,./.,.,1.0,,,,./.,,,,,,0/1,11.0,5.0,25.0,25029.0,0.2,./.,,,,,,./.,,,,,,./.,,,,,,0/0,10.0,2.0,3.0,335.0,0.0,./.,,,,,,./.,,,,,,0/0,10,1.0,3.0,335,0.0,0/1,11,2.0,25.0,25028,0.5,1/1,1,1.0,3.0,3130,1.0,./.,,,,,,./.,,,,,,0/0,10.0,2.0,3.0,335.0,0.0,1/1,1,1.0,3.0,3130,1.0,0/0,20,2.0,3.0,335,0.0,./.,,,,,,./.,,,,,,./.,,,,,,./.,,,,,
1:2234376-TG/T,1,2234376,.,TG,T,84.3,PASS,GT:AD:DP:GQ:PL,9,0.145,62,0.395,0,52,0,0.0,1.5002,0.0,0.0,-0.161,9,0.145,0,60.0,0.0,21.07,32,G,2.467,0.05,1,variant2,1/1,1.0,1.0,3.0,3530.0,1.0,./.,,,,,,0/1,22,4.0,34.0,34072,0.5,./.,,,,,,0/0,10,1.0,3.0,343.0,0.0,0/0,10.0,1.0,3.0,344.0,0.0,0/0,10,1.0,3.0,333,0.0,0/0,10,1.0,3.0,341,0.0,0/1,11.0,2.0,30.0,30037.0,0.5,./.,,,,,,0/0,10.0,1.0,3.0,345.0,0.0,0/0,10,1.0,3.0,327.0,0.0,./.,,,,,,./.,,,,,,./.,,,,,,0/1,11,2.0,18.0,29018,0.5,0/0,10.0,1.0,3.0,342.0,0.0,0/0,10,1.0,3.0,336,0.0,0/0,10,1.0,3.0,317,0.0,0/0,20.0,2.0,6.0,661.0,0.0,./.,,,,,,0/0,10,1.0,3.0,344.0,0.0,0/0,10.0,1.0,3.0,343.0,0.0,0/0,10.0,1.0,3.0,344.0,0.0,./.,,,,,,0/0,20,2.0,6.0,683,0.0,./.,,,,,,0/0,20,2.0,6.0,686.0,0.0,0/0,10.0,1.0,3.0,343.0,0.0,./.,,,,,,0/1,11.0,2.0,2.0,2026.0,0.5,0/0,10.0,1.0,3.0,343.0,0.0,./.,,,,,,./.,,,,,,0/0,20.0,2.0,6.0,689.0,0.0,0/0,40.0,4.0,12.0,12171.0,0.0,0/0,10,1.0,3.0,345,0.0,0/0,10,1.0,3.0,343,0.0,0/0,10,1.0,3.0,313,0.0,./.,,,,,,0/1,11.0,2.0,1.0,1039.0,0.5,./.,,,,,,0/1,11,2.0,18.0,28018,0.5,0/1,31,4.0,3.0,30114,0.25,./.,,,,,,./.,,,,,,0/0,40.0,4.0,12.0,12129.0,0.0,./.,,,,,
1:2234384-A/C,1,2234384,.,A,C,109.38,PASS,GT:AD:DP:GQ:PL,8,0.138,58,-4.308,0,60,0,0.0,0.0054,0.0,1.8463,0.1015,6,0.103,0,60.0,0.0,28.6,0,0,2.155,0.053,0,variant,./.,,,,,,./.,,,,,,0/0,31,4.0,9.0,9106,0.25,./.,,,,,,./.,.,1.0,,,,0/0,10.0,1.0,3.0,335.0,0.0,1/1,1,1.0,3.0,3030,1.0,0/0,10,1.0,3.0,335,0.0,./.,,,,,,./.,,,,,,0/0,20.0,2.0,6.0,671.0,0.0,0/0,10,1.0,3.0,335.0,0.0,./.,,,,,,./.,,,,,,./.,,,,,,0/0,20,2.0,6.0,671,0.0,0/0,10.0,1.0,3.0,335.0,0.0,0/0,20,2.0,6.0,671,0.0,0/0,10,1.0,3.0,335,0.0,0/1,11.0,2.0,24.0,24029.0,0.5,0/0,10.0,1.0,3.0,335.0,0.0,./.,,,,,,0/0,10.0,1.0,3.0,335.0,0.0,1/1,1.0,1.0,3.0,3030.0,1.0,./.,,,,,,0/0,20,2.0,3.0,331,0.0,./.,,,,,,0/0,20,2.0,6.0,671.0,0.0,0/0,10.0,1.0,3.0,335.0,0.0,./.,,,,,,1/1,2.0,2.0,3.0,3030.0,1.0,0/0,30.0,3.0,6.0,668.0,0.0,./.,,,,,,./.,,,,,,0/0,20.0,2.0,6.0,669.0,0.0,0/0,40.0,4.0,12.0,12138.0,0.0,0/0,10,1.0,3.0,335,0.0,0/0,10,1.0,3.0,335,0.0,0/0,30,3.0,9.0,9107,0.0,./.,,,,,,0/0,10.0,2.0,3.0,334.0,0.0,./.,,,,,,0/0,20,2.0,6.0,671,0.0,0/0,31,4.0,9.0,997,0.25,0/0,10.0,1.0,3.0,335.0,0.0,./.,,,,,,0/1,21.0,4.0,24.0,24028.0,0.25,./.,,,,,
1:2234385-C/T,1,2234385,.,C,T,9953.67,PASS,GT:AD:DP:GQ:PL,50,0.532,94,5.852,0,540,0,0.0,99.7449,0.0,1.4681,-0.8584,50,0.532,0,60.0,0.0,18.5,0,0,-7.493,1.878,0,variant,0/1,36.0,9.0,64.0,162064.0,0.67,0/1,23.0,5.0,47.0,77047.0,0.6,0/1,613,19.0,99.0,3460124,0.68,0/1,29.0,11.0,30.0,251030.0,0.82,0/1,38,11.0,58.0,210058.0,0.73,0/1,26.0,8.0,37.0,159037.0,0.75,0/1,78,15.0,99.0,2000167,0.53,0/1,210,12.0,23.0,272023,0.83,0/1,311.0,14.0,20.0,297020.0,0.79,1/1,10.0,10.0,30.0,317300.0,1.0,0/1,57.0,12.0,99.0,1790120.0,0.58,0/1,18,9.0,4.0,21804.0,0.89,1/1,4.0,4.0,12.0,126120.0,1.0,0/1,510.0,15.0,99.0,2690110.0,0.67,0/1,83.0,11.0,59.0,590212.0,0.27,0/1,26,9.0,37.0,162037,0.67,0/1,58.0,13.0,86.0,209086.0,0.62,0/1,312,15.0,50.0,331050,0.8,0/1,411,15.0,78.0,304078,0.73,0/1,611.0,17.0,99.0,2850131.0,0.65,0/1,38.0,11.0,59.0,212059.0,0.73,0/1,412,16.0,71.0,319071.0,0.75,0/1,33.0,6.0,74.0,74074.0,0.5,0/1,16.0,7.0,8.0,16608.0,0.86,0/1,25.0,7.0,42.0,104042.0,0.71,0/1,312,15.0,47.0,325047,0.8,0/1,39.0,12.0,56.0,239056.0,0.75,0/1,29,11.0,30.0,253030.0,0.82,0/1,16.0,7.0,10.0,165010.0,0.86,0/1,312.0,15.0,50.0,294050.0,0.8,0/1,512.0,17.0,97.0,327097.0,0.71,0/1,49.0,14.0,82.0,236082.0,0.64,0/1,44.0,8.0,98.0,98098.0,0.5,1/1,8.0,8.0,24.0,245240.0,1.0,0/1,45.0,9.0,93.0,128093.0,0.56,0/1,32.0,5.0,49.0,52049.0,0.4,0/1,66,12.0,99.0,1540152,0.5,0/1,14,5.0,16.0,107016,0.8,0/1,68,14.0,99.0,2030144,0.57,0/1,16.0,7.0,10.0,168010.0,0.86,0/1,511.0,16.0,99.0,2890103.0,0.69,0/1,38.0,11.0,60.0,219060.0,0.73,0/1,39,12.0,55.0,241055,0.75,0/1,45,9.0,90.0,127090,0.56,0/1,417.0,21.0,61.0,471061.0,0.81,0/1,76.0,13.0,99.0,1470147.0,0.46,0/1,612.0,18.0,98.0,317098.0,0.67,./.,,,,,
1:2234903-C/T,1,2234903,rs2256178,C,T,44102.29,PASS,GT:AD:DP:GQ:PL,20,0.213,94,57.169,1,8252,0,0.0,1.052,0.0,12.4685,0.1374,20,0.213,0,59.97,0.043,14.87,0,0,-0.038,0.723,0,variant,0/1,7196.0,167.0,63.0,2900063.0,0.57,0/1,9177.0,170.0,99.0,21970273.0,0.45,0/1,9674,172.0,55.0,2262055,0.43,0/0,1820.0,182.0,99.0,108919.0,0.0,1/1,71101,172.0,99.0,31701630.0,0.59,0/0,1781.0,179.0,99.0,1281236.0,0.01,1/1,9088,178.0,26.0,2623260,0.49,0/0,1830,183.0,81.0,81685,0.0,0/1,10568.0,173.0,99.0,19850236.0,0.39,0/0,1760.0,176.0,99.0,4923774.0,0.0,0/0,1820.0,182.0,93.0,93763.0,0.0,0/0,1690,169.0,51.0,51408.0,0.0,0/0,1700.0,171.0,66.0,66539.0,0.0,0/1,84100.0,184.0,46.0,2868046.0,0.54,0/0,1660.0,166.0,51.0,51438.0,0.0,0/1,8696,183.0,53.0,2814053,0.52,0/0,1700.0,171.0,30.0,30243.0,0.0,0/0,1841,185.0,99.0,1101134,0.01,0/1,79101,180.0,99.0,29440294,0.56,0/1,82104.0,186.0,49.0,3033049.0,0.56,0/0,1790.0,179.0,75.0,75635.0,0.0,0/0,1760,176.0,57.0,57481.0,0.0,0/1,8884.0,172.0,72.0,2488072.0,0.49,0/0,1770.0,178.0,87.0,87760.0,0.0,0/0,1771.0,178.0,23.0,23421.0,0.01,0/1,8586,171.0,4.0,260804,0.5,1/1,8486.0,171.0,94.0,2661940.0,0.5,0/0,1761,177.0,76.0,76910.0,0.01,0/1,10269.0,171.0,99.0,20190255.0,0.4,0/0,1760.0,176.0,81.0,81681.0,0.0,0/1,8687.0,173.0,99.0,24820111.0,0.5,0/0,1760.0,176.0,99.0,1431117.0,0.0,0/1,7196.0,167.0,99.0,27550109.0,0.57,0/0,1781.0,180.0,99.0,1241199.0,0.01,0/0,1810.0,181.0,99.0,114928.0,0.0,0/0,1770.0,178.0,99.0,1371104.0,0.0,0/1,9587,182.0,99.0,24750139,0.48,0/0,1730,173.0,99.0,1261006,0.0,0/0,1730,173.0,81.0,81683,0.0,0/0,1790.0,179.0,99.0,102853.0,0.0,0/0,1710.0,171.0,57.0,57460.0,0.0,0/0,1720.0,172.0,81.0,81686.0,0.0,0/0,1700,170.0,81.0,81627,0.0,0/0,1680,168.0,72.0,72616,0.0,0/0,1740.0,175.0,81.0,81657.0,0.0,0/0,1761.0,177.0,40.0,40579.0,0.01,0/0,1781.0,179.0,78.0,78862.0,0.01,./.,,,,,


In [28]:

v = VCF(t, genotype_level=True, 
    info_level=True, UID=True)


test_vcf = FilterVCF(v.vcf)
test_vcf.get_genotype('DP')

#test_vcf.filter_genotype(minDP=10, minAB=0.1)#, minAB=0.1)
test_vcf.vcf#.loc[['1:1000-A/G']]

test_vcf.get_genotype('DP')

test_vcf.subset(['22MI1099'], remove_uncalled=True, exclude_ref=True)
#test_vcf.genotype(minDP=12, minGQ=51)
test_vcf.vcf



Unnamed: 0_level_0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,FORMAT,INFO,INFO,22MI1099,22MI1099,22MI1099,22MI1099,22MI1099,22MI1099
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,AC,LOLZ,GT,AD,DP,GQ,PL,AB
1:2234385-C/T,1,2234385,.,C,T,9953.67,PASS,GT:AD:DP:GQ:PL,1.0,90,0/1,101,11.0,3.0,30197,0.09
1:2235243-C/T,1,2235243,.,C,T,82.06,PASS,GT:AD:DP:GQ:PL,0.0,0,0/1,40100,140.0,50.0,15001888,0.71
1:2235901-A/G,1,2235901,.,A,G,24.89,LowQual,GT:AD:DP:GQ:PL,0.0,0,1/1,200200,400.0,90.0,1000100,0.5
"1:2239999-A/G,T",1,2239999,.,A,"G,T",24.89,LowQual,GT:AD:DP:GQ:PL,0.0,0,0/1,10020100,220.0,90.0,10005001000,0.09
1:2235501-A/GGT,1,2235501,.,A,GGT,24.89,LowQual,GT:AD:DP:GQ:PL,0.0,0,1/1,200200,400.0,90.0,1000100,0.5
1:2239901-GA/G,1,2239901,.,GA,G,24.89,LowQual,GT:AD:DP:GQ:PL,0.0,0,1/1,200200,400.0,90.0,1000100,0.5


1:1000-A/G         9 
1:2234385-C/T      90
1:2235243-C/T      0 
1:2235792-A/G      0 
1:2235901-A/G      0 
1:2239999-A/G,T    0 
Name: LOLZ, dtype: object