# Input file descriptions:

The data required are

- Allele counts for various sampling units with the following requirements:
    - This should be a matrix with one row per geographical sampling site and one column per locus
    - The markers should be bi-allelic co-dominant or dominant.
    - The sampling units can be either individuals or groups of individuals observed at the same site
    - Missing data are allowed
- Sample sizes
    - This should be a matrix with one row per geographical sampling site and one column per locus
    - This should be haploid sample sizes, so two times the number of individuals for diploid organisms and so on.
- Spatial coordinates of the sampling units.
    - This should be a matrix with two columns and one row per sampling site. It can be Lon-Lat coordinates or UTM coordinates.
- Measurements of environmental variables at the same geographical locations as the genetic data.
    - This should be a matrix with one row per sampling site and one column per environmental variable.


In [1]:
# %matplotlib inline
%matplotlib
import re
from collections import defaultdict


import numpy as np
import pandas as pd
import seaborn as sns

import ggplot as gp
import matplotlib.pyplot as plt

import vcf

Using matplotlib backend: TkAgg


In [2]:
# Paths
# Source VCF holding the per-individual SNP calls.
vcf_path = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/tsetseFINAL_14Oct2014_f2_53.recode.renamed_scaffolds.maf0_05.OT_MS_NB_indv.recode.vcf"

# Directory that collects the four gINLAnd input files described above.
ginland_dir = "/home/gus/data/ddrad/gINLAnd_input"

# One path per gINLAnd input, all rooted at ginland_dir.
allele_count_path = "{0}/allele_count".format(ginland_dir)
sample_sizes_path = "{0}/sample_sizes".format(ginland_dir)
site_coords_path = "{0}/site_coords".format(ginland_dir)
environmental_data_path = "{0}/environmental_data".format(ginland_dir)

In [3]:
# helper functions

def nested_defaultdict():
    """Return a dict that creates nested dicts of the same kind on demand.

    Looking up a missing key stores and returns a fresh
    ``nested_defaultdict()``, so ``d[a][b] = value`` works without creating
    ``d[a]`` first.

    Returns:
        collections.defaultdict: empty, with this function as its factory.

    """
    tree = defaultdict(nested_defaultdict)
    return tree

# Prepare allele_count file

- Allele counts for various sampling units with the following requirements:
    - This should be a matrix with one row per geographical sampling site and one column per locus
    - The markers should be bi-allelic co-dominant or dominant.
    - The sampling units can be either individuals or groups of individuals observed at the same site
    - Missing data are allowed
    
## Notes:

- Each "cell" in the table is the total count of the alternative (ALT) allele at that locus, summed over all members of the sampling site. Each individual contributes:
    - homo-REF = 0
    - hetero = 1
    - homo-ALT = 2

-----------

# Prepare sample_sizes file

- Sample sizes
    - This should be a matrix with one row per geographical sampling site and one column per locus
    - This should be haploid sample sizes, so two times the number of individuals for diploid organisms and so on.
    
## Notes:

- This is important because not all individuals have called SNPs at all loci, so the `allele_count` data must be paired with the number of chromosomes actually sampled at each locus and site to get the ALT:REF ratio correct.

--------------

In [63]:
# Module-level reader over the source VCF, used by the interactive
# inspection cells that follow (the file handle is left open for them).
vcf_reader = vcf.Reader(open(vcf_path, 'r'))

In [6]:
# Pull the next record off the reader to inspect its structure.
# The builtin next() and the call form of print behave the same on Python 2
# and also run on Python 3, unlike `.next()` and the print statement.
r = next(vcf_reader)
print(r)

Record(CHROM=Scaffold0, POS=13388, REF=T, ALT=[C])


In [8]:
# First per-sample call of the record; the bare name on the last line makes
# the notebook display it.
s = r.samples[0]
s

Call(sample=MS11_0001, CallData(GT=1/1, PL=[255, 72, 0], DP=24, SP=0, GQ=73))

In [89]:
# Display the call's `called` attribute (presumably PyVCF's flag for
# "genotype is not missing" -- confirm against the PyVCF docs).
s.called

True

In [87]:
def vcf_to_allele_count_and_sample_sizes(vcf_path):
    """Generate dataframes with per-locus allele_count and sample_sizes gINLAnd data.

    Every record of the VCF becomes one locus keyed ``"CHROM:POS"``. For each
    locus and sampling site the ALT allele counts of all genotyped samples are
    summed, and each genotyped sample adds 2 to the site's sample size.
    Samples without a genotype add to neither table.

    Args:
        vcf_path (str): Path to source VCF

    Returns:
        allele_count (pandas.DataFrame): ALT allele counts, built from a
            ``{locus: {site: count}}`` mapping (loci as columns, sites as rows).
        sample_sizes (pandas.DataFrame): haploid sample sizes with the same
            layout as ``allele_count``.

    """
    allele_count_dict = nested_defaultdict()
    sample_sizes_dict = nested_defaultdict()
    site_members = map_site_members_to_site_code(vcf_path=vcf_path)
    site_codes = tuple(set(site_members.values()))

    # Records are consumed while iterating the reader, so the file has to
    # stay open for the whole loop; `with` closes it afterwards.
    with open(vcf_path, 'r') as vcf_file:
        vcf_reader = vcf.Reader(vcf_file)

        for snp_rec in vcf_reader:
            # Start every site at zero so sites with no genotyped sample still
            # get an entry for this locus.
            chrom_pos = init_nested_dicts_for_locus(allele_count_dict, sample_sizes_dict, snp_rec, site_codes)

            for sample in snp_rec.samples:
                sample_site = site_members[sample.sample]

                try:
                    alt_count = sum_hap_gt(sample=sample)
                except TypeError:
                    # sum_hap_gt raises TypeError when the sample has no
                    # genotype (GT is None): treat it as missing data.
                    continue

                allele_count_dict[chrom_pos][sample_site] += alt_count
                # Haploid sample size: a genotyped diploid individual
                # contributes two chromosomes.
                sample_sizes_dict[chrom_pos][sample_site] += 2

    allele_count = pd.DataFrame(data=allele_count_dict)
    sample_sizes = pd.DataFrame(data=sample_sizes_dict)

    return allele_count, sample_sizes


def map_site_members_to_site_code(vcf_path):
    """Map every sample name in the VCF to its sampling-site code.

    The site code is the first two characters of the sample name
    (e.g. ``MS11_0001`` -> ``MS``).

    Args:
        vcf_path (str): Path to source VCF

    Returns:
        site_members (defaultdict): sample name as `key` and two-character
            site code as `value`. Because it is a ``defaultdict(str)``,
            looking up an unknown sample name yields ``''`` instead of
            raising ``KeyError``.

    """
    # Only the sample names from the header are needed, so the file can be
    # closed as soon as the reader has been constructed.
    with open(vcf_path, 'r') as vcf_file:
        sample_names = vcf.Reader(vcf_file).samples

    site_members = defaultdict(str)

    for sample in sample_names:
        site_members[sample] = sample[:2]

    return site_members


def init_nested_dicts_for_locus(allele_count_dict, sample_sizes_dict, snp_rec, site_codes):
    """Zero both per-site tallies for the locus described by ``snp_rec``.

    Args:
        allele_count_dict (dict): ``{locus: {site: count}}`` table, updated in place.
        sample_sizes_dict (dict): ``{locus: {site: size}}`` table, updated in place.
        snp_rec: record exposing ``CHROM`` and ``POS`` attributes.
        site_codes (iterable): site codes to initialise for this locus.

    Returns:
        str: the locus key, formatted as ``"CHROM:POS"``.

    """
    locus_key = "{0}:{1}".format(snp_rec.CHROM, snp_rec.POS)

    for site_code in site_codes:
        for tally in (allele_count_dict, sample_sizes_dict):
            tally[locus_key][site_code] = 0

    return locus_key


def sum_hap_gt(sample):
    """Count the ALT alleles in one sample's bi-allelic genotype.

    Args:
        sample: call object exposing its genotype string as
            ``sample.data.GT`` (e.g. ``"0/1"`` or ``"0|1"``).

    Returns:
        int: number of ALT haplotypes (0 = homo-REF, 1 = hetero, 2 = homo-ALT
            for a diploid call).

    Raises:
        TypeError: if the genotype is missing (``GT`` is ``None``) or is not a
            string. Callers rely on this to skip samples without a call.
        ValueError: if the genotype has no allele separator, holds a
            non-integer allele, or holds an allele other than 0 or 1.

    """
    gt = sample.data.GT

    # Explicit checks instead of `assert`: asserts are stripped under `-O`,
    # which would turn a missing genotype into an AttributeError that callers
    # do not expect.
    if not hasattr(gt, 'replace'):
        raise TypeError("genotype is missing or not a string: {0!r}".format(gt))

    # Phase does not affect the ALT count, so treat "0|1" like "0/1".
    unphased = gt.replace('|', '/')
    if '/' not in unphased:
        raise ValueError("genotype has no allele separator: {0!r}".format(gt))

    hap_gts = [int(hap) for hap in unphased.split('/')]
    if not set(hap_gts).issubset((0, 1)):
        raise ValueError("genotype is not bi-allelic: {0!r}".format(gt))

    return sum(hap_gts)

In [86]:
# The converter returns both tables at once; unpack them so `allele_count`
# is the DataFrame itself rather than a tuple.
allele_count, sample_sizes = vcf_to_allele_count_and_sample_sizes(vcf_path=vcf_path)