# Castro et al. (2019) Data Processing and Analysis

## Setup

In [1]:
import os
import sys
import glob
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import re
import pickle
from collections import Counter
from functools import reduce

import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

# Varianta Data Loading

### Load in VCF data

In [5]:
vcf = VCFFile('../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz')

reading file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz'...
file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz' loaded.
total time to load VCF file: 9.54860243399938 mins.


Remove fixed sites — those that are not polymorphic in any samples / timepoints. These just needlessly shrink the covariance towards zero.

### Sample Data

The samples names to line/generation mapping was not in a simple text file, but I found the relevant information in the vcftools commands that were at the beginning of the file `Longshanks_F0F17.summary_stats.tar.gz`. From this I created `samples.txt`, which is read in and parsed below.

In [6]:
samples = pd.read_csv("../data/castro_et_al_2019/samples.txt", header=None, names = ('line', 'individual'))
sample_map = {k:v for k, v in zip(samples['individual'], samples['line'])}

subpop_indices = defaultdict(list)
for i, k in enumerate(vcf.samples):
    subpop_indices[sample_map[k.decode()]].append(i)

From this, we can map the `vcf.geno_mat` table to subpopulation counts. 

In [7]:
counts_mat = vcf.count_alleles_subpops(subpop_indices)

  self.mat = np.stack(counts_mat.values())


In [8]:
vcf.subpops

dict_keys(['Ctrl_F17', 'LS1_F17', 'LS2_F17', 'Ctrl_F0', 'LS1_F0', 'LS2_F0'])

Now we count the number of diploids in each sample.

In [9]:
ndiploids = [Counter(sample_map.values())[k] for k in vcf.subpops]

In [10]:
def parse_samples(x):
    line, gen = x.split('_')
    return (line, gen[1:])

design = [parse_samples(x) for x in vcf.subpops]

In [11]:
freq_mat_all = vcf.calc_freqs()

In [12]:
print("number of loci: ", freq_mat_all.shape[1])

number of loci:  31944210


With the frequencies calculated, now we filter out all non-segregating sites.

In [13]:
vcf.remove_fixed()
freq_mat = vcf.calc_freqs()
print("number of loci: ", freq_mat.shape[1])
print("loci not segregating removed: ", freq_mat_all.shape[1] - freq_mat.shape[1])

number of loci:  8162172
loci not segregating removed:  23782038


In [14]:
gi = vcf.build_gintervals()

## Replicate Covariance Analysis

In [15]:
tile_width = 10e6
gi.infer_seqlens()
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=tile_width)

In [16]:
d = TiledTemporalFreqs(tiles, freqs=freq_mat, depths=vcf.N, diploids=ndiploids, samples=design, gintervals=gi)

In [17]:
d.samples

[('Ctrl', '0'),
 ('Ctrl', '17'),
 ('LS1', '0'),
 ('LS1', '17'),
 ('LS2', '0'),
 ('LS2', '17')]

In [18]:
autosomes = list(set(gi.intervals.keys()) - set('chrX'))

In [19]:
covs_cis = d.bootstrap_cov(B=5000, keep_seqids=autosomes, average_replicates=False, progress_bar=True)

HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [20]:
covs_cis

array([[[ 0.07777817, -0.01893739, -0.01577241],
        [-0.01893739,  0.10934695,  0.01307074],
        [-0.01577241,  0.01307074,  0.12341267]],

       [[ 0.10294169, -0.0069911 , -0.00235628],
        [-0.0069911 ,  0.13691365,  0.02631751],
        [-0.00235628,  0.02631751,  0.16493677]],

       [[ 0.11936342,  0.00654123,  0.01414081],
        [ 0.00654123,  0.15509356,  0.03939773],
        [ 0.01414081,  0.03939773,  0.19021948]]])

In [21]:
with open('../data/castro_et_al_2019/covs_bootstrap_10e6.npy', 'wb') as f:
    np.save(f, covs_cis)

### Bootstrap the Convergence Correlation

In [22]:
convergence_corr = d.bootstrap_convergence_corr(B=5000, progress_bar=True)

  sdmat = np.sqrt(varmat[:, tr, tc, :, :].mean(axis=1))


HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [23]:
convergence_corr**2

array([[[[0.00019232]]],


       [[[0.00179993]]],


       [[[0.01122152]]]])

## Analysis Excluding Chromosomes 5 and 10

In [38]:
autosomes_sans_chr5_and_chr10 = [chr for chr in autosomes if chr not in ('chr5', 'chr10')]

In [40]:
covs_sans_chr5_and_chr10_cis = d.bootstrap_cov(B=5000, keep_seqids=autosomes_sans_chr5_and_chr10, 
                                              average_replicates=False, progress_bar=True)

  mean_hets = np.nanmean(hets, axis=freqs.ndim-1)
  avg = a.mean(axis)
  cov = np.cov(deltas, bias=True)
  c *= np.true_divide(1, fact)
  ave_bias += np.nanmean(0.5 * hets * (diploid_correction + depth_correction), axis=2)


HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…

In [41]:
covs_sans_chr5_and_chr10_cis

array([[[ 0.07912359, -0.02006789, -0.01685076],
        [-0.02006789,  0.10007415,  0.01004145],
        [-0.01685076,  0.01004145,  0.12769844]],

       [[ 0.10678867, -0.00736898, -0.00733722],
        [-0.00736898,  0.13543516,  0.02544975],
        [-0.00733722,  0.02544975,  0.14899192]],

       [[ 0.12436063,  0.01076925,  0.00994585],
        [ 0.01076925,  0.15168951,  0.03887876],
        [ 0.00994585,  0.03887876,  0.17238601]]])

In [42]:
with open('../data/castro_et_al_2019/covs_sans_chr5_and_chr10_bootstrap_10e6.npy', 'wb') as f:
    np.save(f, covs_sans_chr5_and_chr10_cis)