# Kelly and Hughes (2019) EDA and Covariance Analysis

## Setup

In [1]:
import os
import sys
import glob
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import re
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

## Variant Data Loading

### Load in TSV data and reshape

In [64]:
# Sequence IDs of interest: the autosomal arms, and the same set plus X.
keep_autos = ['2L', '2R', '3L', '3R']
keep_seqids = [*keep_autos, 'X']

In [65]:
# Load the tab-separated table, then pull out the "<sample>_pr" and
# "<sample>_reads" columns. Each matrix is transposed so that rows are
# samples (in the order listed here) and columns are the table's rows.
samples = ['A0', 'A7', 'B0', 'B7', 'C0', 'C7']
rawdata = pd.read_csv('../data/kelly_hughes_2019/kelly_hughes_2019.tsv', sep='\t')
freqs = rawdata.loc[:, [f"{samp}_pr" for samp in samples]].values.T
depths = rawdata.loc[:, [f"{samp}_reads" for samp in samples]].values.T

In [66]:
# Register one interval entry per table row: the first column is the
# sequence name (with the 'Scf_' prefix stripped) and the second column is
# the position, coerced to int.
gi = GenomicIntervals()
for scaffold, pos in zip(rawdata.iloc[:, 0], rawdata.iloc[:, 1]):
    seqid = scaffold.replace('Scf_', '')
    gi.append(seqid, int(pos))

In [67]:
# Populate sequence lengths on `gi`; `gi.seqlens` is read later when tiles
# are built. NOTE(review): presumably each length is inferred from the
# largest appended position per seqid — confirm in cvtk.gintervals.
gi.infer_seqlens()

### Study Design

The design is three replicates (A, B, C) and two timepoints (0 and 7). The experiment spans 14 generations; the "7" in the sample names refers to the seven months over which the evolution experiment was run (see p. 945).

In [68]:
# Sample labels as (replicate, timepoint) pairs, grouped by replicate:
# each of A, B, C is paired with timepoints 0 and 7, in that order.
samples = [(rep, timepoint) for rep in 'ABC' for timepoint in (0, 7)]


## Replicate Covariance Analysis

In [69]:
# Build tiles over the sequences described by `gi.seqlens`.
# NOTE(review): width is passed as the float 1e6 (presumably 1 Mb in base
# pairs) — confirm that from_tiles accepts a non-integer width.
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=1e6)

In [70]:
# Combine the tiles, frequency and depth matrices, positions (`gi`), and
# sample labels into the object used for every covariance estimate below.
# NOTE(review): diploids=1000 is presumably the number of diploid
# individuals per sample — confirm against the paper and the cvtk docs.
d = TiledTemporalFreqs(tiles, freqs=freqs, depths=depths, diploids=1000, gintervals=gi, samples=samples)

In [71]:
# Display the sample labels held by `d` to check their order.
d.samples

[('A', 0), ('A', 7), ('B', 0), ('B', 7), ('C', 0), ('C', 7)]

In [101]:
# Bootstrap the covariances with B=5000 resamples, without averaging over
# replicates. NOTE(review): the 3x3x3 array printed below looks like
# (lower, middle, upper) bounds over a replicate-by-replicate matrix —
# confirm the axis order and the resampling unit in cvtk.
covs_cis = d.bootstrap_cov(B=5000, progress_bar=True, average_replicates=False)

HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [102]:
# Display the bootstrap covariance result.
covs_cis

array([[[0.01765399, 0.00553288, 0.00559607],
        [0.00553288, 0.01771579, 0.00526536],
        [0.00559607, 0.00526536, 0.01738812]],

       [[0.01900866, 0.00678605, 0.00682915],
        [0.00678605, 0.01894591, 0.00654883],
        [0.00682915, 0.00654883, 0.01847673]],

       [[0.02021195, 0.00783951, 0.00784125],
        [0.00783951, 0.02006767, 0.00759631],
        [0.00784125, 0.00759631, 0.01944406]]])

In [103]:
# Persist the bootstrap covariance result in .npy format; np.save opens
# and closes the target file itself when given a path.
np.save('../data/kelly_hughes_2019/kelly_hughes_2019_covs_bootstrap.npy', covs_cis)

### Bootstrap the Convergence Correlation

In [104]:
# Point estimate of the convergence correlation; the bare name on the last
# line displays it as the cell output.
conv_corr = d.convergence_corr()
conv_corr

array([[[0.37148497]]])

In [None]:
# Persist the convergence-correlation point estimate in .npy format;
# np.save opens and closes the target file itself when given a path.
np.save('../data/kelly_hughes_2019/kelly_hughes_2019_conv_corr.npy', conv_corr)

In [106]:
# Bootstrap the convergence correlation with B=5000 resamples.
# NOTE(review): the three values printed below look like
# (lower, middle, upper) bounds — confirm against cvtk.
conv_corr_cis = d.bootstrap_convergence_corr(B=5000, progress_bar=True)

HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [107]:
# Display the bootstrap convergence-correlation result.
conv_corr_cis

array([[[0.31386188]],

       [[0.35899925]],

       [[0.40013506]]])

In [108]:
# Persist the bootstrap convergence-correlation result in .npy format;
# np.save opens and closes the target file itself when given a path.
np.save('../data/kelly_hughes_2019/kelly_hughes_2019_conv_corr_cis.npy', conv_corr_cis)

In [109]:
# Covariance matrix computed with bias correction enabled.
# NOTE(review): the `gw_` prefix suggests this is the genome-wide estimate
# (as opposed to per-tile) — confirm against calc_cov's defaults.
gw_covs = d.calc_cov(bias_correction=True)

In [110]:
# Display the bias-corrected covariance matrix.
gw_covs

array([[0.01929836, 0.00717872, 0.00720038],
       [0.00717872, 0.01931026, 0.00693234],
       [0.00720038, 0.00693234, 0.01876192]])