# Kelly and Hughes (2019) EDA and Covariance Analysis

## Setup

In [1]:
import os
import sys
import glob
# Put the parent of the current working directory on the import path
# (presumably so the repository-local `cvtk` package resolves — confirm layout).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import re
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
# Notebook-wide figure defaults: 8 x 4 figure size, rendered at 200 dpi.
mpl.rcParams.update({
    'figure.figsize': (8.0, 4.0),
    'figure.dpi': 200,
})

## Variant Data Loading

### Load in TSV data and reshape

In [5]:
# Sequence IDs retained for analysis; `keep_autos` is the same list without 'X'
# (presumably the autosomal arms only).
keep_seqids = ['2L', '2R', '3L', '3R', 'X']
keep_autos = [seqid for seqid in keep_seqids if seqid != 'X']

In [6]:
# Read the tab-separated table, then pull out each sample's `<sample>_pr` and
# `<sample>_reads` columns (presumably allele frequencies and read depths).
# Transposing leaves one row per sample and one column per table row.
rawdata = pd.read_csv('../data/kelly_hughes_2019/kelly_hughes_2019.tsv', sep='\t')
samples = ['A0', 'A7', 'B0', 'B7', 'C0', 'C7']
freqs = np.transpose(rawdata[[samp + '_pr' for samp in samples]].values)
depths = np.transpose(rawdata[[samp + '_reads' for samp in samples]].values)

In [7]:
# Register every table row as a (sequence ID, position) entry. Only the first two
# columns are read: the sequence name, with any 'Scf_' text removed, and the
# position cast to int.
gi = GenomicIntervals()
for scaffold, position in zip(rawdata.iloc[:, 0], rawdata.iloc[:, 1]):
    gi.append(scaffold.replace('Scf_', ''), int(position))

In [8]:
# `gi` was created without sequence lengths, so have it infer them from the
# appended positions; `gi.seqlens` is read later to build the tiles.
# NOTE(review): presumably uses the largest position per seqid — confirm in cvtk.
gi.infer_seqlens()

### Study Design

The design is three replicates (A, B, and C) and two timepoints (0 and 7). There are 14 generations — the "7" comes from the seven months that the evolution experiment was run (see p. 945 of Kelly and Hughes 2019).

In [9]:
# (replicate, timepoint) labels, in the same A0, A7, B0, B7, C0, C7 order as the
# rows of `freqs` and `depths`.
samples = [(rep, timepoint) for rep in ('A', 'B', 'C') for timepoint in (0, 7)]

## Replicate Covariance Analysis

In [10]:
# Tile each sequence into fixed-width windows using the inferred sequence lengths.
# NOTE(review): width is passed as the float 1e6 (presumably 1 Mb in base pairs) —
# confirm that from_tiles accepts a non-integer width.
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=1e6)

In [11]:
# Bundle the frequency and depth matrices with their positions (`gi`), the tiling
# (`tiles`) and the (replicate, timepoint) labels into a single tiled object.
# NOTE(review): `diploids=1000` is presumably the number of diploid individuals
# per sample — confirm against the study and the cvtk documentation.
d = TiledTemporalFreqs(tiles, freqs=freqs, depths=depths, diploids=1000, gintervals=gi, samples=samples)

In [12]:
# Echo the sample labels stored on the object to check they match what was passed in.
d.samples

[('A', 0), ('A', 7), ('B', 0), ('B', 7), ('C', 0), ('C', 7)]

In [13]:
# Bootstrap the covariance matrix with 5000 resamples, with replicate averaging
# switched off (average_replicates=False).
# NOTE(review): presumably resamples the genomic tiles — confirm in cvtk.
covs_cis = d.bootstrap_cov(B=5000, progress_bar=True, average_replicates=False)

HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [14]:
# Show the bootstrap result: three stacked 3x3 matrices.
# NOTE(review): presumably lower bound, estimate, upper bound — confirm.
covs_cis

array([[[0.01768812, 0.00557436, 0.0056363 ],
        [0.00557436, 0.01767957, 0.00535109],
        [0.0056363 , 0.00535109, 0.0173561 ]],

       [[0.01901716, 0.00679566, 0.00684136],
        [0.00679566, 0.01895105, 0.00656204],
        [0.00684136, 0.00656204, 0.01848042]],

       [[0.02022767, 0.00783593, 0.00786135],
        [0.00783593, 0.02004818, 0.00758436],
        [0.00786135, 0.00758436, 0.01941738]]])

In [15]:
# Persist the bootstrap covariance array in NumPy's .npy format. The path already
# ends in '.npy', so np.save writes to exactly this file name.
np.save('../data/kelly_hughes_2019/kelly_hughes_2019_covs_bootstrap.npy', covs_cis)

### Bootstrap the Convergence Covariance

In [17]:
# Convergence correlation computed directly (no bootstrapping), shown for
# comparison with the bootstrap that follows.
d.convergence_corr()

array([[[0.37149172]]])

In [18]:
# Bootstrap the convergence correlation with 1000 resamples.
convergence_corr = d.bootstrap_convergence_corr(B=1000, progress_bar=True)

HBox(children=(IntProgress(value=0, description='bootstraps', max=1000, style=ProgressStyle(description_width=…




In [19]:
# Show the bootstrap result: three stacked 1x1 arrays.
# NOTE(review): presumably lower bound, estimate, upper bound — confirm.
convergence_corr

array([[[0.31301616]],

       [[0.3591158 ]],

       [[0.39785909]]])

In [20]:
# Covariance matrix from calc_cov() with its default arguments.
# NOTE(review): `gw` presumably stands for genome-wide — confirm.
gw_covs = d.calc_cov()

In [21]:
# Show the resulting 3x3 covariance matrix.
gw_covs


array([[0.01929838, 0.00717876, 0.00720047],
       [0.00717876, 0.01931031, 0.00693246],
       [0.00720047, 0.00693246, 0.0187621 ]])