# Kelly and Hughes (2019) EDA and Covariance Analysis

## Setup

In [5]:
import os
import sys
import glob
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [6]:
import re
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

## Variant Data Loading

### Load in TSV data and reshape

In [5]:
# Sequence-ID lists: `keep_autos` holds the four autosome arms, and
# `keep_seqids` holds those same arms followed by X.
keep_autos = ['2L', '2R', '3L', '3R']
keep_seqids = keep_autos + ['X']

In [6]:
# Read the tab-separated site table. For every sample prefix the table is
# expected to carry a `<prefix>_pr` column (taken as the frequency) and a
# `<prefix>_reads` column (taken as the depth).
rawdata = pd.read_csv('../data/kelly_hughes_2019/kelly_hughes_2019.tsv', sep='\t')
# Column prefixes only. They get their own name so that re-running this cell
# cannot overwrite the (replicate, timepoint) `samples` list that is later
# passed to TiledTemporalFreqs.
sample_prefixes = ['A0', 'A7', 'B0', 'B7', 'C0', 'C7']
# Select the per-sample columns and transpose, giving one row per sample
# and one column per table row.
freqs = rawdata[[f"{prefix}_pr" for prefix in sample_prefixes]].values.T
depths = rawdata[[f"{prefix}_reads" for prefix in sample_prefixes]].values.T

In [7]:
# Register one entry per table row, using the first column as the sequence
# name (with any 'Scf_' text removed) and the second column as an integer
# position.
gi = GenomicIntervals()
for scaffold, position, *_ in rawdata.itertuples(index=False, name=None):
    gi.append(scaffold.replace('Scf_', ''), int(position))

In [8]:
# Presumably derives `gi.seqlens` (read below when building the tiles) from the
# positions appended so far — confirm against cvtk.gintervals.
gi.infer_seqlens()

### Study Design

The design is three replicates (A, B, C) and two timepoints (0 and 7). There are 14 generations; the "7" in the sample labels comes from the seven months over which the evolution experiment was run (see p. 945).

In [12]:
# One (replicate, timepoint) pair per sample, replicate-major, in the same
# order as the A0, A7, B0, B7, C0, C7 columns loaded above.
samples = [(replicate, timepoint) for replicate in 'ABC' for timepoint in (0, 7)]

## Replicate Covariance Analysis

In [14]:
# Tile every sequence using the inferred sequence lengths.
# NOTE(review): the width is presumably in base pairs (1 Mb) — confirm against cvtk.
tile_width = 1e6
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=tile_width)

In [15]:
# Combine the tiles, frequency and depth matrices, site coordinates and
# sample labels into a single tiled temporal-frequency object.
d = TiledTemporalFreqs(
    tiles,
    freqs=freqs,
    depths=depths,
    diploids=1000,
    gintervals=gi,
    samples=samples,
)

In [21]:
# Display the sample labels held by the TiledTemporalFreqs object.
d.samples

[('A', 0), ('A', 7), ('B', 0), ('B', 7), ('C', 0), ('C', 7)]

In [18]:
# Seed NumPy's global RNG before resampling so the bootstrap output can be
# regenerated on a fresh kernel. Values will differ from earlier unseeded runs.
# NOTE(review): assumes cvtk's bootstrap draws from np.random's global state — confirm.
bootstrap_seed = 2019
n_bootstraps = 5000
np.random.seed(bootstrap_seed)
covs_cis = d.bootstrap_covs(B=n_bootstraps, progress_bar=True)

HBox(children=(IntProgress(value=0, description='bootstraps', max=5000, style=ProgressStyle(description_width=…




In [19]:
# Display the array returned by d.bootstrap_covs(). The saved output is 3 x 3 x 3;
# the first axis presumably holds (lower bound, estimate, upper bound) — confirm.
covs_cis

array([[[0.01916367, 0.00568883, 0.00572219],
        [0.00568883, 0.01910373, 0.00543218],
        [0.00572219, 0.00543218, 0.01878798]],

       [[0.02063743, 0.00715719, 0.00718959],
        [0.00715719, 0.02056062, 0.00692392],
        [0.00718959, 0.00692392, 0.0200592 ]],

       [[0.0219689 , 0.00844033, 0.00845704],
        [0.00844033, 0.02187047, 0.00821325],
        [0.00845704, 0.00821325, 0.02116931]]])

In [20]:
# Write the bootstrap array to disk in NumPy's .npy format.
covs_path = '../data/kelly_hughes_2019/kelly_hughes_2019_covs_bootstrap.npy'
with open(covs_path, mode='wb') as out_handle:
    np.save(out_handle, covs_cis)

## Variance in Frequencies Explained by Replicates