# Kelly and Hughes (2019) Data Processing and Analysis

## Setup

In [1]:
import os
import sys
import glob
# Put the parent of the current working directory on the import path
# (presumably so the in-repo `cvtk` package imported below resolves — confirm
# against the repository layout). The membership check avoids adding a
# duplicate entry when this cell is re-run.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed, so edits to library code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks
# redundant — confirm it is not needed before removing.
%reload_ext autoreload

In [2]:
import re
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

## Variant Data Loading

### Load in TSV data and reshape

In [5]:
# Sequence IDs retained for analysis: `keep_autos` lists four chromosome arms,
# and `keep_seqids` is the same set plus 'X'. Concatenation builds a new list,
# so the two names never alias the same object.
keep_autos = ['2L', '2R', '3L', '3R']
keep_seqids = keep_autos + ['X']

In [18]:
# Sample labels used as column-name prefixes in the TSV.
# NOTE: `samples` is rebound to (replicate, timepoint) tuples in the
# "Study Design" cell; here it only serves to select columns.
samples = ['A0', 'A7', 'B0', 'B7', 'C0', 'C7']
rawdata = pd.read_csv('../data/kelly_hughes_2019/kelly_hughes_2019.tsv', sep='\t')

# The "<sample>_pr" columns are used as frequencies and the "<sample>_reads"
# columns as depths.
freq_cols = [f"{samp}_pr" for samp in samples]
depth_cols = [f"{samp}_reads" for samp in samples]

# Transpose so each row is one sample and each column is one TSV row.
freqs = rawdata[freq_cols].values.T
depths = rawdata[depth_cols].values.T

In [19]:
# Register one entry per TSV row, taking the first column as the sequence ID
# (with the 'Scf_' prefix stripped) and the second column as an integer
# position. Any remaining columns are ignored here.
gi = GenomicIntervals()
for scaffold, position, *_ in rawdata.itertuples(index=False):
    gi.append(scaffold.replace('Scf_', ''), int(position))

In [20]:
# NOTE(review): presumably derives per-sequence lengths from the positions
# appended above and stores them on `gi.seqlens`, which the tiling step reads
# — confirm in cvtk.gintervals.
gi.infer_seqlens()

### Study Design

The design is three replicates (1, 2, 3) and two timepoints (0 and 7).

In [21]:
# One (replicate, timepoint) label per sample, with replicates as the outer
# loop and timepoints as the inner loop: (1, 0), (1, 7), (2, 0), ...
samples = [(replicate, timepoint) for replicate in (1, 2, 3) for timepoint in (0, 7)]

## Replicate Covariance Analysis

In [22]:
# Build tiles over the inferred sequence lengths with width 1e5
# (presumably base pairs, i.e. 100 kb windows — confirm in cvtk.gintervals).
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=1e5)

In [23]:
# Combine the tiling with the frequency and depth matrices, the genomic
# positions, and the (replicate, timepoint) sample labels.
d = TiledTemporalFreqs(
    tiles,
    freqs=freqs,
    depths=depths,
    gintervals=gi,
    samples=samples,
)

In [None]:
# Bootstrap restricted to the sequences in `keep_autos`.
# NOTE(review): `B` is presumably the number of bootstrap resamples, `alpha`
# the interval level, and `min_af` a minimum allele-frequency filter — confirm
# against the cvtk `bootstrap_Gs` signature.
repl_cis = d.bootstrap_Gs(
    alpha=0.05,
    keep_seqids=keep_autos,
    B=1000,
    min_af=0.05,
    binomial_correction=None,
)