# Castro et al. (2019) Data Processing and Analysis

## Setup

In [7]:
import os
import sys
import glob
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import re
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [17]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [10]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

# Variant Data Loading

### Load in VCF data

In [18]:
# Load the imputed VCF. The recorded run of this cell took about nine minutes.
vcf_path = '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz'
vcf = VCFFile(vcf_path)

reading file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz'...
file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz' loaded.
total time to load VCF file: 9.025190444787343 mins.


### Load in variant data

We process the raw `.frq` files, which contain every site regardless of whether it is segregating in that sample. Because every file therefore lists the same sites, a row-wise join across samples is easy.

The following list is our design for this experiment, given as (line, generation) pairs:

In [11]:
# Experimental design as (line, generation) pairs: each line sampled at
# generations 0 and 17, listed line by line.
design = [
    (line, gen)
    for line in ('Ctrl', 'LS1', 'LS2')
    for gen in (0, 17)
]

In [12]:
# Build (or load from cache) the sample-by-site allele frequency matrix `freq_mat`.
#
# When regenerating, this cell reads every `*frq` file in the data directory, checks
# that all files list the same loci in the same order, stacks the per-sample frequency
# columns in `design` order, drops sites that are fixed (0 or 1) in every sample, and
# writes two caches: the frequency matrix (pickle) and the matching loci
# (GenomicIntervals dump).
CACHED_FREQS = '../data/castro_et_al_2019/castro_et_al_2019_frequencies.pkl'
CACHED_POS = '../data/castro_et_al_2019/castro_et_al_2019_loci.pkl'

# The two caches describe the same filtered set of sites, so they are rebuilt together
# whenever either one is missing; a frequency cache without its loci is unusable.
if not (os.path.exists(CACHED_FREQS) and os.path.exists(CACHED_POS)):
    print("processing frequency matrix from raw data...")
    # sorted() makes the processing order (and thus the reference file) deterministic
    frq_files = sorted(glob.glob('../data/castro_et_al_2019/*frq'))
    # Literal dots are escaped and the pattern is anchored at the end; the generation
    # must be digits because it is converted with int() below.
    frq_name_re = re.compile(r'beagle_genMap\.all\.impute\.(?P<line>\w+)_F(?P<gen>\d+)\.frq$')
    all_data = dict()
    all_metadata = []

    # chrom/pos columns of the first file; every later file must match them exactly
    ref_chrom = ref_pos = None
    for frq_file in frq_files:
        m = frq_name_re.match(os.path.basename(frq_file))
        if m is None:
            raise ValueError("unexpected .frq file name, cannot parse line/generation: "
                             "{}".format(frq_file))
        metadata = m.groupdict()
        all_metadata.append(metadata)
        # index_col=False: do not let pandas use the first column as the index
        d = pd.read_csv(frq_file, delimiter='\t', index_col=False)
        d.columns = ['chrom', 'pos', 'nalleles', 'nchr', 'freq']
        # The frequency matrix is assembled by row position, so the loci must agree in
        # content AND order across files (a set comparison would miss reordering).
        chrom_col, pos_col = d['chrom'].values, d['pos'].values
        if ref_chrom is None:
            ref_chrom, ref_pos = chrom_col, pos_col
        elif not (np.array_equal(ref_chrom, chrom_col) and np.array_equal(ref_pos, pos_col)):
            raise ValueError("loci in {} differ from those in {}".format(frq_file, frq_files[0]))
        all_data[(metadata['line'], int(metadata['gen']))] = d

    # fail with a clear message if a sample in the design has no data file
    missing = [key for key in design if key not in all_data]
    if missing:
        raise ValueError("no .frq file found for design sample(s): {}".format(missing))

    # combine all frequencies into a matrix, one row per sample in `design` order
    total_freq_mat = np.array([all_data[key]['freq'].values for key in design])

    # filter out non-segregating sites (frequency 0 or 1 in every sample)
    fixed_sites = np.all((total_freq_mat == 0) | (total_freq_mat == 1), axis=0)
    freq_mat = total_freq_mat[:, ~fixed_sites]

    # cache the frequency matrix
    print("saving frequency matrix cached pickle file...")
    with open(CACHED_FREQS, 'wb') as f:
        pickle.dump(freq_mat, f)

    print("rebuilding loci GenomicIntervals...")
    gi = GenomicIntervals()
    # Keep only the loci that survived the segregating-site filter, so the loci line up
    # one-to-one with the columns of freq_mat. All files share the same loci (checked
    # above), so the ('Ctrl', 0) table serves as the reference.
    seg_loci = all_data[('Ctrl', 0)].loc[~fixed_sites, ['chrom', 'pos']]
    for chrom, pos in seg_loci.itertuples(index=False):
        gi.append(chrom, int(pos))
    print("caching loci GenomicIntervals...")
    gi.dump(CACHED_POS)
else:
    # load the existing cached matrix
    # NOTE: pickle.load executes arbitrary code; only load caches produced by this notebook.
    print("loading frequency matrix from cached pickle file...")
    with open(CACHED_FREQS, 'rb') as f:
        freq_mat = pickle.load(f)

processing frequency matrix from raw data...
saving frequency matrix cached pickle file...
rebuilding loci GenomicIntervals...
caching loci GenomicIntervals...


In [13]:
# The loci cache is required from here on: load it when present, otherwise stop
# with a message explaining how to recreate it.
if os.path.exists(CACHED_POS):
    gi = GenomicIntervals.load(CACHED_POS)
else:
    raise ValueError("cached loci file not found — regenerate frequency matrix and loci GenomicIntervals object.")

In [14]:
# All chromosomes present in the loci except the X chromosome.
# The previous `set('chrX')` built the character set {'c', 'h', 'r', 'X'}, so 'chrX'
# was never actually removed. Filtering the keys directly also keeps their original
# order instead of an arbitrary set order.
autosomes = [chrom for chrom in gi.intervals.keys() if chrom != 'chrX']

In [15]:
# Tile the genome into fixed-width windows.
# infer_seqlens() is called first because from_tiles() reads gi.seqlens
# (presumably populated by that call — confirm in cvtk.gintervals).
gi.infer_seqlens()
tile_width = 1e5  # window width passed to from_tiles (presumably base pairs — confirm)
tiles = GenomicIntervals.from_tiles(
    gi.seqlens,
    width=tile_width,
)

In [16]:
# Bundle the tiles, the frequency matrix, the sample design and the loci into one
# tiled temporal-frequency object.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'NoneType' object has no attribute 'shape'"; the cause lies inside
# TiledTemporalFreqs and is not visible from this notebook — check its required arguments.
d = TiledTemporalFreqs(
    tiles,
    freqs=freq_mat,
    samples=design,
    gintervals=gi,
)

AttributeError: 'NoneType' object has no attribute 'shape'