# Castro et al. (2019) Data Processing and Analysis

## Setup

In [1]:
import os
import sys
import glob
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import re
import pickle
from functools import reduce

import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl

In [3]:
from cvtk.cvtk import TemporalFreqs, TiledTemporalFreqs
from cvtk.cov import stack_temporal_covariances
import cvtk.variant_files as vf
from cvtk.gintervals import GenomicIntervals
from cvtk.pca import FreqPCA
from cvtk.plots import rep_plot_pca, correction_diagnostic_plot
from cvtk.utils import integerize
from cvtk.utils import extract_empirical_nulls_diagonals, extract_temporal_cov_diagonals
from cvtk.cov import stack_replicate_covariances, stack_temporal_covs_by_group
from cvtk.variant_files import VCFFile

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.figsize'] = (8.0, 4.0)
mpl.rcParams['figure.dpi'] = 200

# Varianta Data Loading

### Load in VCF data

In [31]:
vcf = VCFFile('../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz')

reading file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz'...
file '../data/castro_et_al_2019/beagle_genMap.all.impute.vcf.gz' loaded.
total time to load VCF file: 9.34765721956889 mins.


### Load in variant data

We process the raw `.frq` files, which contain all sites whether segregating in the sample, or not. This makes a row-wise join easy. 

The following list is our design for this experiment:

In [5]:
design = [('Ctrl', 0), ('Ctrl', 17), ('LS1', 0), ('LS1', 17), ('LS2', 0), ('LS2', 17)]

In [6]:
CACHED_FREQS = '../data/castro_et_al_2019/castro_et_al_2019_frequencies.pkl'
CACHED_POS = '../data/castro_et_al_2019/castro_et_al_2019_loci.pkl'
if not os.path.exists(CACHED_FREQS):
    print("processing frequency matrix from raw data...")
    frq_files = glob.glob('../data/castro_et_al_2019/*frq')
    all_data = dict()
    all_metadata = []

    first = True
    for frq_file in frq_files:
        m = re.match(r'beagle_genMap\.all\.impute\.(?P<line>\w+)_F(?P<gen>\w+).frq', os.path.basename(frq_file))
        metadata = m.groupdict()
        all_metadata.append(metadata)
        d = pd.read_csv(frq_file, delimiter='\t', index_col=False)
        d.columns = ['chrom', 'pos', 'nalleles', 'nchr', 'freq']
        # assert the data has the exact same loci in all cases
        if first:
            loci = set([(chrom, pos) for chrom, pos in zip(d['chrom'], d['pos'])])
        else:
            assert(loci == set([(chrom, pos) for chrom, pos in zip(d['chrom'], d['pos'])]))
        all_data[(metadata['line'], int(metadata['gen']))] = d
    
    # combine all frequencies into a matrix
    total_freq_mat = np.array([all_data[key]['freq'].values for key in design])
    
    # filter out non-segregating sites
    fixed_sites = np.all((total_freq_mat == 0) | (total_freq_mat == 1), axis=0)
    freq_mat = total_freq_mat[:, ~fixed_sites]
    seg_indices = set(list(np.argwhere(~fixed_sites).ravel()))
    
    # cache the frequency matrix
    print("saving frequency matrix cached pickle file...")
    with open(CACHED_FREQS, 'wb') as f:
        pickle.dump(freq_mat, f)
        
    print("rebuilding loci GenomicIntervals...")
    gi = GenomicIntervals()
    for i, row in enumerate(all_data[('Ctrl', 0)].itertuples(index=False)):
        if i not in seg_indices:
            # We only keep segregating sites, which are filtered out once the frequency
            # matrix is parsed. We don't include loci that aren't in the freq matrix.
            continue
        chrom, pos = row[0:2]
        gi.append(chrom, int(pos))
    print("caching loci GenomicIntervals...")
    gi.dump(CACHED_POS)
else:
    # load the existing cached matrix
    print("loading frequency matrix from cached pickle file...")
    with open(CACHED_FREQS, 'rb') as f:
        freq_mat = pickle.load(f)

loading frequency matrix from cached pickle file...


In [7]:
# try to load loci file
if not os.path.exists(CACHED_POS):
    raise ValueError("cached loci file not found — regenerate frequency matrix and loci GenomicIntervals object.")
else:
    gi = GenomicIntervals.load(CACHED_POS)

### Alternate Data Source

The directory `http://ftp.tuebingen.mpg.de/fml/ag-chan/Longshanks/` also has file http://ftp.tuebingen.mpg.de/fml/ag-chan/Longshanks/CtrlLS1LS2_F0F17.RefAlt_ReadCounts.by_chr.tar.gz which we process here.

These have been pre-processed (separate chromosome files combined), and we load them in here.

In [8]:
def process_file(file):
    name = os.path.basename(file)[:-4]
    table = pd.read_csv(file, delimiter='\t', header=None)
    chrom = table.loc[:, 0].values
    pos = table.loc[:, 1].values
    
    m = table.loc[:, 4:].values
    #geno_mat = np.stack((m[:, np.arange(0, m.shape[1]) % 2 == 0], m[:, np.arange(0, m.shape[1]) % 2 == 1]), axis=2)
    m = m.astype('uint8')
    m = np.stack((m[:, np.arange(0, m.shape[1]) % 2 == 0], m[:, np.arange(0, m.shape[1]) % 2 == 1]), axis=2)
    
    del table
    
    counts = np.nansum(m[:, :, 0], axis=1)
    depths = np.nansum(m[:, :, 0] + m[:, :, 1], axis=1)
    
    del m
    out = pd.DataFrame(dict(chrom=chrom,
                            pos=pos,
                            #ref=table.loc[:, 2],
                            #alt=table.loc[:, 3],
                            counts=counts, 
                            depths=depths))
    
    out.rename(columns={'counts': 'counts_' + name}, inplace=True)
    out.rename(columns={'depths': 'depths_' + name}, inplace=True)
    return out

Now we combine load all the files (or use the pre-combined one).

In [9]:
#COMBINED_CSV = '../data/castro_et_al_2019/combined_counts.csv'
COMBINED_FEATHER = '../data/castro_et_al_2019/combined_counts.feather'
samples = ['F17_Ctrl', 'F17_LS1', 'F17_LS2', 'P0_Ctrl', 'P0_LS1', 'P0_LS2']
FORCE = False

if FORCE or not os.path.exists(COMBINED_FEATHER):
    print("no combined feather file found, generating one...")
    dfs = dict()
    for sample in samples:
        dfs[sample] = process_file(os.path.join('../data/castro_et_al_2019/chrom_depth_counts/', sample + '.tsv'))

    df = reduce(lambda left, right: pd.merge(left, right, on=('chrom', 'pos'), how='outer'), dfs.values())
    df = df.drop_duplicates().reset_index()
    df.to_feather(COMBINED_FEATHER)
    #df.to_csv(COMBINED_CSV, index=False)
    
    #del dfs, d_P0_Ctrl, d_P0_LS1, d_P0_LS2, d_F17_Ctrl, d_F17_LS1, d_F17_LS2
else:
    print("combined feather file found, loading...")
    df = pd.read_feather(COMBINED_FEATHER)

combined feather file found, loading...


In [5]:
DATA_DIR = '../data/castro_et_al_2019/chrom_depth_counts'
files = os.listdir(DATA_DIR)

FILE_REGEX = r"(?P<gen>[^_]+)_(?P<line>[^_]+)\.bcf_.*"

tables = defaultdict(dict)
for file in files:
    if file == 'i_j.bcf_mpileup.RefAlt_AD.chr1.summary':
        # odd file that's not part of sample
        continue
    metadata = re.match(FILE_REGEX, file).groupdict()
    table = pd.read_csv(os.path.join(DATA_DIR, file), delimiter='\t', header=None)
    counts = table.loc[:, 4:].values
    
    m = table.loc[:, 4:].values
    #geno_mat = np.stack((m[:, np.arange(0, m.shape[1]) % 2 == 0], m[:, np.arange(0, m.shape[1]) % 2 == 1]), axis=2)
    m = m.astype('uint8')
    m = np.stack((m[:, np.arange(0, m.shape[1]) % 2 == 0], m[:, np.arange(0, m.shape[1]) % 2 == 1]), axis=2)
    
    chrom = table.loc[:, 0].values
    chrom_name = table.loc[:, 0].unique()[0]
    pos = table.loc[:, 1].values
    ref =  table.loc[:, 2].values
    alt =  table.loc[:, 3].values
    sample_key = (metadata['line'], int(metadata['gen'][1:]))
    tables[chrom_name][sample_key] = (pos, ref, alt, geno_mat)    

In [50]:
def sort_chrs(x):
    chr = x.replace('chr', '')
    if chr == 'X':
        return 1e6
    return int(chr)

In [53]:
pos = [tables[chr][0] for chr in sorted(tables.keys(), key=sort_chrs)]
chrom = [chr for _ in tables[chr][0].values for chr in sorted(tables.keys(), key=sort_chrs)]
ref = [tables[chr][1] for chr in sorted(tables.keys(), key=sort_chrs)]
alt = [tables[chr][2] for chr in sorted(tables.keys(), key=sort_chrs)]
geno_mat = [tables[chr][3] for chr in sorted(tables.keys(), key=sort_chrs)]

KeyError: 0

In [14]:
autosomes = list(set(gi.intervals.keys()) - set('chrX'))

In [15]:
tile_width = 1e5
gi.infer_seqlens()
tiles = GenomicIntervals.from_tiles(gi.seqlens, width=tile_width)

In [16]:
d = TiledTemporalFreqs(tiles, freqs=freq_mat, samples=design, gintervals=gi)

AttributeError: 'NoneType' object has no attribute 'shape'