# Metrics: variation

In [1]:
%run ../../shared_setup.ipynb

The Cython magic has been move to the Cython package, hence 
`%load_ext cythonmagic` is deprecated; Please use `%load_ext Cython` instead.

Though, because I am nice, I'll still try to load it for you this time.


In [2]:
# Load the combined callset for every cross, keeping only the variants
# selected by the 'FILTER_PASS' variant filter.
callsets = load_callsets(
    COMBINED_CALLSET_FN_TEMPLATE,
    variant_filter='FILTER_PASS',
)

2015-04-02 21:39:10.175159 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2015-04-02 21:39:10.442290 :: filter variants: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2015-04-02 21:39:10.469674 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2015-04-02 21:39:10.816443 :: filter variants: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2015-04-02 21:39:10.839421 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2015-04-02 21:39:11.182208 :: filter variants: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants


## No. of independent progeny clones

In [3]:
for cross in CROSSES:
    # sample identifiers are the field names of the structured calldata array
    samples = callsets[cross]['calldata'].dtype.names
    # skip the first two samples (presumably the two parents - confirm against
    # the callset layout); everything after them is treated as progeny
    progeny = samples[2:]
    # the text before the first '/' is used as the clone identifier, so
    # several samples sharing that prefix count as a single clone
    progeny_clones = {sample_id.partition('/')[0] for sample_id in progeny}
    print(cross, len(progeny_clones))

3d7_hb3 15
hb3_dd2 35
7g8_gb4 28


## Coverage

In [4]:
# Read the sample manifest. The 'coverage' column arrives as text, so drop its
# final character (presumably a unit suffix such as 'X' - confirm against
# samples.txt) and parse what remains as an integer.
tbl_samples = (
    etl
    .fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
    .convert('coverage', lambda text: int(text[:-1]))
)
tbl_samples

0|cross,1|clone,2|sample,3|run,4|instrument,5|coverage
3d7_hb3,3D7,PG0051-C,ERR019061,Illumina Genome Analyzer II,122
3d7_hb3,C01,PG0065-C,ERR019064,Illumina Genome Analyzer II,163
3d7_hb3,C01,PG0062-C,ERR019070,Illumina Genome Analyzer II,108
3d7_hb3,C02,PG0055-C,ERR019066,Illumina Genome Analyzer II,102
3d7_hb3,C02,PG0053-C,ERR019067,Illumina Genome Analyzer II,73


In [5]:
# tally the rows of the sample table by their 'cross' value
tbl_samples.valuecounts('cross')

0|cross,1|count,2|frequency
7g8_gb4,40,0.4081632653061224
hb3_dd2,37,0.3775510204081632
3d7_hb3,21,0.2142857142857142


In [6]:
# Switch to a DataFrame so per-cross summaries can use groupby.
df_samples = tbl_samples.todataframe()
# median coverage within each cross
df_samples.groupby('cross')['coverage'].median()

cross
3d7_hb3    102.0
7g8_gb4    106.5
hb3_dd2    110.0
Name: coverage, dtype: float64

In [7]:
# lowest coverage value within each cross
df_samples.groupby('cross')['coverage'].min()

cross
3d7_hb3    41
7g8_gb4    55
hb3_dd2    22
Name: coverage, dtype: int64

In [8]:
# highest coverage value within each cross
df_samples.groupby('cross')['coverage'].max()

cross
3d7_hb3    173
7g8_gb4    250
hb3_dd2    637
Name: coverage, dtype: int64

In [9]:
# Same per-cross coverage summary, computed directly on the table.
# Each entry is (output field, source field, aggregation function).
tbl_samples.aggregate(
    'cross',
    [
        # the grouped values are materialised into a list before np.median
        ('median', 'coverage', lambda values: np.median(list(values))),
        ('min', 'coverage', min),
        ('max', 'coverage', max),
    ],
)

0|cross,1|median,2|min,3|max
3d7_hb3,102.0,41,173
7g8_gb4,106.5,55,250
hb3_dd2,110.0,22,637


## Count SNPs and INDELs

In [10]:
def count_variants(query):
    """Build a row function that counts the variants matching `query`.

    Parameters
    ----------
    query : str
        Filter expression handed unchanged to `filter_variants`.

    Returns
    -------
    callable
        Function taking a row with a `cross` attribute. It looks up that
        cross in the module-level `callsets`, applies `query` through
        `filter_variants`, and returns the `size` of the resulting
        'variants' entry.
    """
    def count_for_row(row):
        # filter the callset belonging to this row's cross
        matching = filter_variants(callsets[row.cross], query=query)
        return matching['variants'].size
    return count_for_row
        

# Start from a one-column table with one row per cross, attach a count for
# each variant class, derive the SNP:indel ratios, then reshape so that each
# cross becomes a column and each metric a row.
tbl_variation = (
    etl
    .wrap([['cross']] + [[cross] for cross in CROSSES])
    # overall counts
    .addfield('n_snps', count_variants('is_snp'))
    .addfield('n_indels', count_variants('~is_snp'))
    # split by whether CDSAnnotationID is set (b"." marks it as absent)
    .addfield('n_snps_coding',
              count_variants('is_snp & (CDSAnnotationID != b".")'))
    .addfield('n_snps_noncoding',
              count_variants('is_snp & (CDSAnnotationID == b".")'))
    .addfield('n_indels_coding',
              count_variants('~is_snp & (CDSAnnotationID != b".")'))
    .addfield('n_indels_noncoding',
              count_variants('~is_snp & (CDSAnnotationID == b".")'))
    # ratios are computed from the count fields added above
    .addfield('ratio_snp_indel_coding',
              lambda rec: rec.n_snps_coding / rec.n_indels_coding)
    .addfield('ratio_snp_indel_noncoding',
              lambda rec: rec.n_snps_noncoding / rec.n_indels_noncoding)
    # long format keyed by cross, then pivot the crosses into columns
    .melt(key='cross')
    .recast(variablefield='cross', valuefield='value')
)
tbl_variation.displayall()

2015-04-02 21:39:11.324651 :: filter variants: excluding 26699 (63.4%) retaining 15388 (36.6%) of 42087 variants
2015-04-02 21:39:11.331757 :: filter variants: excluding 15388 (36.6%) retaining 26699 (63.4%) of 42087 variants
2015-04-02 21:39:11.346932 :: filter variants: excluding 33219 (78.9%) retaining 8868 (21.1%) of 42087 variants
2015-04-02 21:39:11.352374 :: filter variants: excluding 35567 (84.5%) retaining 6520 (15.5%) of 42087 variants
2015-04-02 21:39:11.357002 :: filter variants: excluding 37981 (90.2%) retaining 4106 (9.8%) of 42087 variants
2015-04-02 21:39:11.362243 :: filter variants: excluding 19494 (46.3%) retaining 22593 (53.7%) of 42087 variants
2015-04-02 21:39:11.371268 :: filter variants: excluding 21576 (59.2%) retaining 14885 (40.8%) of 36461 variants
2015-04-02 21:39:11.380708 :: filter variants: excluding 14885 (40.8%) retaining 21576 (59.2%) of 36461 variants
2015-04-02 21:39:11.396338 :: filter variants: excluding 27853 (76.4%) retaining 8608 (23.6%) of 364

0|variable,1|3d7_hb3,2|7g8_gb4,3|hb3_dd2
n_indels,26699.0,20079.0,21576.0
n_indels_coding,4106.0,3731.0,3679.0
n_indels_noncoding,22593.0,16348.0,17897.0
n_snps,15388.0,14392.0,14885.0
n_snps_coding,8868.0,8205.0,8608.0
n_snps_noncoding,6520.0,6187.0,6277.0
ratio_snp_indel_coding,2.159766195811008,2.199142321093541,2.339766240826312
ratio_snp_indel_noncoding,0.2885849599433452,0.3784560802544653,0.350729172487009


## Nucleotide and indel diversity