# Purpose:

2015-03-13 (Friday)

Explore and characterize the results of the learned Beta filter method.

# Implementation:

## Imports:

In [29]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import ggplot as gp


import numpy as np
import pandas as pd
# import tables as h5

import itertools as it
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy
from scikits import bootstrap as bs
import statsmodels.api as sm
import statsmodels.stats.multitest as smm

import munch

import pymc as mc

from spartan.utils.genome_specific.GfusI1 import GfusI1_0
from spartan.utils.fastas import ParseFastA

In [3]:
# set figure characteristics

# size
sns.set_context("talk")

# style
sns.set_style("whitegrid")

## File paths:

In [34]:
# define paths to files

fasta_path = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"
contig_name_length_path = "/home/gus/Dropbox/uganda_data/data_repos/genome_info/assembly_info/contig_name_length.csv"

ld_results_pickle="/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/ld_thresholds/post_MAP_calc.plk"
tajimas_csv = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/data_from_andrea/Tajima50.csv"

# Record some info about the contigs once and for all

In [30]:
# contig_name_map = GfusI1_0.get_name_map_from_fasta_headers(fasta_path)

In [33]:
# contig_lengths = {}
# contigs = ParseFastA(fasta_path).to_dict()
# for name,seq in contigs.iteritems():
#     contig_lengths[name] = len(seq)

In [35]:
# # record both names and the length

# with open(contig_name_length_path,'w') as out:
#     template = "{kk_name},{scaf_name},{length}\n"
#     out.write(template.format(kk_name="kk_name",scaf_name="scaf_name",length="length"))
    
#     for kk_name, length in contig_lengths.iteritems():
#         out.write(template.format(kk_name=kk_name,
#                                   scaf_name=contig_name_map[kk_name],
#                                   length=length))
    

# Helper functions

In [41]:
def plot_bin_dists(df, bin_def="distance_bin <= 500"):
    plt.rcParams['figure.figsize'] = np.array([16, 12]) * 0.65

    p = gp.ggplot(gp.aes(x='R2'), data=df.query(bin_def))
    p = p + gp.geom_histogram(fill='coral') + \
        gp.facet_wrap("distance_bin") + \
        gp.theme_seaborn(context='talk') + gp.ggtitle(bin_def)
    
    return p

###########################

def recode_taj_chrom(df):
    recode_func = lambda x: x.split(':')[-1]

    CHROM = df.CHROM.apply(recode_func)
    df.CHROM = CHROM

# Data exploration

In [37]:
# load our results tables
ld = pd.read_pickle(ld_results_pickle)
ld.head()

Unnamed: 0,CHR_A,BP_A,SNP_A,MAF_A,CHR_B,BP_B,SNP_B,MAF_B,R,DP,...,distance_bin,distance_bin_mean_R2,distance_bin_median_R2,R2_scaled_for_B,alpha_param,beta_param,cdf,one_minus_cdf,one_minus_cdf_BH,MAP_succeeded
0,Scaffold0,13388,.,0.278846,Scaffold0,23408,.,0.298077,1.0,1,...,10000,0.11435,0.035591,0.9995,,,1.0,0.0,1.7e-05,False
1,Scaffold0,13388,.,0.278846,Scaffold0,23418,.,0.298077,1.0,1,...,10000,0.11435,0.035591,0.9995,,,1.0,0.0,1.7e-05,False
2,Scaffold0,13388,.,0.278846,Scaffold0,23421,.,0.298077,1.0,1,...,10000,0.11435,0.035591,0.9995,,,1.0,0.0,1.7e-05,False
3,Scaffold0,13388,.,0.278846,Scaffold0,23431,.,0.288462,0.975946,1,...,10000,0.11435,0.035591,0.952018,,,0.999063,0.000937,0.073542,False
4,Scaffold0,13388,.,0.278846,Scaffold0,30544,.,0.09434,0.464238,1,...,17150,0.101013,0.041075,0.215801,,,0.745361,0.254639,0.912914,False


In [36]:
contig_info = pd.read_csv(contig_name_length_path)
contig_info.head()

Unnamed: 0,kk_name,scaf_name,length
0,KK352346.1,Scaffold566,193315
1,KK352610.1,Scaffold839,83110
2,KK352241.1,Scaffold458,243873
3,JFJR01012964.1,JFJR01012964.1,3083
4,KK352052.1,Scaffold268,427914


In [38]:
taj50 = pd.read_csv(tajimas_csv, sep='\t')
taj50.head()

Unnamed: 0,CHROM,BIN_start,N_SNPs,TajimaD
202,GFvariants_VB2014a_tvcf:KK351785.1,23400,4,2.35852
8242,GFvariants_VB2014a_tvcf:KK351785.1,425400,3,2.31422
66484,GFvariants_VB2014a_tvcf:KK351787.1,68900,2,2.20911
67161,GFvariants_VB2014a_tvcf:KK351787.1,102750,4,2.90971
67278,GFvariants_VB2014a_tvcf:KK351787.1,108600,2,2.08884


In [42]:
recode_taj_chrom(taj50)
taj50.head()

Unnamed: 0,CHROM,BIN_start,N_SNPs,TajimaD
202,KK351785.1,23400,4,2.35852
8242,KK351785.1,425400,3,2.31422
66484,KK351787.1,68900,2,2.20911
67161,KK351787.1,102750,4,2.90971
67278,KK351787.1,108600,2,2.08884


## SNP-pairs in all bins at BH corrected $p \le 0.01$

In [7]:
sum(ld.one_minus_cdf_BH <= 0.01)

5284

## SNP-pairs in all bins at BH corrected $p \le 0.05$

In [8]:
sum(ld.one_minus_cdf_BH <= 0.05)

6735

## Lowest $r^2$ retained at  $p \le 0.05$ or $0.01$

In [43]:
q_05 = ld.query("one_minus_cdf_BH <= 0.05")
q_01.R2.min()

0.82251114177599993

In [11]:
q_01 = ld.query("one_minus_cdf_BH <= 0.01")
q_01.R2.min()

0.82251114177599993

## How many SNP-pairs have  $r^2 \ge 0.82$?

In [12]:
sum(ld.R2 >= 0.82)

26495

In [13]:
1-(5284.0/26495)

0.8005661445555765

## Characterization of contigs with/without regard to selected SNP-pairs

In [45]:
# join contig length and kk_name contig info to the LD table
ld_contig = pd.merge(left=ld, right=contig_info, how='inner', left_on="CHR_A", right_on="scaf_name")
ld_contig.head()

Unnamed: 0,CHR_A,BP_A,SNP_A,MAF_A,CHR_B,BP_B,SNP_B,MAF_B,R,DP,...,R2_scaled_for_B,alpha_param,beta_param,cdf,one_minus_cdf,one_minus_cdf_BH,MAP_succeeded,kk_name,scaf_name,length
0,Scaffold0,13388,.,0.278846,Scaffold0,23408,.,0.298077,1.0,1,...,0.9995,,,1.0,0.0,1.7e-05,False,KK351785.1,Scaffold0,3329503
1,Scaffold0,13388,.,0.278846,Scaffold0,23418,.,0.298077,1.0,1,...,0.9995,,,1.0,0.0,1.7e-05,False,KK351785.1,Scaffold0,3329503
2,Scaffold0,13388,.,0.278846,Scaffold0,23421,.,0.298077,1.0,1,...,0.9995,,,1.0,0.0,1.7e-05,False,KK351785.1,Scaffold0,3329503
3,Scaffold0,13388,.,0.278846,Scaffold0,23431,.,0.288462,0.975946,1,...,0.952018,,,0.999063,0.000937,0.073542,False,KK351785.1,Scaffold0,3329503
4,Scaffold0,13388,.,0.278846,Scaffold0,30544,.,0.09434,0.464238,1,...,0.215801,,,0.745361,0.254639,0.912914,False,KK351785.1,Scaffold0,3329503


### All contigs

#### Without regard to selected SNP-pairs

In [49]:
len(ld_contig.scaf_name.unique())

1276

In [None]:
number_wo = ld_contig.scaf_name.unique()