In [1]:
%matplotlib
import re

import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): `tips` is not referenced again in the visible part of this notebook — confirm it is still needed.
tips = sns.load_dataset("tips")

import ggplot as gp
import matplotlib.pyplot as plt

import vcf

Using matplotlib backend: TkAgg


In [2]:
# set figure characteristics (global plotting state shared by every figure below)

# size: use seaborn's "poster" plotting context
sns.set_context("poster")

# Grid style
sns.set_style("whitegrid")

# colors: make a 5-color HLS palette the seaborn default
sns.set_palette(sns.hls_palette(n_colors=5, h=0.59, l=0.4, s=0.75))

# theme object added to each gp.ggplot figure below; built with the same "poster" context
ggplot_theme = gp.theme_seaborn(context='poster')


In [3]:
# Paths
# All inputs live under a single "individuals" directory. Keeping that prefix in
# one place means re-pointing the notebook at another mount is a one-line change.
individuals_dir = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals"

# VCF file; load_scores() reads the sample names from it.
vcf_path = individuals_dir + "/tsetseFINAL_14Oct2014_f2_53.recode.renamed_scaffolds.maf0_05.OT_MS_NB_indv.recode.vcf"

# Directory holding the PCAdapt result files loaded below.
pcadapt_dir = individuals_dir + "/PCAdapt/results"

# Same file name as the VCF with "snp" appended (i.e. "….recode.vcfsnp").
pcadapt_snps = vcf_path + "snp"

In [4]:
def gather(df, key, value, cols):
    """Reshape ``df`` from wide to long format (a thin wrapper around ``pd.melt``).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-format table.
    key : str
        Name of the output column that receives the former column labels.
    value : str
        Name of the output column that receives the cell values.
    cols : list
        Columns to stack; every column of ``df`` not listed here is kept as an
        identifier column.

    Returns
    -------
    pandas.DataFrame
        The melted (long-format) table.
    """
    kept_as_ids = [label for label in df.columns if label not in cols]
    return pd.melt(
        df,
        id_vars=kept_as_ids,
        value_vars=cols,
        var_name=key,
        value_name=value,
    )

def recode_column(column_name):
    """Translate one raw column label into its display name.

    Intended as the mapper passed to ``DataFrame.rename(columns=...)``.

    Parameters
    ----------
    column_name : int or str or other label
        A column label of the transposed scores table.

    Returns
    -------
    label
        * integer ``k`` (Python or NumPy integer) -> ``"Factor {k+1}"``
        * the string ``"index"`` in any letter case -> ``"sample"``
        * anything else -> returned unchanged
    """
    # Accept NumPy integers too: integer column labels coming out of pandas are
    # not guaranteed to be plain Python ints.
    if isinstance(column_name, (int, np.integer)):
        return "Factor {i}".format(i=column_name + 1)

    if isinstance(column_name, str) and column_name.upper() == "INDEX":
        return "sample"

    # Leave every other label untouched. Falling off the end would return None,
    # and rename() would then replace the label with None.
    return column_name
    
def load_scores(pcadapt_scores_path, vcf_path):
    """Load a PCAdapt ``.scores`` file as one row per individual.

    Parameters
    ----------
    pcadapt_scores_path : str
        Path to the whitespace-separated scores file. It is read without a
        header, using the VCF sample names as its column names.
    vcf_path : str
        Path to the VCF file whose sample names label the score columns.

    Returns
    -------
    pandas.DataFrame
        The transposed scores table: a "sample" column holding the VCF sample
        names, followed by "Factor 1", "Factor 2", ... columns (one per row of
        the scores file).
    """
    # Only the sample names are needed from the VCF, so the handle is closed as
    # soon as they have been read instead of being left open.
    with open(vcf_path, 'r') as vcf_file:
        indiv_names = vcf.Reader(vcf_file).samples

    # Raw string: "\s" is a regex token, not a valid Python string escape.
    scores = pd.read_table(filepath_or_buffer=pcadapt_scores_path, sep=r'\s',
                           header=None, names=indiv_names,
                           engine='python')

    # Transpose so individuals become rows; reset_index() moves the sample names
    # into a column that recode_column renames to "sample".
    return scores.T.reset_index().rename(columns=recode_column)

def load_main(pcadapt_path):
    """Read the main tab-separated PCAdapt output table.

    Parameters
    ----------
    pcadapt_path : str or file-like
        Location of the tab-separated file; its first line is used as header.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every column that contains a missing value removed.
    """
    raw_table = pd.read_table(filepath_or_buffer=pcadapt_path, sep='\t', engine='python')
    complete_columns_only = raw_table.dropna(axis=1)
    return complete_columns_only

In [5]:
# load files
# Scores file of the "PCAdapt_s1000_b500_k2" run, one row per individual after loading;
# the VCF is only used to supply the sample names.
PCAdapt_s1000_b500_k2_scores = load_scores(pcadapt_scores_path=pcadapt_dir+"/PCAdapt_s1000_b500_k2.scores",
                                           vcf_path=vcf_path)

In [6]:
# Main output table of the same "PCAdapt_s1000_b500_k2" run (tab-separated, columns with missing values dropped).
PCAdapt_s1000_b500_k2 = load_main(pcadapt_path=pcadapt_dir+"/PCAdapt_s1000_b500_k2")

# Plotting functions

In [7]:
# set labels

def set_site_names(df):
    """Label each individual with its sampling site and a 1-based running number.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "sample" column of string sample names.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by "sample", re-indexed 0..n-1, with two added columns:

        * "site" - the leading run of up to three non-digit characters of the
          sample name (empty string if the name starts with a digit);
        * "individuals" - the 1-based row number after sorting.

        The input frame is left unmodified.
    """
    # Raw string: "\A" and "\D" are regex tokens, not Python string escapes.
    re_site = re.compile(r'\A\D{,3}')

    # Work on a copy so the caller's frame is not half-modified as a side effect.
    labelled = df.copy()

    # The pattern can match the empty string, so match() never returns None.
    labelled["site"] = labelled["sample"].apply(lambda indiv: re_site.match(indiv).group(0))

    # DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
    # drop=True discards the old index instead of inserting and then dropping it.
    labelled = labelled.sort_values(by="sample").reset_index(drop=True)
    labelled["individuals"] = labelled.index.values + 1

    return labelled

# plot factors vs individuals

def plot_factors_v_indivs(df):
    """Draw a seaborn strip plot of "Factor 1" (y) against "Factor 2" (x), colored by "site".

    A new 20x6 matplotlib figure is opened first, so the plot lands on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the "Factor 1", "Factor 2" and "site" columns.

    Returns
    -------
    The object returned by ``sns.stripplot``.
    """
    # NOTE(review): despite the function name, both axes are factors here, not
    # individuals — confirm this is the intended plot.
    plt.figure(figsize=(20, 6))
    strip_options = dict(x="Factor 2", y="Factor 1", hue="site", jitter=False, split=True)
    return sns.stripplot(data=df, **strip_options)
    


In [8]:
# Registry of the ggplot figures built in the cells below, keyed by their variable name.
my_plots = {}

In [9]:
# Add the "site" and "individuals" columns used as color and x-axis in the plots below.
PCAdapt_s1000_b500_k2_scores = set_site_names(PCAdapt_s1000_b500_k2_scores)

In [10]:
# Scatter of "Factor 1" against the individual number, colored by site.
gp_factor1 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('individuals', 'Factor 1', color='site')) + gp.geom_point() + ggplot_theme
my_plots["gp_factor1"] = gp_factor1
# Bare last expression so the notebook renders the figure.
gp_factor1

<ggplot: (8793545112601)>

In [11]:
# Scatter of "Factor 2" against the individual number, colored by site.
gp_factor2 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('individuals', 'Factor 2', color='site')) + gp.geom_point() + ggplot_theme
my_plots["gp_factor2"] = gp_factor2
# Bare last expression so the notebook renders the figure.
gp_factor2

<ggplot: (8793545112597)>

In [12]:
# plt.figure(figsize=(12,12))
# Scatter of "Factor 2" against "Factor 1", colored by site; layers are added one at a time.
gp_factor1x2 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('Factor 1', 'Factor 2', color='site')) 
gp_factor1x2 += gp.geom_point()
gp_factor1x2 += ggplot_theme
my_plots["gp_factor1x2"] = gp_factor1x2
# Bare last expression so the notebook renders the figure.
gp_factor1x2

<ggplot: (8793545103009)>

In [13]:
def get_factors_rename_map(df):
    """Map every column label of ``df`` to "Factor N", N being its 1-based position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be renamed.

    Returns
    -------
    dict
        ``{original_label: "Factor N"}`` in column order.
    """
    return {
        label: "Factor {v}".format(v=position)
        for position, label in enumerate(df.columns, start=1)
    }

def assign_factors_to_snps(df):
    """Tag each row with the factor that has the largest value in that row.

    Every column from the third one onward is treated as a per-factor value.
    Those columns are renamed "Factor 1", "Factor 2", ... and a "Factor" column
    is added holding the name of the row's largest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first two columns are not factor values. It is modified in
        place, so pass a copy if the original must be preserved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with renamed factor columns and the new
        "Factor" column appended.
    """
    factor_name_map = get_factors_rename_map(df.iloc[:, 2:])
    factor_ps = df.iloc[:, 2:].rename(columns=factor_name_map)

    # idxmax returns the column *label* of each row's maximum. Series.argmax
    # returns an integer position in current pandas, which would store 0/1/...
    # here instead of the "Factor N" names.
    df['Factor'] = factor_ps.idxmax(axis=1)
    df.rename(columns=factor_name_map, inplace=True)

    return df

def add_snp_number(df):
    """Add an "SNP" column equal to the row index plus one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to number; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the "SNP" column.
    """
    one_based_numbers = df.index + 1
    df["SNP"] = one_based_numbers
    return df

def plot_factor_SNPs(df):
    """Placeholder: no plotting is implemented yet; ``df`` is ignored and ``None`` is returned."""
    return None
    

In [14]:
PCAdapt_s1000_b500_k2.head()

Unnamed: 0,logBF,logPO,P(Z = 1|O),P(Z = 2|O)
0,-0.879953,-3.835711,0.809018,0.190982
1,-0.951201,-3.90696,0.786001,0.213999
2,-0.913359,-3.869118,0.800658,0.199342
3,-0.913434,-3.869192,0.800617,0.199383
4,-0.92117,-3.876929,0.798722,0.201278


In [15]:
# Work on a copy: assign_factors_to_snps() modifies the frame it is given in place.
PCAdapt_s1000_b500_k2_ = PCAdapt_s1000_b500_k2.copy()
PCAdapt_s1000_b500_k2_.head()

Unnamed: 0,logBF,logPO,P(Z = 1|O),P(Z = 2|O)
0,-0.879953,-3.835711,0.809018,0.190982
1,-0.951201,-3.90696,0.786001,0.213999
2,-0.913359,-3.869118,0.800658,0.199342
3,-0.913434,-3.869192,0.800617,0.199383
4,-0.92117,-3.876929,0.798722,0.201278


In [16]:
# pdb

In [17]:
# Rename the factor columns and add the per-SNP "Factor" label.
# `f` is the same object as PCAdapt_s1000_b500_k2_ (the function returns its argument).
f = assign_factors_to_snps(PCAdapt_s1000_b500_k2_)

In [18]:
f.head(18)

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor
0,-0.879953,-3.835711,0.809018,0.190982,Factor 1
1,-0.951201,-3.90696,0.786001,0.213999,Factor 1
2,-0.913359,-3.869118,0.800658,0.199342,Factor 1
3,-0.913434,-3.869192,0.800617,0.199383,Factor 1
4,-0.92117,-3.876929,0.798722,0.201278,Factor 1
5,-1.322631,-4.27839,0.510217,0.489783,Factor 1
6,-1.322629,-4.278387,0.510217,0.489783,Factor 1
7,-1.228764,-4.184522,0.604798,0.395202,Factor 1
8,-1.048532,-4.00429,0.734994,0.265006,Factor 1
9,-1.322631,-4.27839,0.510219,0.489781,Factor 1


In [19]:
# Add the 1-based "SNP" number used as the x-axis of the logBF plots below.
f = add_snp_number(f)

In [20]:
f.head()

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor,SNP
0,-0.879953,-3.835711,0.809018,0.190982,Factor 1,1
1,-0.951201,-3.90696,0.786001,0.213999,Factor 1,2
2,-0.913359,-3.869118,0.800658,0.199342,Factor 1,3
3,-0.913434,-3.869192,0.800617,0.199383,Factor 1,4
4,-0.92117,-3.876929,0.798722,0.201278,Factor 1,5


In [21]:
# Reference logBF levels, one row per labelled strength of evidence.
# NOTE(review): both rows share the single x position SNP=75000 — confirm that a
# line layer can draw anything from one point per evidence level.
zones = pd.DataFrame(
    data={
        'Strength of evidence': ["Worth a mention", "Substantial"],
        'logBF': [0, 0.5],
        'SNP': [75000, 75000],
    }
)

In [22]:
zones.head()

Unnamed: 0,SNP,Strength of evidence,logBF
0,75000,Worth a mention,0.0
1,75000,Substantial,0.5


In [23]:
# Aesthetic mapping for the evidence-level layer: SNP on x, logBF on y,
# colored by the "Strength of evidence" label.
aes_lines = gp.aes(#linetype=None,
                   color="Strength of evidence",
                   x='SNP',
                   y='logBF', 
                   ) 

# Aesthetic mapping for the per-SNP points: same axes, colored by the assigned "Factor".
aes_dots = gp.aes(x='SNP', 
                  y='logBF', 
                  color='Factor')

In [24]:
# Line layer fed from the `zones` table (evidence levels).
lines = gp.geom_line(aes_lines, 
                     size=3, 
                     data=zones)

# Point layer fed from `f`: one semi-transparent dot per SNP row.
dots = gp.geom_point(aes_dots, 
                     alpha=0.6, 
                     data=f)

In [25]:
# First version of the logBF-per-SNP figure.
# NOTE(review): the object stored in my_plots is captured before `lines`, `dots`
# and `ggplot_theme` are added on the last line, and a later cell assigns the same
# "gp_local_adapt" key again — confirm which figure is meant to be kept.
gp_local_adapt = gp.ggplot(aes_dots,f)
gp_local_adapt += gp.theme_seaborn()
my_plots["gp_local_adapt"] = gp_local_adapt
gp_local_adapt + lines + dots + ggplot_theme

<ggplot: (8793542320137)>

In [33]:
# logBF per SNP, colored by assigned factor, with the x range pinned to [0, number of SNPs].
gp_local_adapt = gp.ggplot(f, gp.aes('SNP', 'logBF', color='Factor', xmin=0, xmax=len(f)))
gp_local_adapt += gp.geom_point(alpha=0.6)
# Dashed black reference line at logBF = 0.
gp_local_adapt += gp.geom_hline(yintercept=[0], show_guide=True, color='k', linetype='--', size=3)
# gp_local_adapt += gp.geom_text(gp.aes(x="x", y="y", label="LineKind"),data=zones)
gp_local_adapt += ggplot_theme
# Replaces any figure previously stored under the same "gp_local_adapt" key.
my_plots["gp_local_adapt"] = gp_local_adapt

# Bare last expression so the notebook renders the figure.
gp_local_adapt

<ggplot: (8793542307649)>

In [37]:
# SNPs with non-negative logBF, in ascending order of logBF.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
f.query("logBF >= 0").sort_values('logBF')

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor,SNP
55935,0.00119,-2.954569,0.031516,0.968484,Factor 2,55936
39523,0.012381,-2.943377,0.030552,0.969448,Factor 2,39524
22278,0.05476,-2.900998,0.028083,0.971917,Factor 2,22279
40971,0.098694,-2.857064,0.0252,0.9748,Factor 2,40972
22487,0.145718,-2.810041,0.022444,0.977556,Factor 2,22488
22488,0.145904,-2.809855,0.022371,0.977629,Factor 2,22489
