In [1]:
# %matplotlib inline
%matplotlib
import re

import numpy as np
import pandas as pd
import seaborn as sns

import ggplot as gp
import matplotlib.pyplot as plt

import vcf

Using matplotlib backend: TkAgg


In [2]:
# set figure characteristics

# size
sns.set_context("poster")

# Grid style
sns.set_style("whitegrid")

# colors
sns.set_palette(sns.hls_palette(n_colors=5, h=0.59, l=0.4, s=0.75))

ggplot_theme = gp.theme_seaborn(context='poster')

color_list = ["#0000FF","#00FF00","#000000"]

site_cmap =     {"KG": "#FF0000",
                 "OT": "#000000",
                 "MS": "#0000FF",
                 "NB": "#00FF00"}
manual_color_scale = gp.scale_color_manual(values=color_list)



In [3]:
# Paths
vcf_path = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/tsetseFINAL_14Oct2014_f2_53.recode.renamed_scaffolds.maf0_05.OT_MS_NB_indv.recode.vcf"

pcadapt_dir = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/PCAdapt/results"

pcadapt_snps = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/tsetseFINAL_14Oct2014_f2_53.recode.renamed_scaffolds.maf0_05.OT_MS_NB_indv.recode.vcfsnp"

In [4]:
def recode_column(column_name):
    
    if isinstance(column_name,int):
        i = column_name + 1
        return "Factor {i}".format(i=i)
    
    if column_name.upper() == "INDEX":
        return "sample"
    
def load_scores(pcadapt_scores_path, vcf_path):
    vcf_reader = vcf.Reader(open(vcf_path, 'r'))
    indiv_names = vcf_reader.samples
    
    df = pd.read_table(filepath_or_buffer=pcadapt_scores_path, sep='\s', 
                                             header=None, names=indiv_names,
                                             engine='python').T.reset_index()
    df = df.rename(columns=recode_column)
    
    return df

def load_main(pcadapt_path):
    
    return pd.read_table(filepath_or_buffer=pcadapt_path, sep='\t', engine='python').dropna(axis=1)

In [5]:
# load files
PCAdapt_s1000_b500_k2_scores = load_scores(pcadapt_scores_path=pcadapt_dir+"/PCAdapt_s1000_b500_k2.scores",
                                           vcf_path=vcf_path)


In [6]:
PCAdapt_s1000_b500_k2 = load_main(pcadapt_path=pcadapt_dir+"/PCAdapt_s1000_b500_k2")

# Plotting functions

In [7]:
# set labels
def set_site_names(df):
    re_site = re.compile('\A\D{,3}\d+')
    df["site"] = df['sample'].apply(lambda indiv: re_site.findall(indiv)[0])
    df.sort(columns='sample', inplace=True)
    df.reset_index(inplace=True)
    df["individuals"] = df.index.values + 1

    return df.drop(["index"], axis=1)

# plot factors vs individuals

def plot_factors_v_indivs(df):
    
    plt.figure(figsize=(20, 6))
    f = sns.stripplot(x="Factor 2", y="Factor 1", hue="site", data=df, jitter=False, split=True)
#     f = plt.scatter(x=df["Factor 1"], y=df["Factor 2"], c=df["site"],cmap=sns.cmap)
    return f
    


In [8]:
my_plots = {}

In [9]:
PCAdapt_s1000_b500_k2_scores = set_site_names(PCAdapt_s1000_b500_k2_scores)

In [10]:
PCAdapt_s1000_b500_k2_scores.sort("Factor 2", inplace=True)
PCAdapt_s1000_b500_k2_scores.head(15)

Unnamed: 0,sample,Factor 1,Factor 2,site,individuals
33,NB11_085,32.01361,-28.961327,NB11,34
31,NB11_068,33.357046,-27.841724,NB11,32
34,NB11_093,31.813607,-27.639608,NB11,35
22,NB11_033,31.824043,-27.516137,NB11,23
29,NB11_063,32.472621,-27.435258,NB11,30
32,NB11_079,31.484468,-27.380439,NB11,33
35,NB11_096,32.214006,-27.330713,NB11,36
30,NB11_067,31.705143,-27.185626,NB11,31
25,NB11_043,31.487699,-26.972859,NB11,26
13,NB11_004,31.634048,-26.930801,NB11,14


In [11]:
# PCAdapt_s1000_b500_k2_scores.head(15)

In [12]:
gp_factor1 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('individuals', 'Factor 1', color='site')) \
    + gp.geom_point() \
    + ggplot_theme + manual_color_scale

my_plots["gp_factor1"] = gp_factor1 
gp_factor1

<ggplot: (8744729563437)>

In [13]:
gp_factor2 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('individuals', 'Factor 2', color='site')) \
+ gp.geom_point() \
+ ggplot_theme + manual_color_scale
my_plots["gp_factor2"] = gp_factor2
gp_factor2

<ggplot: (8744727326649)>

In [14]:
# plt.figure(figsize=(12,12))
gp_factor1x2 = gp.ggplot(PCAdapt_s1000_b500_k2_scores, gp.aes('Factor 1', 'Factor 2', color='site')) 
gp_factor1x2 += gp.geom_point()
gp_factor1x2 += ggplot_theme
gp_factor1x2 += manual_color_scale
my_plots["gp_factor1x2"] = gp_factor1x2
gp_factor1x2

<ggplot: (8744726949445)>

In [15]:
def get_factors_rename_map(df):    
    column_map = {}
    for k,v in zip(df.columns, list(np.arange(0,len(df.columns))+1)):
        column_map[k] = "Factor {v}".format(v=str(v))

    return column_map

def assign_factors_to_snps(df):
    factor_ps = df.iloc[:,2:]
    factor_name_map = get_factors_rename_map(factor_ps)
    factor_ps = factor_ps.rename(columns=factor_name_map)
    
    factor_id = factor_ps.apply(lambda x: x.argmax(), axis = 1)
    
    df['Factor'] = factor_id
    df.rename(columns=factor_name_map, inplace=True)
    
    return df

def add_snp_number(df):
    df["SNP"] = df.index +1
    return df

def plot_factor_SNPs(df):
    pass
    

In [16]:
PCAdapt_s1000_b500_k2.head()

Unnamed: 0,logBF,logPO,P(Z = 1|O),P(Z = 2|O)
0,-0.879953,-3.835711,0.809018,0.190982
1,-0.951201,-3.90696,0.786001,0.213999
2,-0.913359,-3.869118,0.800658,0.199342
3,-0.913434,-3.869192,0.800617,0.199383
4,-0.92117,-3.876929,0.798722,0.201278


In [17]:
PCAdapt_s1000_b500_k2_ = PCAdapt_s1000_b500_k2.copy()
PCAdapt_s1000_b500_k2_.head()

Unnamed: 0,logBF,logPO,P(Z = 1|O),P(Z = 2|O)
0,-0.879953,-3.835711,0.809018,0.190982
1,-0.951201,-3.90696,0.786001,0.213999
2,-0.913359,-3.869118,0.800658,0.199342
3,-0.913434,-3.869192,0.800617,0.199383
4,-0.92117,-3.876929,0.798722,0.201278


In [18]:
# pdb

In [19]:
f = assign_factors_to_snps(PCAdapt_s1000_b500_k2_)

In [20]:
f.head(18)

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor
0,-0.879953,-3.835711,0.809018,0.190982,Factor 1
1,-0.951201,-3.90696,0.786001,0.213999,Factor 1
2,-0.913359,-3.869118,0.800658,0.199342,Factor 1
3,-0.913434,-3.869192,0.800617,0.199383,Factor 1
4,-0.92117,-3.876929,0.798722,0.201278,Factor 1
5,-1.322631,-4.27839,0.510217,0.489783,Factor 1
6,-1.322629,-4.278387,0.510217,0.489783,Factor 1
7,-1.228764,-4.184522,0.604798,0.395202,Factor 1
8,-1.048532,-4.00429,0.734994,0.265006,Factor 1
9,-1.322631,-4.27839,0.510219,0.489781,Factor 1


In [21]:
f = add_snp_number(f)

In [22]:
f.head()

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor,SNP
0,-0.879953,-3.835711,0.809018,0.190982,Factor 1,1
1,-0.951201,-3.90696,0.786001,0.213999,Factor 1,2
2,-0.913359,-3.869118,0.800658,0.199342,Factor 1,3
3,-0.913434,-3.869192,0.800617,0.199383,Factor 1,4
4,-0.92117,-3.876929,0.798722,0.201278,Factor 1,5


In [23]:
zones = pd.DataFrame(data={'Strength of evidence':["Worth a mention","Substantial"],
                           'logBF':[0,0.5],
                           'SNP':[75000,75000],
                          })

In [24]:
zones.head()

Unnamed: 0,SNP,Strength of evidence,logBF
0,75000,Worth a mention,0.0
1,75000,Substantial,0.5


In [25]:
aes_lines = gp.aes(#linetype=None,
                   color="Strength of evidence",
                   x='SNP',
                   y='logBF', 
                   ) 

aes_dots = gp.aes(x='SNP', 
                  y='logBF', 
                  color='Factor')

In [26]:
lines = gp.geom_line(aes_lines, 
                     size=3, 
                     data=zones)

dots = gp.geom_point(aes_dots, 
                     alpha=0.6, 
                     data=f)

In [27]:
# gp_local_adapt = gp.ggplot(aes_dots,f)
# gp_local_adapt += gp.theme_seaborn()
# my_plots["gp_local_adapt"] = gp_local_adapt
# gp_local_adapt + lines + dots + ggplot_theme

In [28]:
gp_local_adapt = gp.ggplot(f, gp.aes('SNP', 'logBF', color='Factor', xmin=0, xmax=len(f)))
gp_local_adapt += gp.geom_point(alpha=0.6)
gp_local_adapt += gp.geom_hline(yintercept=[0], show_guide=True, color='k', linetype='--', size=3)
# gp_local_adapt += gp.geom_text(gp.aes(x="x", y="y", label="LineKind"),data=zones)
gp_local_adapt += ggplot_theme
my_plots["gp_local_adapt"] = gp_local_adapt

gp_local_adapt

<ggplot: (8744729563549)>

# Identify which SNPs are the probable outliers

In [29]:
snp_outliers = f.query("logBF >= 0").sort('logBF',ascending=False)
snp_outliers

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor,SNP
22488,0.145904,-2.809855,0.022371,0.977629,Factor 2,22489
22487,0.145718,-2.810041,0.022444,0.977556,Factor 2,22488
40971,0.098694,-2.857064,0.0252,0.9748,Factor 2,40972
22278,0.05476,-2.900998,0.028083,0.971917,Factor 2,22279
39523,0.012381,-2.943377,0.030552,0.969448,Factor 2,39524
55935,0.00119,-2.954569,0.031516,0.968484,Factor 2,55936


In [30]:
snps = pd.read_table(pcadapt_snps, sep=' ', header=None)

In [31]:
snps["SNP"] = snps.index.values + 1

In [32]:
snps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,SNP
0,Scaffold0,13388,.,T,C,999,.,.,GT:PL:DP:SP:GQ,,1
1,Scaffold0,23408,.,T,G,999,.,.,GT:PL:DP:SP:GQ,,2
2,Scaffold0,23418,.,G,C,999,.,.,GT:PL:DP:SP:GQ,,3
3,Scaffold0,23421,.,T,C,999,.,.,GT:PL:DP:SP:GQ,,4
4,Scaffold0,23431,.,G,A,999,.,.,GT:PL:DP:SP:GQ,,5


In [33]:
snp_outliers_joined_location = snp_outliers.join(snps, on='SNP', how='inner', lsuffix='outlier')
snp_outliers_joined_location

Unnamed: 0,logBF,logPO,Factor 1,Factor 2,Factor,SNPoutlier,0,1,2,3,4,5,6,7,8,9,SNP
22488,0.145904,-2.809855,0.022371,0.977629,Factor 2,22489,Scaffold150,13773,.,G,A,999,.,.,GT:PL:DP:SP:GQ,,22490
22487,0.145718,-2.810041,0.022444,0.977556,Factor 2,22488,Scaffold150,13772,.,G,A,999,.,.,GT:PL:DP:SP:GQ,,22489
40971,0.098694,-2.857064,0.0252,0.9748,Factor 2,40972,Scaffold250,396214,.,G,C,999,.,.,GT:PL:DP:SP:GQ,,40973
22278,0.05476,-2.900998,0.028083,0.971917,Factor 2,22279,Scaffold144,565217,.,A,T,999,.,.,GT:PL:DP:SP:GQ,,22280
39523,0.012381,-2.943377,0.030552,0.969448,Factor 2,39524,Scaffold207,475992,.,A,C,999,.,.,GT:PL:DP:SP:GQ,,39525
55935,0.00119,-2.954569,0.031516,0.968484,Factor 2,55936,Scaffold368,310507,.,T,C,999,.,.,GT:PL:DP:SP:GQ,,55937


In [34]:
snp_outliers_joined_location[["logBF",0,1]]

Unnamed: 0,logBF,0,1
22488,0.145904,Scaffold150,13773
22487,0.145718,Scaffold150,13772
40971,0.098694,Scaffold250,396214
22278,0.05476,Scaffold144,565217
39523,0.012381,Scaffold207,475992
55935,0.00119,Scaffold368,310507


In [35]:
# create bed string 

snp_outliers_joined_location_bed = snp_outliers_joined_location[[0,1]]
snp_outliers_joined_location_bed.columns = ["Seq","end"]
snp_outliers_joined_location_bed["start"] = snp_outliers_joined_location_bed['end'] - 1
snp_outliers_joined_location_bed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Seq,end,start
22488,Scaffold150,13773,13772
22487,Scaffold150,13772,13771
40971,Scaffold250,396214,396213
22278,Scaffold144,565217,565216
39523,Scaffold207,475992,475991
55935,Scaffold368,310507,310506


In [36]:
print snp_outliers_joined_location_bed.to_string()

               Seq     end   start
22488  Scaffold150   13773   13772
22487  Scaffold150   13772   13771
40971  Scaffold250  396214  396213
22278  Scaffold144  565217  565216
39523  Scaffold207  475992  475991
55935  Scaffold368  310507  310506
