In [1]:
# %matplotlib inline
%matplotlib
import re

from sklearn.decomposition import PCA

import pybioclim as pbc

import geopandas as gpd

import numpy as np
import pandas as pd
import seaborn as sns
tips = sns.load_dataset("tips")

import ggplot as gp
import matplotlib.pyplot as plt

import vcf

Using matplotlib backend: TkAgg


In [2]:
# Global figure styling shared by every plot in this notebook.

# seaborn: large "poster" text sizes on a white grid with a 5-colour HLS palette
sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette(
    sns.hls_palette(n_colors=5, h=0.59, l=0.4, s=0.75)
)

# ggplot: matching seaborn-like theme at the same "poster" size
ggplot_theme = gp.theme_seaborn(context='poster')

# Colour assigned to each sampling-site code.
site_cmap = {
    "KG": "#FF0000",
    "OT": "#000000",
    "MS": "#0000FF",
    "NB": "#00FF00",
}

# Same four colours as a plain list, fed to ggplot's manual colour scale.
color_list = [
    "#FF0000",
    "#0000FF",
    "#00FF00",
    "#000000",
]
manual_color_scale = gp.scale_color_manual(values=color_list)



In [3]:
# Paths
# NOTE(review): these are absolute paths on one workstation; the notebook
# will not run elsewhere without editing them.

# VCF of SNP calls (not referenced by any cell visible in this section).
vcf_path = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/tsetseFINAL_14Oct2014_f2_53.recode.renamed_scaffolds.maf0_05.OT_MS_NB_indv.recode.vcf"

# pcadapt_dir = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/SNPs/vcftools_out/ddrad58_populations/individuals/PCAdapt/results"

# Directory handed to pybioclim as its DATA_DIR; the combined output CSV is
# also written here.
bioclim_dir = "/home/gus/data/ddrad/environmental/www.worldclim.org/bioclim"

# CSV of sampling sites; later cells read its `lat`, `long` and `code` columns.
pop_coords = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/manuscript/locations_gps_andrea.csv"



In [4]:
# load/define stuff

## my version of the bioclim data
# Make pybioclim read its data from the directory configured above.
pbc.DATA_DIR = bioclim_dir

## coord file
# `pd.DataFrame.from_csv` was deprecated and then removed from pandas.
# It used the first CSV column as the index, and the old code immediately
# undid that with `.reset_index()`, so a plain `read_csv` yields the same
# frame: every CSV column as a regular column on a default integer index.
pop_data = pd.read_csv(pop_coords)

## bioclims to use
# Maps each bioclim layer name used in this analysis to a human-readable
# description. Only the keys are used by the visible cells (as layer names
# for pybioclim and as column names); bio1, bio10, bio11, bio16 and bio17
# are deliberately absent from this selection.

bioclims = {"bio2": "Mean Diurnal Range (Mean of monthly (max temp - min temp))",
            "bio3": "Isothermality (BIO2/BIO7) (* 100)",
            "bio4": "Temperature Seasonality (standard deviation *100)",
            "bio5": "Max Temperature of Warmest Month",
            "bio6": "Min Temperature of Coldest Month",
            "bio7": "Temperature Annual Range (BIO5-BIO6)",
            "bio8": "Mean Temperature of Wettest Quarter",
            "bio9": "Mean Temperature of Driest Quarter",
            "bio12": "Annual Precipitation",
            "bio13": "Precipitation of Wettest Month",
            "bio14": "Precipitation of Driest Month",
            "bio15": "Precipitation Seasonality (Coefficient of Variation)",
            "bio18": "Precipitation of Warmest Quarter",
            "bio19": "Precipitation of Coldest Quarter"}

In [5]:
# get data we care about
def add_bioclims(df, clim_list):
    """Add one column per bioclim layer to ``df`` and return it.

    For every layer name in ``clim_list`` the site coordinates (the ``lat``
    and ``long`` columns, in that order) are passed to ``pbc.get_values`` and
    the result is stored in a column named after the layer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat`` and ``long`` columns. Modified in place.
    clim_list : iterable of str
        Layer names understood by ``pbc.get_values`` (e.g. ``"bio12"``).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the new columns.
    """
    for clim in clim_list:
        # `.values` replaces `DataFrame.as_matrix()`, which was removed from
        # pandas; both give the coordinates as a 2-D NumPy array.
        coords = df[['lat', 'long']].values
        df[clim] = pbc.get_values(clim, coords)
    return df

In [9]:
# Preview the site table.
# NOTE(review): the bioclim columns are only present if the `add_bioclims`
# call (placed further down the notebook) has already been run, so this cell
# depends on out-of-order execution.
pop_data.head()

Unnamed: 0,code,name,lat,long,color,indivs,bio12,bio13,bio14,bio15,bio18,bio19,bio2,bio3,bio6,bio7,bio4,bio5,bio8,bio9
0,KG,Kalangala,-0.323734,32.293724,#FF0000,3,1918,302,79,45,436,306,103,83,153,124,525,277,219,208
1,OT,Otuboi,1.918258,33.302457,#000000,14,1312,194,17,51,134,447,129,79,165,163,1015,328,222,245
2,MS,Masindi,1.683327,31.734009,#0000FF,12,1330,174,33,41,186,388,119,81,166,146,795,312,223,238
3,NB,Natumba,0.836069,33.68582,#00FF00,24,1322,202,40,40,214,310,121,82,166,147,691,313,234,238


In [6]:
# Attach one column per selected bioclim layer to the site table.
# add_bioclims writes the columns onto the frame it is given and returns that
# same frame, so this rebinding does not create a separate copy.
pop_data = add_bioclims(pop_data, bioclims.keys())

In [10]:
# Save the site table, including the bioclim columns, next to the bioclim
# data; the integer row index is not written.
pop_data.to_csv(bioclim_dir + "/pop_coords_and_bioclim_data.csv",index=False)

In [25]:
# Show only the bioclim columns (those whose name starts with "bio").
pop_data.loc[:, [name.startswith('bio') for name in pop_data.columns]]

Unnamed: 0,bio12,bio13,bio14,bio15,bio18,bio19,bio2,bio3,bio6,bio7,bio4,bio5,bio8,bio9
0,1918,302,79,45,436,306,103,83,153,124,525,277,219,208
1,1312,194,17,51,134,447,129,79,165,163,1015,328,222,245
2,1330,174,33,41,186,388,119,81,166,146,795,312,223,238
3,1322,202,40,40,214,310,121,82,166,147,691,313,234,238


In [205]:
# standardize data
#
# Each bioclim column is mean-centred and divided by its range (max - min).
# Note that this is range scaling, not division by the standard deviation.

# Pick the bioclim columns, ordered by their numeric suffix (bio2, bio3, ..., bio19).
clim_data = pop_data[sorted(bioclims.keys(), key=lambda name: int(name[3:]))]

# Column-wise: (value - column mean) / (column max - column min)
clim_data_norm = clim_data.sub(clim_data.mean()).div(clim_data.max() - clim_data.min())

# Keep a purely numeric copy before the site label is attached.
clim_data_norm_noindex = clim_data_norm.copy()

clim_data_norm["site"] = pop_data.code
clim_data_norm

Unnamed: 0,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,bio12,bio13,bio14,bio15,bio18,bio19,site
0,-0.576923,0.4375,-0.472449,-0.598039,-0.730769,-0.538462,-0.366667,-0.655405,0.738449,0.65625,0.592742,0.068182,0.640728,-0.402482,KG
1,0.423077,-0.5625,0.527551,0.401961,0.192308,0.461538,-0.166667,0.344595,-0.261551,-0.1875,-0.407258,0.613636,-0.359272,0.597518,OT
2,0.038462,-0.0625,0.078571,0.088235,0.269231,0.025641,-0.1,0.155405,-0.231848,-0.34375,-0.149194,-0.295455,-0.187086,0.179078,MS
3,0.115385,0.1875,-0.133673,0.107843,0.269231,0.051282,0.633333,0.155405,-0.24505,-0.125,-0.03629,-0.386364,-0.094371,-0.374113,NB


In [206]:
# make melting easier

def gather(df, key, value, cols):
    """Melt ``cols`` of ``df`` into long form (similar to tidyr's ``gather``).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table to reshape.
    key : str
        Name of the output column holding the former column names.
    value : str
        Name of the output column holding the values.
    cols : str or iterable
        Column(s) to melt. Every other column of ``df`` is kept as an
        identifier column.

    Returns
    -------
    pandas.DataFrame
        Long-form frame: identifier columns, then ``key``, then ``value``.
    """
    # `cols` is used twice (membership test, then value_vars), so materialise
    # it once: a generator or other one-shot iterable would otherwise be
    # exhausted before it reaches pd.melt. A bare string is one column name,
    # not a sequence of characters to test by substring.
    if isinstance(cols, str):
        cols = [cols]
    else:
        cols = list(cols)
    id_vars = [col for col in df.columns if col not in cols]
    # Keyword arguments keep the call independent of melt's parameter order.
    return pd.melt(df, id_vars=id_vars, value_vars=cols,
                   var_name=key, value_name=value)

In [207]:
# Long form of the scaled table: one row per (site, bioclim variable) pair.
# The value column is labelled "Std Devs", although the scaling cell divides
# by each column's range rather than its standard deviation.
clim_data_norm_melted = gather(
    clim_data_norm,
    key="bioclim",
    value="Std Devs",
    cols=bioclims.keys(),
)
clim_data_norm_melted.head()

Unnamed: 0,site,bioclim,Std Devs
0,KG,bio12,0.738449
1,OT,bio12,-0.261551
2,MS,bio12,-0.231848
3,NB,bio12,-0.24505
4,KG,bio13,0.65625


In [208]:
# One small panel per bioclim variable, sharing the y axis so the scaled
# values can be compared across panels.
g = sns.FacetGrid(clim_data_norm_melted, col="bioclim", col_wrap=4, sharey=True)
# FacetGrid.map calls pointplot once per facet on that facet's rows only, so
# the category order must be given explicitly to guarantee the same site sits
# at the same x position in every panel (seaborn warns when it is omitted).
# Order of first appearance matches what pointplot would infer here.
g.map(sns.pointplot, "site", "Std Devs",
      order=list(clim_data_norm_melted["site"].unique()),
      color=".3", ci=None);

In [209]:
# Clustered heatmap of the scaled bioclim table; the numeric-only copy is
# used, so the site column is not included.
# Note: this rebinds `g`, which previously held the FacetGrid.
g = sns.clustermap(clim_data_norm_noindex)

# PCA after normalization of bioclims

In [210]:
# Scaled bioclim table with rows labelled by site code instead of 0..n-1;
# this is the matrix the PCA is fitted on.
clim_data_norm_site_index = clim_data_norm_noindex.copy()
clim_data_norm_site_index.index = pop_data.code.values
# A bare `clim_data_norm_site_index.to_clipboard` used to sit here. It only
# looked the method up without calling it, so it had no effect and has been
# dropped. Call `clim_data_norm_site_index.to_clipboard()` explicitly if a
# clipboard copy is wanted.
clim_data_norm_site_index

Unnamed: 0,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,bio12,bio13,bio14,bio15,bio18,bio19
KG,-0.576923,0.4375,-0.472449,-0.598039,-0.730769,-0.538462,-0.366667,-0.655405,0.738449,0.65625,0.592742,0.068182,0.640728,-0.402482
OT,0.423077,-0.5625,0.527551,0.401961,0.192308,0.461538,-0.166667,0.344595,-0.261551,-0.1875,-0.407258,0.613636,-0.359272,0.597518
MS,0.038462,-0.0625,0.078571,0.088235,0.269231,0.025641,-0.1,0.155405,-0.231848,-0.34375,-0.149194,-0.295455,-0.187086,0.179078
NB,0.115385,0.1875,-0.133673,0.107843,0.269231,0.051282,0.633333,0.155405,-0.24505,-0.125,-0.03629,-0.386364,-0.094371,-0.374113


In [211]:
# Fit a 3-component PCA to the site-by-bioclim matrix of scaled values.
pca = PCA(n_components=3)
# The object returned by fit() is kept under a descriptive name; its repr
# below shows it is the PCA estimator.
clim_data_norm_pca = pca.fit(clim_data_norm_site_index)
clim_data_norm_pca

PCA(copy=True, n_components=3, whiten=False)

In [212]:
# Share of the total variance captured by each of the three fitted components.
clim_data_norm_pca.explained_variance_ratio_

array([ 0.77193143,  0.19556318,  0.03250539])

In [213]:
# Project each site onto the fitted components, keeping the site codes as
# row labels and naming the columns after the components.
clim_data_norm_pca_r = pd.DataFrame(
    data=clim_data_norm_pca.transform(clim_data_norm_site_index),
    columns=["Component 1", "Component 2", "Component 3"],
    index=clim_data_norm_site_index.index,
)
# Display with the site codes as an ordinary column (named "index").
clim_data_norm_pca_r.reset_index()

Unnamed: 0,index,Component 1,Component 2,Component 3
0,KG,-2.065423,-0.361757,0.026756
1,OT,1.357381,-0.771967,0.159583
2,MS,0.495918,0.212059,-0.429272
3,NB,0.212124,0.921666,0.242933


In [214]:
# Scatter of the sites on Component 2 (x) versus Component 3 (y).
# reset_index() exposes the site codes as the "index" column, which drives
# the point colour through the manual colour scale.
# plt.figure(figsize=(12,12))
bioclim_pca_2x3 = gp.ggplot(clim_data_norm_pca_r.reset_index(), 
                            gp.aes('Component 2', 'Component 3', color="index")) 
bioclim_pca_2x3 += gp.geom_point(size=50)
bioclim_pca_2x3 += ggplot_theme
bioclim_pca_2x3 += manual_color_scale
bioclim_pca_2x3

<ggplot: (8744021369985)>

In [215]:
# Scatter of the sites on Component 1 (x) versus Component 2 (y).
# This cell previously mapped 'Component 3' against 'Component 2', which
# contradicted the variable name and only repeated the 2-vs-3 plot with the
# axes swapped; the first two components were never plotted together.
# reset_index() exposes the site codes as the "index" column, which drives
# the point colour through the manual colour scale.
bioclim_pca_1x2 = gp.ggplot(clim_data_norm_pca_r.reset_index(),
                            gp.aes('Component 1', 'Component 2', color="index"))
bioclim_pca_1x2 += gp.geom_point(size=50)
bioclim_pca_1x2 += ggplot_theme
bioclim_pca_1x2 += manual_color_scale
bioclim_pca_1x2

<ggplot: (8744021366453)>

In [232]:
# Scatter of the sites on Component 1 (x) versus Component 3 (y).
# reset_index() exposes the site codes as the "index" column, which drives
# the point colour through the manual colour scale.
# plt.figure(figsize=(12,12))
bioclim_pca_1x3 = gp.ggplot(clim_data_norm_pca_r.reset_index(), 
                            gp.aes('Component 1', 'Component 3', color="index")) 
bioclim_pca_1x3 += gp.geom_point(size=50)
bioclim_pca_1x3 += ggplot_theme
bioclim_pca_1x3 += manual_color_scale
bioclim_pca_1x3

<ggplot: (8744022112665)>

In [231]:
# Loading of every bioclim variable on the first principal component.
# The notebook runs with an interactive matplotlib backend, where pyplot
# draws on whatever axes is current. A dedicated figure/axes stops these
# bars landing on top of an earlier plot.
fig, ax = plt.subplots()
sns.barplot(x=clim_data_norm_site_index.columns.values,
            y=clim_data_norm_pca.components_[0],
            color='k', ax=ax)
ax.set_ylabel("Loadings")
ax.set_title("Component 1");

<matplotlib.text.Text at 0x7f3e05175c50>

In [218]:
# Loading of every bioclim variable on the second principal component.
# The notebook runs with an interactive matplotlib backend, where pyplot
# draws on whatever axes is current. A dedicated figure/axes stops these
# bars landing on top of an earlier plot.
fig, ax = plt.subplots()
sns.barplot(x=clim_data_norm_site_index.columns.values,
            y=clim_data_norm_pca.components_[1],
            color='k', ax=ax)
ax.set_ylabel("Loadings")
ax.set_title("Component 2");

<matplotlib.text.Text at 0x7f3e048151d0>

In [219]:
# Loading of every bioclim variable on the third principal component.
# The notebook runs with an interactive matplotlib backend, where pyplot
# draws on whatever axes is current. A dedicated figure/axes stops these
# bars landing on top of an earlier plot.
fig, ax = plt.subplots()
sns.barplot(x=clim_data_norm_site_index.columns.values,
            y=clim_data_norm_pca.components_[2],
            color='k', ax=ax)
ax.set_ylabel("Loadings")
ax.set_title("Component 3");

<matplotlib.text.Text at 0x7f3e048151d0>

In [220]:
# Loadings on the third principal component (components_[2]).
# NOTE(review): this repeats the preceding Component 3 loadings plot.
# Drawn on its own figure/axes so it does not overlay whatever axes is
# current under the interactive backend, and labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
sns.barplot(x=clim_data_norm_site_index.columns.values,
            y=clim_data_norm_pca.components_[2],
            color='k', ax=ax)
ax.set_ylabel("Loadings")
ax.set_title("Component 3");

<matplotlib.axes._subplots.AxesSubplot at 0x7f3e046da2d0>

In [221]:
# Absolute loadings on the first principal component (components_[0]),
# i.e. how strongly each bioclim variable contributes regardless of sign.
# Drawn on its own figure/axes so it does not overlay whatever axes is
# current under the interactive backend, and labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
sns.barplot(x=clim_data_norm_site_index.columns.values,
            y=abs(clim_data_norm_pca.components_[0]),
            color='k', ax=ax)
ax.set_ylabel("Absolute loadings")
ax.set_title("Component 1");

<matplotlib.axes._subplots.AxesSubplot at 0x7f3e046da2d0>

In [222]:
# Bioclim column names with their first three characters ("bio") dropped.
[label[3:] for label in clim_data_norm_site_index.columns]

['2', '3', '4', '5', '6', '7', '8', '9', '12', '13', '14', '15', '18', '19']