# Table of Contents
 <p><div class="lev1"><a href="#Imports"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></div><div class="lev1"><a href="#Paths"><span class="toc-item-num">2&nbsp;&nbsp;</span>Paths</a></div><div class="lev1"><a href="#Functions"><span class="toc-item-num">3&nbsp;&nbsp;</span>Functions</a></div><div class="lev1"><a href="#Load,-recode,-and-dump-to-sqlite"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load, recode, and dump to sqlite</a></div><div class="lev2"><a href="#Add-left/right-labels-for-CHR1/CHR2"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Add left/right labels for CHR1/CHR2</a></div><div class="lev2"><a href="#Dump-to-sqlite"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Dump to sqlite</a></div><div class="lev1"><a href="#How-many-SNPs-do-we-have-per-single-scaffold-end?"><span class="toc-item-num">5&nbsp;&nbsp;</span>How many SNPs do we have per single scaffold end?</a></div><div class="lev1"><a href="#Look-at-distributions-of-R2-for-a-single-scaffold-end"><span class="toc-item-num">6&nbsp;&nbsp;</span>Look at distributions of R2 for a single scaffold end</a></div>

# Imports

In [1]:
%matplotlib inline

In [18]:
import sqlalchemy
from sqlalchemy import create_engine

import seaborn as sns

import pandas as pd

import dask.dataframe as dd

import blaze

In [3]:
sns.set_context('talk')

# Paths

In [4]:
WIN = 50000
ld_= "/home/gus/MEGAsync/zim/main/Yale/Projects/G_fuscipes/ddRAD/scaffold_organization_ld/pipeline_results/testing_development_ends_50K/interscaff_ld/scaffold_ends.indep.interchrom.geno.ld"

# Functions

In [5]:
def recode_and_add_end_labels(df, win):
    df2 = df.rename(columns={'R^2': 'R2'})
    df2 = df2.assign(CHR1_END=df2.POS1.apply(lambda i: "left" if i < win else "right"))
    df2 = df2.assign(CHR2_END=df2.POS2.apply(lambda i: "left" if i < win else "right"))
    
    return df2

# Load, recode, and dump to sqlite

In [8]:
ld = dd.read_csv(ld_, sep='\t')

In [9]:
ld.head()

Unnamed: 0,CHR1,POS1,CHR2,POS2,N_INDV,R^2
0,Scaffold0,30544,Scaffold2,22664,46,0.001151
1,Scaffold0,30544,Scaffold2,34926,44,0.001149
2,Scaffold0,30544,Scaffold2,34934,44,0.006327
3,Scaffold0,30544,Scaffold2,34945,44,0.000766
4,Scaffold0,30544,Scaffold2,34956,44,0.016272


## Add left/right labels for CHR1/CHR2

In [10]:
ld = recode_and_add_end_labels(ld,WIN)

  Before: .apply(func)
  After:  .apply(func, name=['x', 'y']) for dataframe result
  or:     .apply(func, name='x')        for series result


In [11]:
ld.head()

Unnamed: 0,CHR1,POS1,CHR2,POS2,N_INDV,R2,CHR1_END,CHR2_END
0,Scaffold0,30544,Scaffold2,22664,46,0.001151,left,left
1,Scaffold0,30544,Scaffold2,34926,44,0.001149,left,left
2,Scaffold0,30544,Scaffold2,34934,44,0.006327,left,left
3,Scaffold0,30544,Scaffold2,34945,44,0.000766,left,left
4,Scaffold0,30544,Scaffold2,34956,44,0.016272,left,left


## Dump to sqlite

In [14]:
lddf = ld.compute()

In [17]:
sqlite_db = "{base}.db".format(base=ld_)

In [15]:
ldd.to_sql()

pandas.core.frame.DataFrame

# How many SNPs do we have per single scaffold end?

In [39]:
# snps_per_end = ld.groupby(["CHR1","CHR1_END"])['POS1'].unique().apply(lambda x: len(x)).unstack()
snps_per_end = ld.groupby(["CHR1","CHR1_END"])

In [44]:
t = snps_per_end.POS1.nunique()

In [None]:
t.compute()

In [None]:
t.head()

In [None]:
sns.distplot(snps_per_end,kde=False);

# Look at distributions of R2 for a single scaffold end

In [25]:
df = ld.query(""" CHR1 == 'Scaffold0' """)

In [24]:
ld.head()

Unnamed: 0,CHR1,POS1,CHR2,POS2,N_INDV,R2,CHR1_END,CHR2_END
0,Scaffold0,30544,Scaffold2,22664,46,0.001151,left,left
1,Scaffold0,30544,Scaffold2,34926,44,0.001149,left,left
2,Scaffold0,30544,Scaffold2,34934,44,0.006327,left,left
3,Scaffold0,30544,Scaffold2,34945,44,0.000766,left,left
4,Scaffold0,30544,Scaffold2,34956,44,0.016272,left,left


In [26]:
df.head()

Unnamed: 0,CHR1,POS1,CHR2,POS2,N_INDV,R2,CHR1_END,CHR2_END
0,Scaffold0,30544,Scaffold2,22664,46,0.001151,left,left
1,Scaffold0,30544,Scaffold2,34926,44,0.001149,left,left
2,Scaffold0,30544,Scaffold2,34934,44,0.006327,left,left
3,Scaffold0,30544,Scaffold2,34945,44,0.000766,left,left
4,Scaffold0,30544,Scaffold2,34956,44,0.016272,left,left


In [21]:
df.POS1.unique()

array([13132, 31216])

In [37]:
df.groupby(["CHR1"])["POS1"].unique()

CHR1
Scaffold365    [13132, 31216]
Name: POS1, dtype: object