## Imports:

In [70]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import ggplot as gp


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 60)
# import tables as h5

import itertools as it
from collections import defaultdict

import numpy as np
import pandas as pd

xls_writer = pd.ExcelWriter


import scipy
from scikits import bootstrap as bs
import statsmodels.api as sm
import statsmodels.stats.multitest as smm

import munch

import pymc as mc

from spartan.utils.genome_specific.GfusI1 import GfusI1_0
from spartan.utils.fastas import ParseFastA


## File paths:

In [98]:
# Define paths to files.
# NOTE(review): the two string literals below are absolute paths on one
# machine; they must be edited before this notebook can run anywhere else.

# Directory holding the three manuscript files named below; the renamed
# copies produced later in the notebook are written here as well.
base_out_dir = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/manuscript/rename_contig_in_files"


# CSV with (at least) `kk_name` and `scaf_name` columns, used to build the
# contig-name lookup.
contig_name_length_path = "/home/gus/Dropbox/uganda_data/data_repos/genome_info/assembly_info/contig_name_length.csv"

# Input files whose contig names are recoded by the cells that follow.
TajD_only_nonzero_path = base_out_dir + "/TajD_only_nonzero.txt"
TableS1_Top10_summary_path = base_out_dir + "/TableS1_Top10_summary.xlsx"
FileS2_Env_outlierLoci_path = base_out_dir + "/FileS2_Env_outlierLoci.xlsx"


In [99]:
# HELPER FUNCTIONS

def recode_taj_chrom(df, name_map):
    """Rename the contigs in the ``CHROM`` column of a Tajima's D table.

    Every ``CHROM`` value is first cut down to the text after its last ``:``
    (``GFvariants_VB2014a_tvcf:KK351785.1`` becomes ``KK351785.1``); the
    stripped value is then looked up in ``name_map``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string ``CHROM`` column.  It is modified in place.
    name_map : mapping
        Lookup from stripped contig name to its replacement name.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        When ``name_map`` is a plain dict and a stripped name is not a key.
    """
    def strip_prefix(chrom):
        return chrom.split(':')[-1]

    def to_new_name(contig):
        return name_map[contig]

    # Two passes, each written back to the frame before the next one runs.
    df.CHROM = df.CHROM.apply(strip_prefix)
    df.CHROM = df.CHROM.apply(to_new_name)

    return df

def recode_Top10_summary_Scaffold(df, name_map):
    """Replace each value of the ``Scaffold`` column with ``name_map[value]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Scaffold`` column.  It is modified in place.
    name_map : mapping
        Lookup from the current scaffold name to its replacement name.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        When ``name_map`` is a plain dict and a scaffold name is not a key.
    """
    def to_new_name(contig):
        return name_map[contig]

    df.Scaffold = df.Scaffold.apply(to_new_name)
    return df

def get_contig_name_map(df):
    """Build a ``kk_name -> scaf_name`` lookup from a contig-info table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``kk_name`` and ``scaf_name`` columns.

    Returns
    -------
    dict
        One entry per row, keyed by ``kk_name`` with ``scaf_name`` as value.
        If a ``kk_name`` occurs more than once, the last row wins.
    """
    # Read from the argument: the previous version ignored `df` and silently
    # depended on a global named `contig_info` existing in the kernel.
    return dict(zip(df["kk_name"].values, df["scaf_name"].values))

def save_multi_sheet_xls(dict_dfs, xls_path):
    """Write several DataFrames into one Excel workbook, one sheet each.

    Parameters
    ----------
    dict_dfs : dict
        Maps sheet name to the DataFrame written on that sheet.  Row indexes
        are not written.
    xls_path : str
        Path of the workbook to create.
    """
    # Using the writer as a context manager closes it when the block exits,
    # even if writing one of the sheets raises, and avoids the explicit
    # `writer.save()` call that newer pandas releases no longer provide.
    with xls_writer(xls_path) as writer:
        for name, table in dict_dfs.items():
            table.to_excel(writer, sheet_name=name, index=False)

In [41]:
# load contig info file
contig_info = pd.read_csv(contig_name_length_path)
contig_info.head()

Unnamed: 0,kk_name,scaf_name,length
0,KK352346.1,Scaffold566,193315
1,KK352610.1,Scaffold839,83110
2,KK352241.1,Scaffold458,243873
3,JFJR01012964.1,JFJR01012964.1,3083
4,KK352052.1,Scaffold268,427914


In [42]:
contig_name_map = get_contig_name_map(contig_info)
contig_name_map["KK351785.1"]

'Scaffold0'

In [49]:
contig_name_map["JFJR01006593.1"]

'JFJR01006593.1'

------------------

#  Convert `TajD_only_nonzero`

In [46]:
!head $TajD_only_nonzero_path

	CHROM	BIN_START	N_SNPS	TajimaD	chrom_num
0	GFvariants_VB2014a_tvcf:KK351785.1	13000	1	0.0	1
10	GFvariants_VB2014a_tvcf:KK351785.1	23000	4	2.35852	1
17	GFvariants_VB2014a_tvcf:KK351785.1	30000	1	0.0	1
31	GFvariants_VB2014a_tvcf:KK351785.1	44000	1	0.0	1
38	GFvariants_VB2014a_tvcf:KK351785.1	51000	1	0.0	1
43	GFvariants_VB2014a_tvcf:KK351785.1	56000	1	0.0	1
53	GFvariants_VB2014a_tvcf:KK351785.1	66000	1	0.0	1
65	GFvariants_VB2014a_tvcf:KK351785.1	78000	1	0.0	1
70	GFvariants_VB2014a_tvcf:KK351785.1	83000	1	0.0	1


In [43]:
TajD_only_nonzero = pd.read_csv(TajD_only_nonzero_path, sep='\t')
TajD_only_nonzero = recode_taj_chrom(TajD_only_nonzero, name_map=contig_name_map)
TajD_only_nonzero.head()

Unnamed: 0.1,Unnamed: 0,CHROM,BIN_START,N_SNPS,TajimaD,chrom_num
0,0,Scaffold0,13000,1,0.0,1
1,10,Scaffold0,23000,4,2.35852,1
2,17,Scaffold0,30000,1,0.0,1
3,31,Scaffold0,44000,1,0.0,1
4,38,Scaffold0,51000,1,0.0,1


### Write out file.

In [48]:
# Hand `to_excel` the output path rather than a freshly built ExcelWriter:
# when given a writer object pandas leaves saving to the caller, and the
# writer created inline here was never saved or closed.  With a path, pandas
# creates, saves and closes the workbook itself.
# The remaining keyword arguments of the earlier call were all defaults.
TajD_only_nonzero.to_excel(base_out_dir + "/TajD_only_nonzero.xls",
                           sheet_name='Sheet1',
                           index=False)

-------------------

#  Convert `TableS1_Top10_summary`

In [84]:
# `sheetname=None` loads every sheet into a dict keyed by sheet name.
# The `sep` argument was dropped: a field separator has no meaning for an
# Excel workbook, and the FileS2 workbook is loaded without it.
# NOTE(review): newer pandas releases spell this keyword `sheet_name`.
TableS1_Top10_summary = pd.read_excel(TableS1_Top10_summary_path, sheetname=None)
TableS1_Top10_summary.keys()

[u'Infection_Overall', u'Infection_Overlap_inPopulations']

In [85]:
# Recode the Scaffold column of every sheet and store the result back under
# the same sheet name.  The items are snapshotted so the dict can be assigned
# to while looping.
for sheet, table in list(TableS1_Top10_summary.items()):
    TableS1_Top10_summary[sheet] = recode_Top10_summary_Scaffold(table, name_map=contig_name_map)

In [87]:
TableS1_Top10_summary['Infection_Overlap_inPopulations'].head()
# TableS1_Top10_summary['Infection_Overall'].head()

Unnamed: 0,SNP,Scaffold,Position,Included in top 5% alpha values
0,1,Scaffold203,479273,
1,2,Scaffold23,449944,
2,3,Scaffold2,2593209,
3,4,Scaffold73,149768,*
4,5,Scaffold355,306768,


### Write out file

In [95]:
# Save all recoded sheets into a single workbook inside `base_out_dir`.
xls_path = base_out_dir + "/TableS1_Top10_summary_kk_name_to_scaf_name.xls"
save_multi_sheet_xls(TableS1_Top10_summary, xls_path)

----------

#  Convert `FileS2_Env_outlierLoci`

### Load, recode, and write out file

In [100]:
FileS2_Env_outlierLoci = pd.read_excel(FileS2_Env_outlierLoci_path, sheetname=None)
FileS2_Env_outlierLoci.keys()

[u'Outlier_MSNB', u'Outliers_MSOT']

In [101]:
# Recode the Scaffold column of every sheet and store the result back under
# the same sheet name.  The items are snapshotted so the dict can be assigned
# to while looping.
for sheet, table in list(FileS2_Env_outlierLoci.items()):
    FileS2_Env_outlierLoci[sheet] = recode_Top10_summary_Scaffold(table, name_map=contig_name_map)

In [102]:
FileS2_Env_outlierLoci['Outlier_MSNB'].head()

Unnamed: 0,SNP,Scaffold,Position
0,1,Scaffold8,1482581
1,2,Scaffold85,94213
2,3,Scaffold107,378349
3,4,Scaffold206,259639
4,5,Scaffold427,13828


In [103]:
# Save all recoded sheets into a single workbook inside `base_out_dir`.
xls_path = base_out_dir + "/FileS2_Env_outlierLoci_kk_name_to_scaf_name.xls"
save_multi_sheet_xls(FileS2_Env_outlierLoci, xls_path)