# Population level coancestry matrix

In this notebook we generate a coancestry matrix between populations.
The populations were defined elsewhere, by an arbitrary partition of the fineSTRUCTURE dendrogram.
We use the Total Variation Distance (TVD) as a measure of the difference (or genetic distance) between clusters.

## Preliminaries

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Directory layout, anchored at the parent of the current working directory
projpath = os.path.realpath("..")
# Where result tables are read from and written to
pathres = os.path.join(projpath, "Results")
# Where the FineStructure output files live
pathfs = os.path.join(projpath, os.path.join("DataBases", "Genotypes", "FineStructure"))

In [3]:
#Opening the total samples file
# Read through a full path instead of os.chdir(): changing the working
# directory is process-wide state, and the project root is derived from the
# relative path "..", so it must not depend on which cells have already run.
dat = pd.read_csv(os.path.join(pathres, "tot_samples.csv"))

In [4]:
#Open the chunk count file
os.chdir(pathfs)
chunkout = pd.read_csv("total_fs_linked.chunkcounts.out", sep=" ", 
                       skiprows=1, index_col=0,
                       dtype={0:'str'})
chunklength = pd.read_csv("total_fs_linked.chunklengths.out", sep=" ", 
                       index_col=0,
                       dtype={0:'str'})

## Generate the matrix

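Calculate the TVD for every pair of clusters. For clusters $i$ and $j$, $TVD_{ij} = \frac{1}{2}\sum_{k \neq i,j} |p_{ik} - p_{jk}|$, where $p_{ik}$ is the mean proportion of the total chunk counts of the individuals in cluster $i$ that is attributed to the members of cluster $k$.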

In [5]:
# Number of clusters, taken as the largest cluster label.
# Series.max() skips missing labels, and int() guarantees a plain integer that
# can be used as an array dimension and in range(), even if the column was
# read as floating point.
# NOTE(review): assumes cluster labels run from 1 to nclusters -- confirm.
nclusters   = int(dat['cluster_initial'].max())
# Square nclusters x nclusters matrix, initialised to zero, that receives the TVDs
tvds        = pd.DataFrame(np.zeros([nclusters, nclusters]))
# Row sums of the chunk-count matrix, in the row order of `chunkout`
total_sizes = np.array(chunkout).sum(axis = 1)

In [6]:
# Pairwise Total Variation Distance (TVD) between clusters.
#
# For clusters i < j the distance is
#     0.5 * sum over k not in {i, j} of |p[i, k] - p[j, k]|
# where p[a, k] is the mean, over the individuals (rows) of cluster a, of the
# share of their total chunk count (total_sizes) found in the columns that
# belong to the members of cluster k.
# Results are written to the lower triangle of `tvds` (row j, column i);
# all other entries are left untouched.

# Sample IDs of each cluster; position c holds the samples labelled c + 1.
cluster_ids = [dat['ID'][dat['cluster_initial'] == c + 1] for c in range(nclusters)]

# copy_props[a, k] = p[a, k]. Each value is computed once per ordered pair of
# clusters instead of once per (i, j, k) triple. The diagonal is never used by
# the TVD below, so it is skipped and stays at zero.
# NOTE(review): total_sizes is indexed with the row positions of `dat`, which
# assumes `dat` and `chunkout` list the samples in the same order -- confirm.
copy_props = np.zeros([nclusters, nclusters])
for a, ids_rows in enumerate(cluster_ids):
    row_totals = total_sizes[ids_rows.index]
    for k, ids_cols in enumerate(cluster_ids):
        if k == a:
            continue
        copied = np.array(chunkout.loc[list(ids_rows), list(ids_cols)]).sum(axis = 1)
        copy_props[a, k] = (copied / row_totals).mean()

for i in range(nclusters):
    for j in range(i + 1, nclusters):
        # Every cluster except the two being compared. Deriving this from
        # nclusters removes the previous fixed-size buffer, which overflowed
        # as soon as there were more than 11 clusters.
        others = [k for k in range(nclusters) if k != i and k != j]
        abs_diff = np.abs(copy_props[i, others] - copy_props[j, others])
        tvds.iloc[j, i] = abs_diff.sum() * 0.5
        


In [7]:
# Display the matrix: only the lower triangle (row > column) is filled with TVDs;
# the diagonal and upper triangle keep their initial zeros.
tvds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.07877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.05596,0.040374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.199845,0.161213,0.180239,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.213002,0.180899,0.200435,0.105739,0.0,0.0,0.0,0.0,0.0,0.0
5,0.195847,0.161835,0.183389,0.049365,0.062298,0.0,0.0,0.0,0.0,0.0
6,0.168335,0.132381,0.150083,0.067264,0.076548,0.056871,0.0,0.0,0.0,0.0
7,0.07167,0.108363,0.097827,0.03876,0.076131,0.03893,0.058547,0.0,0.0,0.0
8,0.170857,0.160463,0.162862,0.050456,0.097331,0.044785,0.088234,0.003078,0.0,0.0
9,0.121941,0.128979,0.107287,0.125349,0.062516,0.097444,0.083888,0.043941,0.111313,0.0


In [8]:
# Write the TVD matrix to the results folder as a bare numeric table (no row
# labels, no header). A full path is used instead of os.chdir() so the working
# directory is not changed as a side effect of saving.
tvds.to_csv(os.path.join(pathres, 'tvds.csv'), index=False, header=False)