# Getting Chr5 novel node statistics

This is more or less following the hpgp-basics doc, but through python rather than R.

First, we need to import stuff:

In [7]:
import os
import subprocess
import pandas as pd
import seaborn as sns

node_dir = "nonref_files/chr5-1k-10-50/"
nodes = os.listdir(node_dir)

sample_data = pd.read_csv("nonref_files/20130606_g1k_3202_samples_ped_population.txt", 
                         delimiter=" ")
sample_data["SampleID"] = sample_data["SampleID"].astype(str)

print(nodes[:3])

['4871018.txt', '3443998.txt', '1330802.txt']


Now we need to get all the path information for each node, and convert it into a dataframe so we can do something useful with it.

In [8]:
pop_counts = {}
node_data = ""
nid = ""

for node in nodes[:]: # only working on the first node for now
    if not node.endswith(".txt") : continue
    nid = node[:node.find(".")]
    paths = open(node_dir + node).read().split('\n')[:-2] # 2 trailing newlines
    
    counts = []
    # converting to a set makes sure homozygotes are only counted once
    for path in sorted(list(set(paths))):
        counts.append((path, paths.count(path)))
    
    # grab population counts
    test = pd.DataFrame(counts, columns=["SampleID", "count"])
    test["SampleID"] = test["SampleID"].astype(str)
    test = test.merge(sample_data[["SampleID", "Population"]], on="SampleID")
    pop_counts[nid] = test["Population"].value_counts()
    
    # we can work on the whole thing later, but let's just grab one node for now.
    node_data = test
     



We can now use `node_data` to get information about the populations in our node.

In [9]:
print(f"Counts for {nid}:")
print(node_data["Population"].value_counts())

Counts for 4971695:
Population
GWD    5
PEL    3
PUR    2
MSL    2
CHS    1
ESN    1
ASW    1
Name: count, dtype: int64


We can see that most of the paths represented are part of the GWD, PUR, and MSL populations.

We can then grab the path prefixes 

In [10]:
populations = pd.read_csv("data/20131219.populations.tsv", delimiter="\t")
print(populations[populations["Population Code"] == "GWD"]["Population Description"])



print(node_data[node_data["Population"]=="GWD"])

path_prefixes = node_data[node_data["Population"]=="GWD"]["SampleID"].tolist()

13    Gambian in Western Division, The Gambia
Name: Population Description, dtype: object
   SampleID  count Population
6   HG02572      1        GWD
7   HG02622      1        GWD
8   HG02630      1        GWD
9   HG02717      1        GWD
10  HG02886      2        GWD


If we want to, we can then extract just the node (and the ones surrounding for convenience/if we want to use bandage/etc to visualise later). In theory we could then use this to visually determine if there was any variation within this node (?).

In [11]:
node_og = f"nonref_files/chr5-1k-10-50/node_{nid}.og"

subprocess.run(["./bin/odgi", "extract", "-i", "chr5.full.og", "-n", nid,
                "-o", node_og, "-t", "4"]);

In [12]:
paths = subprocess.run(["./bin/odgi", "paths", "-i", node_og,
                       "-L"], capture_output=True).stdout.decode().split()
        
paths = [p for p in paths if p[:p.find("#")] in path_prefixes]
print(paths)

f = open("_paths.txt", "w") #remove later

for p in paths:
    f.write(p + "\n")

f.close()

['HG02572#1#JAHAOW010000129.1#0:72387-73390', 'HG02622#2#JAHAON010000014.1#0:104211456-104212459', 'HG02630#1#JAHAOQ010000041.1#0:14798010-14799013', 'HG02717#1#JAHAOS010000011.1#0:27717791-27718794', 'HG02886#1#JAHAOU010000019.1#0:473261-474264', 'HG02886#2#JAHAOT010000128.1#0:11943547-11944550']


## Looking at Populations

Let's have a look at populations - by doing some data transformation, we can see how many paths of each population are in each node.

Let's also get an overview of the population codes.

In [13]:
populations[["Population Description", "Population Code", "Super Population"]].dropna()

Unnamed: 0,Population Description,Population Code,Super Population
0,"Chinese Dai in Xishuangbanna, China",CDX,EAS
1,"Han Chinese in Bejing, China",CHB,EAS
2,"Japanese in Tokyo, Japan",JPT,EAS
3,"Kinh in Ho Chi Minh City, Vietnam",KHV,EAS
4,"Southern Han Chinese, China",CHS,EAS
5,Bengali in Bangladesh,BEB,SAS
6,"Gujarati Indian in Houston,TX",GIH,SAS
7,Indian Telugu in the UK,ITU,SAS
8,"Punjabi in Lahore,Pakistan",PJL,SAS
9,Sri Lankan Tamil in the UK,STU,SAS


In [14]:
# some completely unnecessary data transformation - may be interesting later...    
node_pop_df = pd.DataFrame.from_dict(pop_counts).reset_index()
node_pop_df = pd.pivot_table(node_pop_df, columns="Population").reset_index(names="node_id").fillna(0)

node_pop_df.head()

Population,node_id,ACB,ASW,CHS,CLM,ESN,GWD,KHV,MSL,PEL,PJL,PUR,YRI
0,1030533,2.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,2.0,1.0
1,1030773,2.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,2.0,1.0
2,1031072,2.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,3.0,1.0
3,1330802,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0
4,1330842,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0


The natural next step for this is to perform the same operation across every chromosome :)

## (In theory, Leading to the admixture stuff)

We can also sort by paths of a given population:

In [15]:
asw_samples = node_pop_df[node_pop_df["ASW"]>0] # filter for only nodes present in the ASW sample

# there's no point in sorting in this specific case as there's only 1 ASW sample, but keeping for reference.
asw_samples.sort_values(by=["ASW"], ascending=False).head()

Population,node_id,ACB,ASW,CHS,CLM,ESN,GWD,KHV,MSL,PEL,PJL,PUR,YRI
0,1030533,2.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,2.0,1.0
1,1030773,2.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,2.0,1.0
55,4960359,0.0,1.0,1.0,0.0,1.0,5.0,0.0,2.0,3.0,0.0,2.0,0.0
56,4960603,0.0,1.0,1.0,0.0,1.0,5.0,0.0,2.0,3.0,0.0,2.0,0.0
57,4971420,0.0,1.0,1.0,0.0,1.0,5.0,0.0,2.0,3.0,0.0,2.0,0.0


Sadly, there's only 1 ASW haplotype in the current pangenome. To get more, we'll need to use `vg` to map alignments to these nodes.

The first step for this is to get a list of all the samples for the population we're interested in:

In [17]:
asw_samples = sample_data[sample_data["Population"]=="ASW"]

asw_samples

Unnamed: 0,FamilyID,SampleID,FatherID,MotherID,Sex,Population,Superpopulation
2821,2357,NA19625,0,0,2,ASW,AFR
2855,2367,NA19700,0,0,1,ASW,AFR
2856,2367,NA19701,0,0,2,ASW,AFR
2857,2367,NA19702,NA19700,NA19701,1,ASW,AFR
2858,2368,NA19703,0,0,1,ASW,AFR
...,...,...,...,...,...,...,...
2987,2494,NA20357,0,0,2,ASW,AFR
2988,2494,NA20358,NA20356,NA20357,1,ASW,AFR
2989,2495,NA20359,0,0,2,ASW,AFR
2990,2495a,NA20362,0,0,1,ASW,AFR
