In [1]:
import numpy as np
import io
import os
import pandas as pd
from scipy.stats import chisquare
from scipy.stats import hypergeom
from scipy.stats import fisher_exact
import bdsg
import copy
import re 
from collections import Counter

# Tests

In [2]:
### test chi deux ###
f_exp = np.array([44, 24, 29, 3]) / 100 * 189
f_obs = np.array([43, 52, 54, 40])
print("chi deux : ", chisquare(f_obs=f_obs, f_exp=f_exp))

chi deux :  Power_divergenceResult(statistic=228.23515947653874, pvalue=3.3295585338846486e-49)


In [3]:
### test exact fisher ###
table = np.array([[6, 2], [1, 4]])
M = table.sum()
n = table[0].sum()
N = table[:, 0].sum()
start, end = hypergeom.support(M, n, N)
hypergeom.pmf(np.arange(start, end + 1), M, n, N)
np.array([0.01631702, 0.16317016, 0.40792541, 0.32634033, 0.08158508,
       0.004662])
res = fisher_exact(table, alternative='two-sided')
print("fisher : ", res.pvalue)

fisher :  0.10256410256410256


# Read VCF

In [4]:
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})


vcf_df = read_vcf("/home/yboulkaid/Documents/sample_data/pgtest.data/calls/samp_g0_0.vcf")
vcf_df.head(n = 20)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,ref,450,>1>4,T,G,133.927,PASS,"AT=>1>2>4,>1>3>4;DP=46",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:46:46,0:-2.378747,-15.294357,-107.194421:1..."
1,ref,578,>4>9,TTTGCGGGCCGTCAAGATGAACTGGTGCCTGTAGGATTATGTCCTC...,TTTGCGGGCCGTCAAGATGAACTGGTGCCTGTAGGATTATGTCCTC...,14.6171,PASS,"AT=>4>5>7>8>9,>4>5>6>8>9;DP=50",GT:DP:AD:GL:GQ:GP:XD:MAD,"1/0:50:25,25:-3.306431,-2.422690,-3.306431:16:..."
2,ref,1118,>9>12,A,G,177.783,PASS,"AT=>9>11>12,>9>10>12;DP=61",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:61:61,0:-2.701171,-20.002352,-141.869830:1..."
3,ref,1340,>12>14,GATCGGACTTCTTAACGGGTTCCTCACGTAGCGATCTCTACGGGAA...,G,116.981,PASS,"AT=>12>13>14,>12>14;DP=47",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:47:46,1:-2.770745,-13.991746,-90.540504:11..."
4,ref,1529,>14>19,C,CCCGATCTATGTCGAGGCTTTCGGGCAGGGCCGCTATTAACATCGT...,175.5,PASS,"AT=>14>19,>14>15>17>18>19,>14>15>16>18>19;DP=60",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:60:60,0,0:-2.835631,-19.908524,-19.908524,..."
5,ref,1711,>19>22,T,C,479.863,PASS,"AT=>19>21>22,>19>20>22;DP=51",GT:DP:AD:GL:GQ:GP:XD:MAD,"1/0:51:24,27:-49.865550,-2.356254,-42.742492:2..."
6,ref,1847,>22>25,G,C,319.771,PASS,"AT=>22>24>25,>22>23>25;DP=42",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/1:42:23,19:-34.186354,-2.686342,-43.263918:2..."
7,ref,2008,>25>28,T,A,151.725,PASS,"AT=>25>27>28,>25>26>28;DP=52",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:52:52,0:-2.331929,-17.027372,-120.914402:1..."
8,ref,2104,>28>31,C,G,1167.51,PASS,"AT=>28>30>31,>28>29>31;DP=51",GT:DP:AD:GL:GQ:GP:XD:MAD,"1/1:51:0,51:-118.606526,-16.717323,-2.331929:1..."
9,ref,2381,>31>34,G,T,137.098,PASS,"AT=>31>32>34,>31>33>34;DP=47",GT:DP:AD:GL:GQ:GP:XD:MAD,"0/0:47:47,0:-2.328292,-15.561016,-109.458909:1..."


In [5]:
# on va essayer de faire un giga tableau qui recense tous les chemins possibles OK (pour un éch mais normalement c'est les mêmes chemins possibles)
# une fois qu'on aura fait ce giga tableau, on va essayer de le transformer en giga matrice qui compte pour 1 éch les chemins pris
# une fois que ce sera fait on pourra transformer la giga matrice en giga table de contingence (?) qui recense la même chose mais pour tous les échantillons

In [6]:
# test simple pour voir comment ça marche la regex que je veux faire
# ok ça marche c cool

text = ' AT=>4>5>7>8>9,>4>9,>4>5>6>8>9;DP=57'

m = re.search('AT=>(.+?);DP', text)
if m:
    found = m.group(1)
    
print(found)


4>5>7>8>9,>4>9,>4>5>6>8>9


In [7]:
chemins_possibles = []
for i in range(len(vcf_df["INFO"])):
    text = vcf_df["INFO"][i]
    m = re.search('AT=>(.+?);DP', text)
    if m:
        found = m.group(1)
    chemins_possibles.append(found)
    
for i in range(len(chemins_possibles)):
    chemins_possibles[i] = chemins_possibles[i].split(',')
    
########################################################################
chemins_pris = []
for i in vcf_df["SAMPLE"]:
    found = i[0:3]
    chemins_pris.append(found)

for i in range(len(chemins_pris)):
    chemins_pris[i] = chemins_pris[i].split('/')
    for j in range(len(chemins_pris[i])):
        chemins_pris[i][j] = int(chemins_pris[i][j])
        
########################################################################
chemins_combines = copy.deepcopy(chemins_pris)

for i in range(len(chemins_pris)):
    chemins_combines[i][0] = chemins_possibles[i][chemins_pris[i][0]]
    chemins_combines[i][1] = chemins_possibles[i][chemins_pris[i][1]]
    
''' cas simple pour comprendre
print(chemins_pris[0])
chemins_pris[0][0] = chemins_possibles[0][chemins_pris[0][0]]
chemins_pris[0][1] = chemins_possibles[0][chemins_pris[0][1]]
print(chemins_pris[0])'''

chemins_possibles = sum(chemins_possibles, [])
chemins_pris = sum(chemins_pris, [])
chemins_combines = sum(chemins_combines, [])

In [8]:
# verification 

print('========== possibles ==========')
print(chemins_possibles[0:5])
print('============ pris =============')
print(chemins_pris[0:5])
print('========== combinés ===========')
print(chemins_combines[0:5])

['1>2>4', '>1>3>4', '4>5>7>8>9', '>4>5>6>8>9', '9>11>12']
[0, 0, 1, 0, 0]
['1>2>4', '1>2>4', '>4>5>6>8>9', '4>5>7>8>9', '9>11>12']


In [9]:
# on refait la même chose mais pour tous les VCF (?) 
# est-ce qu'on veut autant de *listes* 'chemins combinés' que de VCF ? Ou bien est-ce qu'on veut une sorte de df avec autant de *lignes* que de VCF ?

In [11]:
snarl_df = pd.DataFrame(columns=['snarl index', 'snarl', 'times taken'])
#snarl_df['snarl index'] = vcf_df['ID']
snarl_df['snarl'] = chemins_possibles 
#chemins_combines = sum(chemins_combines, []) 
snarl_df['times taken'] = 0 

snarl_df.head()

Unnamed: 0,snarl index,snarl,times taken
0,,1>2>4,0
1,,>1>3>4,0
2,,4>5>7>8>9,0
3,,>4>5>6>8>9,0
4,,9>11>12,0


In [25]:
combine_count = list((x,chemins_combines.count(x)) for x in set(chemins_combines))

for i in range(len(combine_count)):
    for j in range(len(chemins_pris)):
        if combine_count[i][0] == str(snarl_df['snarl'][j]):
            #snarl_df['times taken'][j] = combine_count[i][1]
            snarl_df.loc[j, "times taken"] = combine_count[i][1]
            #count_good += 1
        
snarl_df.head()

Unnamed: 0,snarl index,snarl,times taken
0,,1>2>4,2
1,,>1>3>4,0
2,,4>5>7>8>9,1
3,,>4>5>6>8>9,1
4,,9>11>12,2


In [None]:
# et mtn qu'on a un tout beau data frame on peut faire tout pareil mais en parsant tous les VCF :D 

In [27]:
all_vcf = os.listdir("/home/yboulkaid/Documents/sample_data/pgtest.data/calls/")
for i in all_vcf:
    read_vcf(i)


Files and directories in ' /home/yboulkaid/Documents/sample_data/pgtest.data/calls/ ' :
60


 # 'Play' with bdsg
##### am i playing with bdsg or is it playing with me

In [None]:
from bdsg.bdsg import HashGraph

gr = HashGraph()
seq = ["CGA", "TTGG", "CCGT", "C", "GT", "GATAA", "CGG", "ACA", "GCCG", "ATATAAC"]
n = []
for s in seq:
    n.append(gr.create_handle(s))

gr.create_edge(n[0], n[1])
gr.create_edge(n[1], n[2])
gr.create_edge(n[2], n[3])
gr.create_edge(n[2], n[4])
gr.create_edge(n[3], n[5])
gr.create_edge(n[5], n[6])
# Connect the end of n5 to the start of n8
gr.create_edge(n[5], n[8])
gr.create_edge(n[6], n[7])
gr.create_edge(n[6], n[8])
gr.create_edge(n[7], n[9])
gr.create_edge(n[8], n[9])
# Connect the end of n8 back around to the start of n5
gr.create_edge(n[8], n[5])

def next_node_list(handle):
    lis = []
    gr.follow_edges(handle, False, lambda y: lis.append(y))
    return lis

print(f'n0: {gr.get_sequence(n[0])}')
next_node = next_node_list(n[0])[0]
print(f'n1: {gr.get_sequence(next_node)}')
next_node = next_node_list(next_node)[0]
print(f'n2: {gr.get_sequence(next_node)}')

path = gr.create_path_handle("path")
gr.append_step(path, n[0])
gr.append_step(path, n[1])
gr.append_step(path, n[2])
gr.append_step(path, n[4])
gr.append_step(path, n[5])
gr.append_step(path, n[6])
gr.append_step(path, n[7])
gr.append_step(path, n[9])

In [None]:
from bdsg.bdsg import PackedGraph
brca2 = PackedGraph()
brca2.deserialize("/home/yboulkaid/Documents/sample_data/pgtest.data/pg.pg")
#brca2.deserialize("/home/yboulkaid/Téléchargements/cactus-brca2.pg")

path_handle = []
handles = []
brca2.for_each_path_handle(lambda y: path_handle.append(y) or True)
brca2.for_each_step_in_path(path_handle[0],
    lambda y: handles.append(brca2.get_handle_of_step(y)) or True)
sequence = ""
for handle in handles:
    sequence += brca2.get_sequence(handle)
print(sequence[0:10])
print(len(sequence))


In [None]:
type(gr)

# CACGTCCGAGAATCGG
# CACGTCCGAG

In [None]:
from bdsg.handlegraph import  HandleGraph
###########################################
### HOW TO ACCESS THE SNARL TREE (XIAN) ###
###########################################

#Load the distance index
distance_index = bdsg.bdsg.SnarlDistanceIndex()
distance_index.deserialize("/home/yboulkaid/Documents/sample_data/pgtest.data/pg.dist")

# net graph to build
ngraph = {'e': [], 'n': []}

# init with the child (only one ideally) of the root
root = distance_index.get_root() #Get a net handle referring to a tip-to-tip traversal of the contents of the root snarl.

def add_node_to_ngraph(net):
    ngraph['n'].append(net)
distance_index.for_each_child(root, add_node_to_ngraph)

distance_index.net_handle_as_string(ngraph['n'][0])

node3 = distance_index.get_node_net_handle(3)
parent3 = distance_index.get_parent(node3)
snarl_start_bound = distance_index.get_bound(parent3, False, True)

# for functions taking an iteratee
# print the net_handle_t and return true to continue iterating
def iterate (net) :
    print(distance_index.net_handle_as_string(net))
    return True

graph = HandleGraph()
#graph.deserialize("/home/yboulkaid/Documents/sample_data/pgtest.data/pg.pg")

print("Follow edges from", distance_index.net_handle_as_string(snarl_start_bound), ":" )
distance_index.follow_net_edges(snarl_start_bound, graph, False, iterate)


# graph: bdsg.handlegraph.HandleGraph

In [None]:
print(ngraph)
print(distance_index.net_handle_as_string(ngraph['n'][0]))

In [None]:
ngraph['n']

In [None]:
distance_index.deserialize("/home/yboulkaid/Documents/sample_data/pgtest.data/pg.dist")

In [None]:
'''get parent 
get child 
get bound 
get root'''