# Imports

In [1]:
# Std lib
from collections import namedtuple, Counter, defaultdict
from itertools import islice
import random
import glob
from pprint import pprint as pp

# Third party
import numpy as np
import pandas as pd
from matplotlib import pyplot as pl
from pycl.pycl import *
from pybedtools import BedTool
from pyfaidx import Fasta, Faidx

# Matplotlib and pandas setup
%matplotlib inline
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200

# Dev whitelist function

In [76]:
# Preview the first lines of a collapsed eventalign index file to check its column layout.
# NOTE(review): `head` is not defined in this notebook; presumably it comes from the
# `from pycl.pycl import *` star import - confirm.
head ("/home/aleg/Analyses/Nanopore_yeast/nanopolish/nanopolish_read_raw_collapsed.tsv.idx")

ref_id  ref_start ref_end read_id                              kmers NNNNN_kmers mismatching_kmers missing_kmers offset 
YHR055C 0         165     7ef1d7b9-5824-4382-b23b-78d82c07ebbd 155   6           0                 10            0      
YHR055C 1         182     68392b1f-4591-4917-8d56-53f3a8003cd6 178   3           0                 3             10478  
YHR055C 0         182     be088f2c-0c1a-434c-bd32-a8a516da476a 165   10          0                 17            22534  
YHR055C 3         171     eba44c90-5209-4216-8b75-67b7abd0a25f 160   6           0                 8             33722  
YHR055C 0         182     cf54e40f-af41-4450-a32c-584d8bd3204d 165   10          0                 17            44557  
YOL138C 3609      4019    e0701bb5-012d-4f1e-8043-8a12dcaa49ab 390   13          0                 20            55794  
YPR165W 293       626     ca16553b-fbd6-4539-8a4d-9826340eb380 306   16          0                 27            82643  
YOL138C 3131      4015    609b08

In [66]:
def whitelist (
    s1_fn,
    s2_fn,
    fasta_index_fn = None,
    min_cov = 10,
    max_NNNNN_kmers_freq = 0.2,
    max_mismatching_kmers_freq = 0.2,
    max_missing_kmers_freq = 0.2,
    verbose=True):
    """
    Build a whitelist of reads per reference interval sufficiently covered in two samples.

    * s1_fn / s2_fn
        Paths to the eventalign index files of sample 1 and sample 2
    * fasta_index_fn
        Path to the reference index file. It is handed as-is to _read_fasta_index, which
        opens it, so a real path has to be given despite the None default
    * min_cov
        Minimal number of reads per sample, used both to keep a reference and to delimit
        the valid intervals along it
    * max_NNNNN_kmers_freq / max_mismatching_kmers_freq / max_missing_kmers_freq
        Per read thresholds forwarded to _read_eventalign_index
    * verbose
        If True, progress messages are printed

    Returns an OrderedDict with one entry per retained reference id, the value being the
    mapping returned by _intersect_reads_interval for that reference.
    """
    def _log (message):
        # Progress messages are only emitted in verbose mode
        if verbose:
            print (message)

    # Reference id -> reference length
    _log ("Read fasta index")
    ref_lengths = _read_fasta_index (fn=fasta_index_fn)
    _log (f"\tTotal references: {len(ref_lengths)}")

    # Reads of both samples grouped by reference, after the per read kmer filters
    _log ("Read eventalign index files")
    reads_by_ref = _read_eventalign_index (s1_fn, s2_fn, max_NNNNN_kmers_freq, max_mismatching_kmers_freq, max_missing_kmers_freq)
    _log (f"\tTotal references found {len(reads_by_ref)}")

    # Drop the references lacking enough reads in either sample
    _log ("Filter out references with low coverage")
    reads_by_ref = _select_ref (ref_reads=reads_by_ref, min_cov=min_cov)
    _log (f"\tTranscripts remaining after reference coverage filtering: {len(reads_by_ref)}")

    _log ("Compute coverage per reference and select intervals with high enough coverage")
    whitelisted = OrderedDict ()
    for ref_id, sample_reads in reads_by_ref.items ():
        # Per position coverage of each sample along the reference
        coverage = _compute_ref_cov (sample_reads=sample_reads, ref_len=ref_lengths[ref_id])
        # Coordinates of the stretches where both samples reach min_cov
        intervals = _get_valid_intervals (coverage, min_cov)
        # Reads of each sample overlapping each of these stretches
        whitelisted [ref_id] = _intersect_reads_interval (intervals, sample_reads)

    return whitelisted
        
def _read_fasta_index (fn):
    ref_len = OrderedDict ()
    with open (fn) as fp:
        for line in fp:
            ls = line.rstrip().split()
            ref_len[ls[0]] = int(ls[1])
    return ref_len

def _read_eventalign_index (s1_fn, s2_fn, max_NNNNN_kmers_freq, max_mismatching_kmers_freq, max_missing_kmers_freq):
    ref_reads = OrderedDict ()
    
    for lab, fn in ("S1", s1_fn), ("S2", s2_fn):
        with open (fn) as fp:
            # get field names from header
            header = fp.readline().rstrip().split()
            line_tuple = namedtuple("line_tuple", header)
            c = Counter ()
            for line in fp:
                ls = line.rstrip().split()
                lt = line_tuple (ls[0], int(ls[1]), int(ls[2]), ls[3], int(ls[4]), int(ls[5]) , int(ls[6]) , int(ls[7]) , int(ls[8]))
                # filter out reads with high number of problematic kmers
                if max_NNNNN_kmers_freq and lt.NNNNN_kmers/lt.kmers > max_NNNNN_kmers_freq:
                    c ["high NNNNN_kmers reads"] += 1
                elif max_mismatching_kmers_freq and lt.mismatching_kmers/lt.kmers > max_mismatching_kmers_freq:
                    c ["high mismatching_kmers reads"] += 1
                elif max_missing_kmers_freq and lt.missing_kmers/lt.kmers > max_missing_kmers_freq:
                    c ["high missing_kmers reads"] += 1
                # Save valid reads
                else:
                    if not lt.ref_id in ref_reads:
                        ref_reads[lt.ref_id] = OrderedDict ()
                    if not lab in ref_reads [lt.ref_id]:
                        ref_reads[lt.ref_id][lab] = []
                    ref_reads[lt.ref_id][lab].append (lt)
                    c ["valid reads"] += 1
        print (c)
    return ref_reads

def _select_ref (ref_reads, min_cov):
    invalid_ref = []
    for ref_id, sample_reads in ref_reads.items ():
        if len(sample_reads) < 2 or len (sample_reads["S1"]) < min_cov or len (sample_reads["S2"]) < min_cov:
            invalid_ref.append (ref_id)
    for ref_id in invalid_ref:
        del ref_reads [ref_id]
    return ref_reads

def _compute_ref_cov (sample_reads, ref_len):
    cov_array = np.zeros ((2, ref_len))
    for read in sample_reads["S1"]:
        cov_array [0][np.arange(read.ref_start, read.ref_end)] += 1
    for read in sample_reads["S2"]:
        cov_array [1][np.arange(read.ref_start, read.ref_end)] += 1
    return cov_array
        
def _get_valid_intervals (cov_array, min_cov):
    valid_cov = False
    valid_interval_list = []
    for pos, (cov1, cov2) in enumerate (cov_array.T):
        # If coverage insuficient
        if cov1 < min_cov or cov2 < min_cov:
            if valid_cov:
                valid_interval_list.append ((ref_start, ref_end))
            valid_cov = False
        # If the coverage is high enough for both samples
        else:
            if valid_cov:
                ref_end = pos
            else:
                ref_start = ref_end = pos
                valid_cov = True
    # Last valid interval exception
    if valid_cov:
        valid_interval_list.append ((ref_start, ref_end))
    
    return valid_interval_list

def _intersect_reads_interval (valid_interval_list, sample_reads):
    
    ref_interval_reads = OrderedDict ()
    for interval_start, interval_end in valid_interval_list:
        ref_interval_reads [(interval_start, interval_end)] = {"S1":[], "S2":[]}
    
    for sample_id, read_list in sample_reads.items():
        for read in read_list:
            for interval_start, interval_end in valid_interval_list: 
                if read.ref_end >= interval_start and read.ref_start <= interval_end:
                    ref_interval_reads[(interval_start, interval_end)][sample_id].append (read)
    return ref_interval_reads

In [75]:
# Eventalign index of each sample (KO and WT directories) and index of the reference transcriptome
sample1_fn = "/home/aleg/Analyses/RNA_Yeast_TRM5/eventalign/KO/eventalign_collapsed.tsv.idx"
sample2_fn = "/home/aleg/Analyses/RNA_Yeast_TRM5/eventalign/WT/eventalign_collapsed.tsv.idx"
fasta_index_fn = "/home/aleg/Analyses/Nanopore_yeast/references/SC_R64-1-1_transcripts.fa.fai"

# Run the whitelist with explicit coverage and kmer frequency thresholds
w = whitelist (
    sample1_fn,
    sample2_fn,
    fasta_index_fn,
    min_cov = 5,
    max_NNNNN_kmers_freq = 0.1,
    max_mismatching_kmers_freq = 0.1,
    max_missing_kmers_freq = 0.1)

Read fasta index
	Total references: 6713
Read eventalign index files
Counter({'valid reads': 11073, 'high missing_kmers reads': 927, 'high mismatching_kmers reads': 344, 'high NNNNN_kmers reads': 84})
Counter({'valid reads': 10802, 'high missing_kmers reads': 702, 'high mismatching_kmers reads': 384, 'high NNNNN_kmers reads': 84})
	Total references found 2126
Filter out references with low coverage
	Transcripts remaining after reference coverage filtering: 172
Compute coverage per reference and select intervals with high enough coverage


In [2]:
from nanocompore.whitelist import whitelist

In [3]:
# Eventalign index of each sample (KO and WT directories) and index of the reference transcriptome
sample1_fn = "/home/aleg/Analyses/RNA_Yeast_TRM5/eventalign/KO/eventalign_collapsed.tsv.idx"
sample2_fn = "/home/aleg/Analyses/RNA_Yeast_TRM5/eventalign/WT/eventalign_collapsed.tsv.idx"
fasta_index_fn = "/home/aleg/Analyses/Nanopore_yeast/references/SC_R64-1-1_transcripts.fa.fai"

# Same call as before, this time with the whitelist function imported from the nanocompore package
w = whitelist (
    sample1_fn,
    sample2_fn,
    fasta_index_fn,
    min_cov = 5,
    max_NNNNN_kmers_freq = 0.1,
    max_mismatching_kmers_freq = 0.1,
    max_missing_kmers_freq = 0.1)

Read fasta index
	Total references: 6713
Read eventalign index files
Counter({'valid reads': 11073, 'high missing_kmers reads': 927, 'high mismatching_kmers reads': 344, 'high NNNNN_kmers reads': 84})
Counter({'valid reads': 10802, 'high missing_kmers reads': 702, 'high mismatching_kmers reads': 384, 'high NNNNN_kmers reads': 84})
	Total references found 2126
Filter out references with low coverage
	Transcripts remaining after reference coverage filtering: 172
Compute coverage per reference and select intervals with high enough coverage


In [5]:
# Preview the whitelist content for the first 5 references only.
# Nesting of `w`: reference id -> interval -> sample id -> reads.
# NOTE(review): `jprint` is not defined in this notebook; presumably it comes from the
# `from pycl.pycl import *` star import, and `display` from the IPython session - confirm.
for ref_id, interval_dict in islice(w.items(), 5):
    # Reference id as the largest heading
    jprint (ref_id, bold=True, size=150)
    for interval, sample_dict in interval_dict.items():
        # Interval key, then one table of reads per sample
        jprint (interval, bold=True, size=125)
        for sample_id, reads in sample_dict.items():
            jprint (sample_id, bold=True, size=100)
            # Temporarily cap the number of displayed rows so each table stays short
            with pd.option_context("display.max_rows",4):
                display (pd.DataFrame(reads))

Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YEL009C,0,733,d45afb2d-e765-4d8d-a6b2-026ea448d453,706,21,0,28,423128
1,YEL009C,83,842,6dcb7583-307f-4769-9a2d-1aab718a37e9,728,24,0,31,470265
...,...,...,...,...,...,...,...,...,...
7,YEL009C,0,839,8bea53b3-841e-4415-aa00-a4f159c063a5,810,21,0,29,619529
8,YEL009C,573,818,2dd8f547-84e1-4d24-a3d9-0e3c559e234d,229,11,0,16,674274


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YEL009C,0,842,01339779-05cb-43f4-9b83-b171eba77e51,807,26,0,35,1088811
1,YEL009C,0,842,3f1ba465-4894-4f91-bcda-c2b1c52bd4df,810,25,0,32,1296498
...,...,...,...,...,...,...,...,...,...
7,YEL009C,324,842,7ea08d9f-b50b-459c-9d88-21061864562b,492,11,0,26,1600108
8,YEL009C,368,842,6336d55c-6ff3-4ac4-b01f-2bffabeedb9a,453,15,0,21,1643927


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YKL096W-A,0,275,72a55fb1-0863-45ab-94db-48f0e01af82c,253,11,0,22,2839303
1,YKL096W-A,0,275,eb212de4-9520-428f-b9de-95845442ff99,267,5,0,10,2883565
...,...,...,...,...,...,...,...,...,...
15,YKL096W-A,14,275,ca642bf1-b2a9-4358-94b6-4b498cdf7188,247,6,0,14,3144535
16,YKL096W-A,63,275,24bacd39-e0f3-409d-96cb-88eec11ebcd3,201,7,0,11,3161260


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YKL096W-A,0,275,44080512-cd67-4e10-998c-1a7e15f453cd,263,10,0,12,8064646
1,YKL096W-A,1,257,a4896f87-d3b2-404d-8087-11a97f3c3d47,243,10,0,13,8194848
...,...,...,...,...,...,...,...,...,...
35,YKL096W-A,49,270,75c8b844-5901-4349-84bb-6c83cb640e48,210,7,0,11,8906839
36,YKL096W-A,73,274,08bcfb5b-5fce-48ba-924e-50348b8599cd,199,4,0,2,8921079


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YML063W,158,758,050c36f2-6c2f-41be-8927-d6e528c0cf87,554,22,0,46,3203360
1,YML063W,338,762,e368b7e4-f5b2-4ad9-a226-7f4d483435d8,412,9,0,12,3274550
...,...,...,...,...,...,...,...,...,...
4,YML063W,383,764,1328dc58-48ea-403e-b3cf-a5d5887dea4a,364,9,0,17,3404279
5,YML063W,417,760,c0ada0cb-f188-4772-827c-a32ac3dd544d,315,17,0,28,3428985


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YML063W,1,764,3ad89652-2046-4bb8-a694-63a7fee3840f,733,21,0,30,8974082
1,YML063W,2,764,4623c9c4-f3f1-4732-bb4c-004b43b9948b,734,16,0,28,9023712
...,...,...,...,...,...,...,...,...,...
18,YML063W,404,760,2347ba66-5b65-4157-a71c-5018cea0e950,335,17,0,21,9927990
19,YML063W,419,764,29940404-aa22-4b0e-b778-2bf91e8cdc2b,337,5,0,8,9950767


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YGR034W,1,380,25778aeb-98b5-46b9-b3c4-5e58c69c324f,361,13,0,18,3701262
1,YGR034W,3,380,b0ad0c9e-b12b-487e-a845-4f835782a437,362,11,0,15,3725701
...,...,...,...,...,...,...,...,...,...
3,YGR034W,89,380,560eabfb-9b09-44cf-959d-3a72aee337ce,273,8,0,18,3787467
4,YGR034W,130,380,74995b4d-527b-44c7-88d8-fd6e62b2e1a8,237,6,0,13,3809840


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YGR034W,0,380,6b2cda5d-1512-4b97-b5f5-aef05fa9735e,368,7,0,12,10581029
1,YGR034W,1,380,29429a6a-1ffc-4d45-944e-e1f10c27b0eb,367,9,0,12,10680972
...,...,...,...,...,...,...,...,...,...
30,YGR034W,137,380,40efcb2c-283b-41d2-987e-959b44d77c34,231,7,0,12,11428687
31,YGR034W,163,377,95e82171-c9ca-4ece-bfc8-b465990a92c5,199,5,0,18,11444424


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YLR029C,0,470,0804b07f-1c31-42b6-985a-4a11b9011d20,453,14,0,19,4376531
1,YLR029C,0,610,adda5537-025f-43a0-972a-60ba89dc40f2,578,21,0,32,4438036
...,...,...,...,...,...,...,...,...,...
15,YLR029C,398,608,b86c28e8-9062-4b75-b0ef-cf29d9200289,199,9,0,11,4840576
16,YLR029C,391,611,6acdbc82-ece7-41bc-af06-e4b4fe8a17d4,208,8,0,12,4854175


Unnamed: 0,ref_id,ref_start,ref_end,read_id,kmers,NNNNN_kmers,mismatching_kmers,missing_kmers,offset
0,YLR029C,1,607,10527aad-5376-46f8-8c71-c76a4ff995e7,569,26,0,37,12162574
1,YLR029C,0,608,126197cd-7d50-49ef-8683-364b7c7b1a7f,578,25,0,30,12266590
...,...,...,...,...,...,...,...,...,...
18,YLR029C,347,600,ad57d386-f757-4f72-9699-c842a73b066a,236,12,0,17,12835792
19,YLR029C,345,608,95fe9c76-27cc-4833-8851-f98bb6b1a927,256,7,0,7,12851876
