In [1]:
import os
import importlib
from timeit import timeit
import logging
import sys
import importlib
import time
import multiprocessing as mp
import multiprocessing.pool

import numpy as np
import pandas as pd
import scipy
import h5py
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numba
import sklearn.linear_model

import ms_utils
import browser
import interface
import ms_database
import ms_run_files

import sandbox

import line_profiler
profile = line_profiler.LineProfiler()
# heat.evolve = profile(heat.evolve)
# profile.print_stats()

def reload():
    importlib.reload(ms_run_files)
    importlib.reload(ms_utils)
    importlib.reload(browser)
    importlib.reload(interface)
    importlib.reload(sandbox)
    importlib.reload(ms_database)
   

In [27]:
def get_candidate_peptide_indices_for_nodes(
    inet,
    database,
    parameters
):
    # TODO: Docstring
    ms_utils.LOGGER.info(
        f"Writing node candidates to {inet.file_name}"
    )
    max_ppm = parameters["annotation_ppm"]
    self_mzs = inet.get_ion_coordinates("FRAGMENT_MZ")
    mz_order = np.argsort(self_mzs)
    database_mzs = database.get_fragment_coordinates("mz")
    mz_transform = np.log(self_mzs[mz_order]) * 10**6
    low_limits = np.searchsorted(
        np.log(database_mzs) * 10**6,
        mz_transform - max_ppm,
        "left"
    )
    high_limits = np.searchsorted(
        np.log(database_mzs) * 10**6,
        mz_transform + max_ppm,
        "right"
    )
    inv_order = np.argsort(mz_order)
    return low_limits[inv_order], high_limits[inv_order]


@numba.njit(nogil=True, cache=True)
def score_annotations(
    candidates,
    edge_contributions,
    indptr,
    indices,
    peptide_pointers,
    low_peptide_indices,
    high_peptide_indices,
    peptide_count,
):
    annotated_ions = np.empty(candidates.shape[0], np.int64)
    annotated_peptides = np.empty(candidates.shape[0], np.int64)
    annotated_scores = np.empty(candidates.shape[0], np.float64)
    current_annotation_index = 0
    for index in np.flatnonzero(candidates):
        local_edge_contributions = edge_contributions[indptr[index]: indptr[index + 1]]
        good_edges = np.flatnonzero(local_edge_contributions > 0)
        if good_edges.shape[0] == 0:
            continue
        neighbors = indices[indptr[index]: indptr[index + 1]][good_edges]
        local_edge_contributions = local_edge_contributions[good_edges]
        l = low_peptide_indices[index]
        h = high_peptide_indices[index]
        candidate_peptides = peptide_pointers[l:h]
        candidate_peptide_scores = np.zeros(peptide_count, np.int64)
        for edge_contribution, neighbor_index in zip(local_edge_contributions, neighbors):
            l = low_peptide_indices[neighbor_index]
            h = high_peptide_indices[neighbor_index]
            if l == h:
                continue
            neighbor_peptides = peptide_pointers[l:h]
            candidate_peptide_scores[neighbor_peptides] += edge_contribution
        candidate_peptide_scores = candidate_peptide_scores[candidate_peptides]
        hits = np.flatnonzero(candidate_peptide_scores)
        if hits.shape[0] == 0:
            continue
        candidate_peptides = candidate_peptides[hits]
        candidate_peptide_scores = candidate_peptide_scores[hits]
    #     count_frequency = np.bincount(candidate_peptide_counts)
    #     if count_frequency[-1] != 1:
    #         continue
    #     print(index, candidate_peptides, candidate_peptide_counts)
        max_index = np.argmax(candidate_peptide_scores)
        peptide = candidate_peptides[max_index]
        score = candidate_peptide_scores[max_index]
        annotated_ions[current_annotation_index] = index
        annotated_peptides[current_annotation_index] = peptide
        annotated_scores[current_annotation_index] = score
        current_annotation_index += 1
    annotated_ions = annotated_ions[:current_annotation_index]
    annotated_peptides = annotated_peptides[:current_annotation_index]
    annotated_scores = annotated_scores[:current_annotation_index]
    return annotated_ions, annotated_peptides, annotated_scores

In [28]:
inet = ms_run_files.HDF_Network_File(
    "/home/sander/Documents/Proteomics/data/ecoli_test/28Oct2016_060.inet.hdf"
)
evi = ms_run_files.HDF_Evidence_File(inet)
database = ms_database.HDF_Database_File(
    "/home/sander/Documents/Proteomics/data/databases/crap_ecoli_concatenated_decoy.hdf"
)
parameters = ms_utils.read_parameters_from_json_file(default="annotation")

In [29]:
low_peptide_indices, high_peptide_indices = get_candidate_peptide_indices_for_nodes(
    inet,
    database,
    parameters
)
peptide_pointers = database.get_fragment_coordinates("peptide_index")
indptr, indices, edge_pointers = inet.get_edges(symmetric=True, return_pointers=True)
positive_counts = evi.get_edges()
negative_counts = evi.get_edges(positive=False)
peptide_sequences = database.read_dataset("sequence", "peptides")
decoys = database.read_dataset("decoy", "peptides")

In [19]:
# edge_contributions = positive_counts[edge_pointers] == 9
# edge_contributions = positive_counts[edge_pointers] - negative_counts[edge_pointers]
edge_contributions = positive_counts[edge_pointers] >= 4
np.bincount(edge_contributions + 10)

array([        0,         0,         0,         0,         0,         0,
               0,         0,         0,         0, 103477684,   4368698])

In [20]:
candidates = (high_peptide_indices > low_peptide_indices)
thread_count = 8
with multiprocessing.pool.ThreadPool(thread_count) as p:
    results = p.starmap(
        score_annotations,
        [
            (
                candidates[i::thread_count],
                edge_contributions,
                indptr,
                indices,
                peptide_pointers,
                low_peptide_indices,
                high_peptide_indices,
                peptide_sequences.shape[0],
            ) for i in range(thread_count)
        ]
    )
annotated_ions = np.concatenate([r[0] for r in results])
annotated_peptides = np.concatenate([r[1] for r in results])
annotated_scores = np.concatenate([r[2] for r in results])
annotated_decoys = decoys[annotated_peptides]

In [26]:
for i in range(int(np.max(annotated_scores))):
    print(i, np.bincount(annotated_decoys[annotated_scores > i]))

0 [12658 13531]
1 [ 9700 10558]
2 [6798 7444]
3 [4431 5000]
4 [2931 3422]
5 [1710 2274]
6 [1083 1553]
7 [ 884 1083]
8 [638 842]
9 [371 625]
10 [288 414]
11 [263 280]
12 [212 178]
13 [141  71]
14 [76 42]
15 [55]
16 [46]


In [13]:
max_i = 519

print(np.argmax(annotated_scores))

annotated_ions[max_i], annotated_peptides[max_i], annotated_scores[max_i]

519


(42853, 6659, 10.0)

In [14]:
i = 42853
print(6659 in peptide_pointers[low_peptide_indices[i]: high_peptide_indices[i]])
p = np.flatnonzero(peptide_pointers == 6659)
p

True


array([  20031,  160604,  381583,  706580,  861205, 1184214, 1314618,
       1590766, 1711541, 1820608, 1934910, 2191393, 2296601, 2580344,
       2677683, 2932864, 3065615, 3224210])

In [15]:
# z = np.concatenate(
#     [
#         280718 in peptide_pointers[
#             low_peptide_indices[n]: high_peptide_indices[n]
#         ] for n in indices[indptr[i]: indptr[i+1]]
#     ]
# )
# # np.bincount(z)[p]
# z
l = 0
for n in indices[indptr[i]: indptr[i+1]]:
    l += np.any(np.isin(p, np.arange(low_peptide_indices[n], high_peptide_indices[n])))
#     l += 280718 in peptide_pointers[
#         low_peptide_indices[n]: high_peptide_indices[n]
#     ]
    
l

14

In [22]:
np.bincount(decoys)

array([111428, 111368])