In [1]:
import os
import importlib
from timeit import timeit
import logging
import sys
import importlib
import time
import multiprocessing as mp
import multiprocessing.pool

import numpy as np
import pandas as pd
import scipy
import h5py
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numba
import sklearn.linear_model

import ms_utils
import browser
import interface
import ms_database
import ms_run_files

import sandbox

import line_profiler
profile = line_profiler.LineProfiler()
# heat.evolve = profile(heat.evolve)
# profile.print_stats()

def reload():
    importlib.reload(ms_run_files)
    importlib.reload(ms_utils)
    importlib.reload(browser)
    importlib.reload(interface)
    importlib.reload(sandbox)
    importlib.reload(ms_database)
   

In [2]:
def get_candidate_peptide_indices_for_nodes(
    inet,
    database,
    parameters
):
    # TODO: Docstring
    ms_utils.LOGGER.info(
        f"Writing node candidates to {inet.file_name}"
    )
    max_ppm = parameters["annotation_ppm"]
    self_mzs = inet.get_ion_coordinates("FRAGMENT_MZ")
    mz_order = np.argsort(self_mzs)
    database_mzs = database.get_fragment_coordinates("mz")
    low_limits = np.searchsorted(
        np.log(database_mzs) * 10**6,
        np.log(self_mzs[mz_order]) * 10**6 - max_ppm,
        "left"
    )
    high_limits = np.searchsorted(
        np.log(database_mzs) * 10**6,
        np.log(self_mzs[mz_order]) * 10**6 + max_ppm,
        "right"
    )
    inv_order = np.argsort(mz_order)
    return low_limits[inv_order], high_limits[inv_order]


@numba.njit(nogil=True, cache=True)
def score_annotations(
    candidates,
    edge_contributions,
    indptr,
    indices,
    peptide_pointers,
    low_peptide_indices,
    high_peptide_indices,
    peptide_count,
):
    annotated_ions = np.empty(candidates.shape[0], np.int64)
    annotated_peptides = np.empty(candidates.shape[0], np.int64)
    annotated_scores = np.empty(candidates.shape[0], np.float64)
    current_annotation_index = 0
    for index in np.flatnonzero(candidates):
        local_edge_contributions = edge_contributions[indptr[index]: indptr[index + 1]]
        good_edges = np.flatnonzero(local_edge_contributions > 0)
        if good_edges.shape[0] == 0:
            continue
        neighbors = indices[indptr[index]: indptr[index + 1]][good_edges]
        local_edge_contributions = local_edge_contributions[good_edges]
        l = low_peptide_indices[index]
        h = high_peptide_indices[index]
        candidate_peptides = peptide_pointers[l:h]
        candidate_peptide_scores = np.zeros(peptide_count, np.int64)
        for edge_contribution, neighbor in zip(local_edge_contributions, neighbors):
            l = low_peptide_indices[neighbor]
            h = high_peptide_indices[neighbor]
            if l == h:
                continue
            neighbor_peptides = peptide_pointers[l:h]
            candidate_peptide_scores[neighbor_peptides] += edge_contribution
        candidate_peptide_scores = candidate_peptide_scores[candidate_peptides]
        hits = np.flatnonzero(candidate_peptide_scores)
        if hits.shape[0] == 0:
            continue
        candidate_peptides = candidate_peptides[hits]
        candidate_peptide_scores = candidate_peptide_scores[hits]
    #     count_frequency = np.bincount(candidate_peptide_counts)
    #     if count_frequency[-1] != 1:
    #         continue
    #     print(index, candidate_peptides, candidate_peptide_counts)
        max_index = np.argmax(candidate_peptide_scores)
        peptide = candidate_peptides[max_index]
        score = candidate_peptide_scores[max_index]
        annotated_ions[current_annotation_index] = index
        annotated_peptides[current_annotation_index] = peptide
        annotated_scores[current_annotation_index] = score
        current_annotation_index += 1
    annotated_ions = annotated_ions[:current_annotation_index]
    annotated_peptides = annotated_peptides[:current_annotation_index]
    annotated_scores = annotated_scores[:current_annotation_index]
    return annotated_ions, annotated_peptides, annotated_scores

In [3]:
inet = ms_run_files.Network(
    "/home/sander/Documents/Proteomics/data/ecoli/28Oct2016_060.inet.hdf"
)
evi = ms_run_files.Evidence(inet)
database = ms_database.Database(
    "/home/sander/Documents/Proteomics/data/databases/crap_ecoli_concatenated_decoy.hdf"
)
parameters = ms_utils.read_parameters_from_json_file(default="annotation")

In [4]:
low_peptide_indices, high_peptide_indices = get_candidate_peptide_indices_for_nodes(
    inet,
    database,
    parameters
)
peptide_pointers = database.get_fragment_coordinates("peptide_index")
indptr, indices, edge_pointers = inet.get_edges(symmetric=True, return_pointers=True)
positive_counts = evi.get_edges()
negative_counts = evi.get_edges(positive=False)
edge_contributions = positive_counts[edge_pointers] - negative_counts[edge_pointers]
# edge_contributions = positive_counts == 9
peptide_sequences = database.read_dataset("sequence", "peptides")
protein_pointers = database.read_dataset("proteins", "peptides")
proteins = database.read_dataset("protein", "proteins")
decoys = np.array([protein.startswith("DECOY_") for protein in proteins])

In [5]:
# edge_contributions = positive_counts[edge_pointers] == 9
edge_contributions = positive_counts[edge_pointers] - negative_counts[edge_pointers]
np.bincount(edge_contributions + 10)

array([        0,      8006,     17328,     44206,     96090,    231480,
          577286,   1569568,   4527236,  16172012, 144492062,   8840496,
         4477222,   3177662,   2470940,   2064664,   1739086,   1668970,
         1404830,   1746662])

In [6]:
candidates = (high_peptide_indices > low_peptide_indices)
thread_count = 8
with multiprocessing.pool.ThreadPool(thread_count) as p:
    results = p.starmap(
        score_annotations,
        [
            (
                candidates[i::thread_count],
                edge_contributions,
                indptr,
                indices,
                peptide_pointers,
                low_peptide_indices,
                high_peptide_indices,
                peptide_sequences.shape[0],
            ) for i in range(thread_count)
        ]
    )
annotated_ions = np.concatenate([r[0] for r in results])
annotated_peptides = np.concatenate([r[1] for r in results])
annotated_scores = np.concatenate([r[2] for r in results])
annotated_protein_pointers = protein_pointers[annotated_peptides]

In [7]:
unique_proteins = [
    i for i, prot in enumerate(annotated_protein_pointers) if ";" not in prot
]
selected_annotated_ions = annotated_ions[unique_proteins]
selected_annotated_peptides = annotated_peptides[unique_proteins]
selected_annotated_scores = annotated_scores[unique_proteins]
selected_annotated_protein_pointers = annotated_protein_pointers[unique_proteins].astype(np.int64)
selected_annotated_decoys = decoys[selected_annotated_protein_pointers]

In [8]:
for i in range(int(np.max(selected_annotated_scores))):
    print(i, np.bincount(selected_annotated_decoys[selected_annotated_scores > i]))

0 [94397 96118]
1 [69003 69998]
2 [56556 57033]
3 [49304 49399]
4 [43171 43305]
5 [39039 38930]
6 [35169 35167]
7 [32060 32135]
8 [29292 29357]
9 [27100 27169]
10 [25263 25163]
11 [23866 23657]
12 [22104 21896]
13 [20864 20568]
14 [19500 19163]
15 [18257 17893]
16 [16937 16492]
17 [15906 15428]
18 [14675 14074]
19 [13826 13247]
20 [12975 12444]
21 [12246 11725]
22 [11522 10965]
23 [10771 10238]
24 [10054  9534]
25 [9326 8869]
26 [8633 8311]
27 [8054 7718]
28 [7567 7231]
29 [7184 6811]
30 [6707 6375]
31 [6411 6009]
32 [6028 5602]
33 [5639 5250]
34 [5241 4907]
35 [4935 4622]
36 [4612 4318]
37 [4322 4074]
38 [4008 3864]
39 [3807 3651]
40 [3676 3414]
41 [3484 3246]
42 [3259 3067]
43 [3079 2853]
44 [2980 2665]
45 [2779 2462]
46 [2669 2338]
47 [2533 2210]
48 [2429 2055]
49 [2298 1974]
50 [2198 1799]
51 [2039 1690]
52 [1934 1619]
53 [1850 1559]
54 [1791 1490]
55 [1701 1414]
56 [1636 1376]
57 [1590 1302]
58 [1502 1253]
59 [1463 1195]
60 [1392 1176]
61 [1362 1126]
62 [1322 1080]
63 [1241 1044]


In [9]:
np.bincount(selected_annotated_scores.astype(np.int))

array([    0, 51514, 25412, 14886, 12227,  8507,  7633,  6141,  5546,
        4380,  3843,  2903,  3523,  2568,  2769,  2513,  2721,  2095,
        2585,  1676,  1654,  1448,  1484,  1478,  1421,  1393,  1251,
        1172,   974,   803,   913,   662,   790,   741,   741,   591,
         627,   534,   524,   414,   368,   360,   404,   394,   287,
         404,   234,   264,   259,   212,   275,   268,   176,   144,
         128,   166,   103,   120,   137,    97,    90,    80,    86,
         117,    71,    59,    99,    62,   142,    80,    85,    59,
          77,    76,    95,   113,    70,    56,    55,    62,    55,
          22,    31,    57,    24,    15,    30,    28,    61,    19,
          12,    25,    26,    27,    30,    20,    40,    17,     7,
           0,     0,     9,     0,    13,    11,    27,     6,     0,
           0,     6,    13,     0,     0,    12,    23,     6,    32,
          12,    21,    21,     2,     0,     9,    12,    18,    23,
           4,    17,