In [None]:
import os
import importlib
from timeit import timeit
import logging
import sys
import importlib
import time
import multiprocessing as mp
import multiprocessing.pool

import numpy as np
import pandas as pd
import scipy
import h5py
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numba
import sklearn.linear_model

import ms_utils
import browser
import interface
import ms_database
import ms_run_files

import sandbox

import line_profiler
# Notebook-wide line profiler instance. The commented lines below sketch the
# usage pattern: wrap the function of interest with `profile(...)`, run it, and
# then print the collected per-line timings.
# NOTE(review): `heat.evolve` is not imported anywhere in this notebook; it looks
# like a leftover placeholder for "the function to profile".
profile = line_profiler.LineProfiler()
# heat.evolve = profile(heat.evolve)
# profile.print_stats()

def reload():
    """Reload every project module so that on-disk edits reach this kernel.

    The modules are reloaded one after another in a fixed order; the first
    failing reload aborts the remaining ones.
    """
    modules_in_reload_order = (
        ms_run_files,
        ms_utils,
        browser,
        interface,
        sandbox,
        ms_database,
    )
    for module in modules_in_reload_order:
        importlib.reload(module)
   

In [None]:
def get_candidate_peptide_indices_for_nodes(
    inet,
    database,
    parameters
):
    """Find, for every ion, the range of database fragments within the ppm tolerance.

    The m/z values are compared in log space scaled by 10**6, where a
    difference of 1 corresponds (to first order) to 1 ppm.

    Parameters
    ----------
    inet
        Object providing ``file_name`` and
        ``get_ion_coordinates("FRAGMENT_MZ")`` (the ion m/z values).
    database
        Object providing ``get_fragment_coordinates("mz")``. These values are
        assumed to be sorted ascending, because ``np.searchsorted`` requires
        it; this is not checked here.
    parameters
        Mapping that contains the tolerance under the key ``"annotation_ppm"``.

    Returns
    -------
    (low_limits, high_limits)
        Two integer arrays in the original ion order. For ion ``i`` the
        database fragments ``low_limits[i]:high_limits[i]`` lie within the
        tolerance; the range is empty when both limits are equal.
    """
    # NOTE(review): the message says "Writing", but this function only computes
    # and returns the limits; nothing is written to the file here.
    ms_utils.LOGGER.info(
        f"Writing node candidates to {inet.file_name}"
    )
    max_ppm = parameters["annotation_ppm"]
    self_mzs = inet.get_ion_coordinates("FRAGMENT_MZ")
    # The queries are sorted before the lookup and the result is unsorted
    # again afterwards; the returned limits do not depend on the query order.
    mz_order = np.argsort(self_mzs)
    # Transform each side once instead of once per searchsorted call; the
    # database array in particular can be large.
    database_log_mzs = np.log(database.get_fragment_coordinates("mz")) * 10**6
    ion_log_mzs = np.log(self_mzs[mz_order]) * 10**6
    low_limits = np.searchsorted(
        database_log_mzs,
        ion_log_mzs - max_ppm,
        "left"
    )
    high_limits = np.searchsorted(
        database_log_mzs,
        ion_log_mzs + max_ppm,
        "right"
    )
    # argsort of a permutation yields its inverse: back to original ion order.
    inv_order = np.argsort(mz_order)
    return low_limits[inv_order], high_limits[inv_order]


@numba.njit(nogil=True, cache=True)
def score_annotations(
    candidates,
    edge_contributions,
    indptr,
    indices,
    peptide_pointers,
    low_peptide_indices,
    high_peptide_indices,
    peptide_count,
):
    """Pick the best supported candidate peptide for every flagged ion.

    Parameters
    ----------
    candidates
        1-D mask. The positions of its nonzero entries are used directly as
        ion indices into ``indptr`` and the low/high limit arrays.
    edge_contributions
        One value per stored edge, aligned with ``indices``. Only edges with
        a contribution > 0 are used.
    indptr, indices
        CSR-style adjacency: the neighbors of ion ``i`` are
        ``indices[indptr[i]:indptr[i + 1]]``.
    peptide_pointers
        Peptide index per database fragment.
    low_peptide_indices, high_peptide_indices
        Per ion, the range ``low:high`` into ``peptide_pointers`` that holds
        its candidate peptides.
    peptide_count
        Size of the per-ion score table (must exceed every peptide index).

    Returns
    -------
    (annotated_ions, annotated_peptides, annotated_scores)
        Parallel arrays with one entry per ion that received an annotation:
        the ion index, the winning peptide index and its score (as float64).
    """
    capacity = candidates.shape[0]
    ion_buffer = np.empty(capacity, np.int64)
    peptide_buffer = np.empty(capacity, np.int64)
    score_buffer = np.empty(capacity, np.float64)
    annotation_count = 0
    for ion in np.flatnonzero(candidates):
        edge_start = indptr[ion]
        edge_end = indptr[ion + 1]
        ion_contributions = edge_contributions[edge_start:edge_end]
        supporting = np.flatnonzero(ion_contributions > 0)
        if supporting.shape[0] == 0:
            # No positively contributing edge: nothing can vote for this ion.
            continue
        supporting_neighbors = indices[edge_start:edge_end][supporting]
        supporting_contributions = ion_contributions[supporting]
        own_peptides = peptide_pointers[
            low_peptide_indices[ion]: high_peptide_indices[ion]
        ]
        # Every supporting neighbor adds its edge contribution to each of its
        # own candidate peptides.
        # NOTE(review): with NumPy semantics a peptide index repeated within
        # one neighbor's range is incremented only once by the fancy-indexed
        # add below; confirm before rewriting this as an explicit loop.
        peptide_votes = np.zeros(peptide_count, np.int64)
        for edge in range(supporting.shape[0]):
            neighbor = supporting_neighbors[edge]
            first = low_peptide_indices[neighbor]
            last = high_peptide_indices[neighbor]
            if first == last:
                continue
            peptide_votes[peptide_pointers[first:last]] += supporting_contributions[edge]
        # Keep only the votes that landed on this ion's own candidates.
        own_votes = peptide_votes[own_peptides]
        voted = np.flatnonzero(own_votes)
        if voted.shape[0] == 0:
            continue
        # NOTE: an earlier, now disabled, experiment additionally filtered on
        # the frequency of the top score; ties are currently resolved by
        # np.argmax, i.e. the first best candidate wins.
        voted_scores = own_votes[voted]
        best = np.argmax(voted_scores)
        ion_buffer[annotation_count] = ion
        peptide_buffer[annotation_count] = own_peptides[voted[best]]
        score_buffer[annotation_count] = voted_scores[best]
        annotation_count += 1
    # Trim the preallocated buffers to the number of annotations written.
    return (
        ion_buffer[:annotation_count],
        peptide_buffer[:annotation_count],
        score_buffer[:annotation_count],
    )

In [None]:
# Input locations.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine with this directory layout.
INET_FILE_NAME = "/home/sander/Documents/Proteomics/data/ecoli/28Oct2016_060.inet.hdf"
DATABASE_FILE_NAME = "/home/sander/Documents/Proteomics/data/databases/crap_ecoli_concatenated_decoy.hdf"

# The evidence file is constructed from the network file object.
inet = ms_run_files.HDF_Network_File(INET_FILE_NAME)
evi = ms_run_files.HDF_Evidence_File(inet)
database = ms_database.HDF_Database_File(DATABASE_FILE_NAME)
parameters = ms_utils.read_parameters_from_json_file(default="annotation")

In [None]:
# Per-ion ranges into the database fragment list (within the ppm tolerance).
candidate_limits = get_candidate_peptide_indices_for_nodes(inet, database, parameters)
low_peptide_indices, high_peptide_indices = candidate_limits
peptide_pointers = database.get_fragment_coordinates("peptide_index")

# Symmetric adjacency plus, per stored edge, a pointer into the evidence arrays.
indptr, indices, edge_pointers = inet.get_edges(
    symmetric=True,
    return_pointers=True,
)
positive_counts = evi.get_edges()
negative_counts = evi.get_edges(positive=False)
# Net evidence per stored edge: positive minus negative count.
# NOTE(review): assumes the counts have a signed dtype; with unsigned counts
# this subtraction would wrap around instead of going negative - confirm.
positive_per_edge = positive_counts[edge_pointers]
negative_per_edge = negative_counts[edge_pointers]
edge_contributions = positive_per_edge - negative_per_edge
# edge_contributions = positive_counts == 9

peptide_sequences = database.read_dataset("sequence", "peptides")
protein_pointers = database.read_dataset("proteins", "peptides")
proteins = database.read_dataset("protein", "proteins")
# Decoy proteins are recognized by their name prefix.
decoy_flags = [name.startswith("DECOY_") for name in proteins]
decoys = np.array(decoy_flags)

In [None]:
# edge_contributions = positive_counts[edge_pointers] == 9
edge_contributions = positive_counts[edge_pointers] - negative_counts[edge_pointers]
# np.bincount rejects negative values, so the contributions are shifted before
# counting. The shift is 10 unless a contribution below -10 occurs, in which
# case it grows just enough to keep every value non-negative (previously that
# case raised a ValueError). Bin k counts the contribution k - bincount_offset.
bincount_offset = 10
if edge_contributions.size:
    bincount_offset = max(bincount_offset, -int(edge_contributions.min()))
np.bincount(edge_contributions + bincount_offset)

In [None]:
candidates = (high_peptide_indices > low_peptide_indices)
thread_count = 8
# score_annotations uses the *positions* of the True entries of its mask as ion
# indices into indptr and the low/high limit arrays. Handing it the strided
# slice candidates[i::thread_count] renumbers those positions (slice position j
# is really ion i + j * thread_count), so every thread would score the first
# len(candidates) / thread_count ions with the wrong candidate flags. Instead,
# each thread gets a full-length mask in which only its own stride is kept.
candidate_masks = []
for i in range(thread_count):
    thread_mask = np.zeros_like(candidates)
    thread_mask[i::thread_count] = candidates[i::thread_count]
    candidate_masks.append(thread_mask)
# Threads (not processes) are sufficient because score_annotations is compiled
# with nogil=True.
with multiprocessing.pool.ThreadPool(thread_count) as p:
    results = p.starmap(
        score_annotations,
        [
            (
                thread_mask,
                edge_contributions,
                indptr,
                indices,
                peptide_pointers,
                low_peptide_indices,
                high_peptide_indices,
                peptide_sequences.shape[0],
            ) for thread_mask in candidate_masks
        ]
    )
# The per-thread results are concatenated thread by thread, so the combined
# arrays are grouped by stride rather than sorted by ion index.
annotated_ions = np.concatenate([r[0] for r in results])
annotated_peptides = np.concatenate([r[1] for r in results])
annotated_scores = np.concatenate([r[2] for r in results])
annotated_protein_pointers = protein_pointers[annotated_peptides]

In [None]:
# Keep only annotations whose peptide points to a single protein: an entry
# containing ";" lists several proteins and is dropped.
unique_proteins = [
    position
    for position in range(len(annotated_protein_pointers))
    if ";" not in annotated_protein_pointers[position]
]
selected_annotated_ions = annotated_ions[unique_proteins]
selected_annotated_peptides = annotated_peptides[unique_proteins]
selected_annotated_scores = annotated_scores[unique_proteins]
# The remaining entries each hold one protein index as text; convert them to
# integers so they can index the decoy flags.
single_protein_entries = annotated_protein_pointers[unique_proteins]
selected_annotated_protein_pointers = single_protein_entries.astype(np.int64)
selected_annotated_decoys = decoys[selected_annotated_protein_pointers]

In [None]:
# For every integer threshold below the highest score, print how many of the
# annotations scoring above it are targets (first bin) and decoys (second bin).
highest_score = int(np.max(selected_annotated_scores))
for threshold in range(highest_score):
    above_threshold = selected_annotated_scores > threshold
    print(threshold, np.bincount(selected_annotated_decoys[above_threshold]))

In [None]:
# Histogram of the annotation scores. `np.int` was deprecated in NumPy 1.20 and
# removed in 1.24 (AttributeError), so the explicit np.int64 used elsewhere in
# this notebook is used here as well.
np.bincount(selected_annotated_scores.astype(np.int64))