In [1]:
import os
import importlib
from timeit import timeit
import logging
import sys
import importlib
import time
import multiprocessing as mp
import multiprocessing.pool

import numpy as np
import pandas as pd
import scipy
import h5py
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numba
import sklearn.linear_model

import ms_utils
import browser
import interface
import ms_database
import ms_run_files

import sandbox

import line_profiler
# Shared line-profiler instance for this notebook. The commented lines below
# sketch the intended usage: wrap a function with `profile(...)`, run it, then
# print the collected statistics.
profile = line_profiler.LineProfiler()
# NOTE(review): `heat` is not among the imports visible in this cell, so the
# two lines below look like a leftover template rather than runnable code.
# heat.evolve = profile(heat.evolve)
# profile.print_stats()

def reload():
    """Re-import the project modules so that edits on disk reach the running kernel.

    The modules are reloaded one after another, always in the same order.
    """
    for module in (
        ms_run_files,
        ms_utils,
        browser,
        interface,
        sandbox,
        ms_database,
    ):
        importlib.reload(module)
   

In [2]:
# Open the evidence file of one run and keep a handle to its ion network.
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# file exists; consider moving the location into a configurable data directory.
evi = ms_run_files.HDF_Evidence_File("/home/sander/Documents/Proteomics/data/ecoli/28Oct2016_060.inet.csv")
inet = evi.ion_network

In [None]:
# Two per-edge arrays from the evidence file: the default call and the
# `positive=False` variant. Presumably these count the runs that support and
# contradict each edge — TODO confirm against `get_edges`.
positive_edges = evi.get_edges()
negative_edges = evi.get_edges(positive=False)

In [None]:
# Adjacency of the ion network as three arrays. `cluster` and `expand_cluster`
# read the neighbors of node i as `indices[indptr[i]:indptr[i + 1]]` and use the
# matching slice of `edge_pointers` to look up per-edge values.
# NOTE(review): looks like a CSR layout with both edge directions stored
# (symmetric=True) — confirm against `get_edges`.
indptr, indices, edge_pointers = inet.get_edges(symmetric=True, return_pointers=True)

In [4]:
# Export spectra through the evidence file's own MGF writer, configured by a
# single parameter dictionary.
# NOTE(review): "edge_threshold" is passed as an expression string; presumably
# `create_mgf` evaluates it per edge using the names it mentions — confirm.
# NOTE(review): "log_file_name" and "output_directory" both point at the
# current working directory (".").
evi.create_mgf(
    {
        "log_file_name": ".",
        "output_directory": ".",
        "force_overwrite": True,
        "minimum_peaks": 5,
        "edge_threshold": "2 * (positive_edges - negative_edges) > evidence_run_count"
    }
)

In [None]:
@numba.njit
def cluster(
    indptr,
    indices,
    selected_edges,
    edge_pointers,
):
    """Assign a cluster number to every node by flood-filling over selected edges.

    Starting from each node that has no cluster yet, all nodes reachable through
    edges whose `selected_edges` entry is set are collected and given the same
    cluster number.

    Parameters
    ----------
    indptr : array
        Row pointers; the neighbors of node `i` are
        `indices[indptr[i]:indptr[i + 1]]`. Its length minus one is the node count.
    indices : array
        Neighbor node indices, sliced per node through `indptr`.
    selected_edges : array
        Indexed with the values of `edge_pointers`; the result masks the neighbor
        slice (presumably a boolean array with one entry per edge — TODO confirm).
    edge_pointers : array
        Parallel to `indices`; maps each neighbor entry to a position in
        `selected_edges`.

    Returns
    -------
    array of int64
        One cluster number per node. Numbering starts at 1, so no node keeps the
        initial value 0.
    """
    node_count = indptr.shape[0] - 1
    # 0 marks "not assigned to any cluster yet".
    clusters = np.zeros(node_count, np.int64)
    cluster_number = 0
    for index in range(node_count):
        if clusters[index] != 0:
            continue
        # `current_cluster` holds visited nodes, `new_indices` the frontier.
        current_cluster = set()
        new_indices = set()
        new_indices.add(index)
        while len(new_indices) > 0:
            new_index = new_indices.pop()
            current_cluster.add(new_index)
            neighbors = indices[indptr[new_index]: indptr[new_index + 1]]
            pointers = edge_pointers[indptr[new_index]: indptr[new_index + 1]]
            selected = selected_edges[pointers]
            # Queue selected neighbors that were not visited in this fill.
            # NOTE(review): nodes labelled by an earlier fill are not excluded
            # here; with a symmetric adjacency they should be unreachable — confirm.
            new_indices |= set(neighbors[selected]) - current_cluster
        cluster_number += 1
        for i in current_cluster:
            clusters[i] = cluster_number
    return clusters

In [None]:
# Cluster the network, following only edges whose `positive_edges` value exceeds
# the `negative_edges` value by more than 5. The commented argument is an
# alternative selection (edges whose positive value equals the run count).
clusters = cluster(
    indptr,
    indices,
    (positive_edges - negative_edges) > 5,
#     positive_edges==evi.run_count,
    edge_pointers,
)
# Number of nodes per cluster label; index 0 is the unused label 0.
cluster_sizes = np.bincount(clusters)

In [None]:
# Ad-hoc size comparison: shape of a per-edge mask, shape of the neighbor array,
# and half the length of `edge_pointers`.
# NOTE(review): presumably checks that the symmetric adjacency stores every edge
# twice — confirm.
( positive_edges==evi.run_count).shape, indices.shape, edge_pointers.shape[0]/2

In [None]:
%matplotlib notebook

# `np.bincount(cluster_sizes)` counts how many cluster labels have each size; the
# reversed cumulative sum turns that into "number of labels with at least x
# nodes", plotted against x. The unused label 0 (size 0) is included in the count.
plt.plot(np.cumsum(np.bincount(cluster_sizes)[::-1])[::-1], marker=".")

In [None]:
def write_to_mgf(
    inet,
    clusters,
    file_name,
    minsize,
    expand,
    indptr,
    indices,
    selected_edges,
    edge_pointers,
):
    """Write every cluster with at least `minsize` nodes as one spectrum of an MGF file.

    Parameters
    ----------
    inet : object
        Ion network; must provide `get_ion_coordinates(list_of_names)` returning
        one array per requested coordinate, and a `file_name` attribute that is
        copied into each spectrum title.
    clusters : array of non-negative int
        Cluster label per node (as returned by `cluster`).
    file_name : str
        Path of the MGF file to create; an existing file is overwritten.
    minsize : int
        Minimum number of nodes a cluster label needs in order to be written.
    expand : bool
        If True, each cluster is extended with `expand_cluster` before writing.
    indptr, indices, selected_edges, edge_pointers : arrays
        Adjacency arrays forwarded to `expand_cluster`; only read when `expand`
        is True.

    Notes
    -----
    Peaks are written sorted by m/z. Intensities are `2**FRAGMENT_LOGINT`.
    The retention time is the cluster mean multiplied by 60 (presumably minutes
    to seconds — TODO confirm the unit of PRECURSOR_RT). PEPMASS and CHARGE are
    fixed placeholders (1000 and 2+).
    """
    # Group node indices by cluster label: the nodes of label k are
    # cluster_indices[cluster_indptr[k]:cluster_indptr[k + 1]].
    cluster_indices = np.argsort(clusters)
    cluster_sizes = np.bincount(clusters)
    cluster_indptr = np.concatenate(([0], np.cumsum(cluster_sizes)))
    mzs, ints, rts = inet.get_ion_coordinates(
        ["FRAGMENT_MZ", "FRAGMENT_LOGINT", "PRECURSOR_RT"]
    )
    with open(file_name, "w") as outfile:
        # Honour the caller's size threshold instead of a hard-coded one.
        for cluster_index in np.flatnonzero(cluster_sizes >= minsize):
            members = cluster_indices[
                cluster_indptr[cluster_index]: cluster_indptr[cluster_index + 1]
            ]
            if expand:
                members = expand_cluster(
                    members,
                    indptr,
                    indices,
                    selected_edges,
                    edge_pointers,
                )
            local_mzs = np.round(mzs[members], 4)
            local_ints = np.round(2**ints[members], 2)
            local_rts = rts[members]
            outfile.write("BEGIN IONS\n")
            outfile.write(
                f"TITLE=cluster_index.{cluster_index}.{cluster_index}. "
                f"File=\"{inet.file_name}\" "
                f"NativeID:\"sample=1 period=1 cycle={cluster_index-1} experiment=1\"\n"
            )
            outfile.write(
                f"RTINSECONDS={np.round(np.average(local_rts) * 60, 2)}\n"
            )
            outfile.write("PEPMASS=1000\n")
            outfile.write("CHARGE=2+\n")
            for i in np.argsort(local_mzs):
                outfile.write(f"{local_mzs[i]} {local_ints[i]}\n")
            outfile.write("END IONS\n")

# @numba.njit
def expand_cluster(
    cluster,
    indptr,
    indices,
    selected_edges,
    edge_pointers,
):
    """Return the cluster together with its direct neighbors over selected edges.

    For every node in `cluster`, the neighbor slice
    `indices[indptr[node]:indptr[node + 1]]` is masked with the `selected_edges`
    entries addressed by the matching slice of `edge_pointers`. The result is the
    sorted, de-duplicated union of the original nodes and the kept neighbors.
    """
    pieces = [cluster]
    for node in cluster:
        start = indptr[node]
        stop = indptr[node + 1]
        keep = selected_edges[edge_pointers[start:stop]]
        pieces.append(indices[start:stop][keep])
    return np.unique(np.concatenate(pieces))

In [None]:
# Write the clusters of the single-run network to an MGF file.
# With expand=False, `write_to_mgf` never calls `expand_cluster`, so the
# adjacency arguments and `selected_edges` are passed but not consulted.
# NOTE(review): absolute, machine-specific output path.
write_to_mgf(
    inet=inet,
    clusters=clusters,
    file_name="/home/sander/Documents/Sandbox/test_msfragger/test.mgf",
    minsize=5,
    expand=False,
    indptr=indptr,
    indices=indices,
    selected_edges=(positive_edges - negative_edges) > 3,
    edge_pointers=edge_pointers,
)

In [None]:
# Dump the "proteins" group of the HDF database as a FASTA file: one header line
# (">" + protein name) followed by one sequence line per entry.
db = ms_database.HDF_Database_File(
    "/home/sander/Documents/Proteomics/data/databases/crap_ecoli_concatenated_decoy.hdf"
)
sequences = db.read_dataset("sequence", "proteins")
proteins = db.read_dataset("protein", "proteins")
with open(
    "/home/sander/Documents/Sandbox/test_msfragger/ecoli_with_decoy.fasta",
    "w",
) as infile:
    for seq, prot in zip(sequences, proteins):
        # Header and sequence of one record in a single write.
        infile.write(f">{prot}\n{seq}\n")

In [None]:
# "PRECURSOR_MZ" coordinate of the ions in the network (presumably one value per
# node — TODO confirm), used to inspect individual clusters.
pmzs = inet.get_ion_coordinates("PRECURSOR_MZ")

In [None]:
# Precursor m/z values of all nodes carrying cluster label 988762.
# NOTE(review): hard-coded label from one particular clustering run; it will
# point at a different (or no) cluster whenever the clustering changes.
pmzs[np.flatnonzero(clusters==988762)]

In [None]:
# Load every "*.evidence.hdf" file of the folder, keyed by run name, and keep
# the matching ion networks alongside.
in_folder = "/home/sander/projects/tenzer"
evis = {}
inets = {}
for file_name in sorted(os.listdir(in_folder)):
    if not file_name.endswith(".evidence.hdf"):
        continue
    in_file_name = os.path.join(in_folder, file_name)
    evi = ms_run_files.Evidence(in_file_name)
    run_name = evi.run_name
    inet = evi.ion_network
    evis[run_name] = evi
    inets[run_name] = inet

# The alphabetically first run serves as the reference ("self") run.
self_run = sorted(inets)[0]
self_evi = evis[self_run]
self_inet = inets[self_run]

In [None]:
# reproducibility_counts = self_evi.get_nodes()
# alignment_indices = [
#      np.flatnonzero(reproducibility_counts == i) for i in range(1 + self_evi.run_count)
# ]
# alignments = [
#     np.empty(
#         (alignment_indices[i].shape[0], i), np.int64
#     ) for i in range(1 + self_evi.run_count)
# ]
# alignment_masks = [
#     np.empty(
#         (alignment_indices[i].shape[0], i), np.bool_
#     ) for i in range(1 + self_evi.run_count)
# ]

# One row per node of the reference run, one column per run. Only rows/columns
# flagged in `alignment_matrix_mask` hold meaningful values; `alignment_matrix`
# itself starts out uninitialised.
matrix_shape = (self_inet.node_count, self_evi.run_count)
alignment_matrix = np.empty(matrix_shape, np.int64)
alignment_matrix_mask = np.zeros(matrix_shape, np.bool_)
alignment_matrix_intensities = np.zeros(matrix_shape, np.float64)

# Column `index` belongs to the (index + 1)-th run in sorted order; the first
# sorted run is the reference run itself and is skipped.
other_runs = sorted(evis.items())[1:]
for index, (other_run, other_evi) in enumerate(other_runs):
    self_ali = self_evi.get_nodes(other_evi)
    other_ali = other_evi.get_nodes(self_evi)
    other_logints = other_evi.ion_network.get_ion_coordinates(
        "FRAGMENT_LOGINT",
        other_ali
    )
    alignment_matrix[self_ali, index] = other_ali
    alignment_matrix_mask[self_ali, index] = True
    alignment_matrix_intensities[self_ali, index] = 2**other_logints

In [None]:
# The alignment columns are split into two groups: odd columns plus the
# reference run itself form group "a", even columns form group "b"
# (presumably the two sample conditions — TODO confirm the run ordering).
a_reproducibility = 1 + np.sum(alignment_matrix_mask[:, 1::2], axis=1)
b_reproducibility = np.sum(alignment_matrix_mask[:, ::2], axis=1)
total_reproducibility = a_reproducibility + b_reproducibility

# Mean intensity per group. Nodes that were never aligned to a "b" run have no
# defined mean: they get NaN explicitly instead of triggering a division by
# zero (the resulting values are the same, without the RuntimeWarning).
b_intensity_sums = np.sum(alignment_matrix_intensities[:, ::2], axis=1)
b_intensities = np.full(b_intensity_sums.shape, np.nan)
np.divide(
    b_intensity_sums,
    b_reproducibility,
    out=b_intensities,
    where=b_reproducibility > 0,
)
a_intensities = (
    2**self_inet.get_ion_coordinates(
        "FRAGMENT_LOGINT"
    ) + np.sum(alignment_matrix_intensities[:, 1::2], axis=1)
) / a_reproducibility

# log2 fold change a vs. b; NaN wherever group "b" has no observation.
logfcs = np.log2(a_intensities) - np.log2(b_intensities)
# `a_reproducibility` is always >= 1 (the reference run counts itself), so the
# effective condition is "seen in at least one b run".
valid_logfcs = (a_reproducibility > 0) & (b_reproducibility > 0)

In [None]:
%matplotlib notebook

# One curve per total reproducibility level: log fold changes are rounded to one
# decimal, counted, and normalised so that each curve sums to 1.
# NOTE(review): the upper bound 10 is hard-coded; presumably it is the number of
# runs — confirm against `self_evi.run_count`.
for i in range(2, 1 + 10):
    reps = logfcs[valid_logfcs & (total_reproducibility == i)]
    # a: distinct rounded log fold changes, b: how often each occurs.
    a, b = np.unique(np.round(reps, 1), return_counts=True)
    plt.plot(a, b / np.sum(b), marker=".")

plt.xlim((-2, 2))
plt.xlabel("LOGFC")
plt.ylabel("relative frequency")
# Legend entries follow the order in which the curves were drawn above.
plt.legend(range(2, 1 + 10))

In [None]:
# Boolean node labels derived from the log fold change alone. NaN log fold
# changes compare False and therefore fall into neither group.
# NOTE(review): the cut-offs -0.5 and 0.2 are hard-coded; presumably they were
# read off the log fold change distribution of a mixed-species sample — confirm.
ecoli = logfcs < -0.5
yeast = logfcs > 0.2

In [None]:
# Edges of the reference run as two parallel arrays of node indices, plus the
# two per-edge evidence arrays of the reference run.
left_indices, right_indices = self_inet.get_edges(return_as_pairs=True)
positive_counts = self_evi.get_edges()
negative_counts = self_evi.get_edges(positive=False)

# Per edge: in how many alignment columns both end nodes are present.
left_overlaps = alignment_matrix_mask[left_indices]
right_overlaps = alignment_matrix_mask[right_indices]
overlaps = (left_overlaps & right_overlaps).sum(axis=1)

# Per edge: absolute difference between the log fold changes of its end nodes.
logfc_deviations = np.abs(logfcs[left_indices] - logfcs[right_indices])

In [None]:
# Disabled alternative: the net count scaled by its share of the total count and
# weighted with the positive count.
# evidence = (
#     positive_counts - negative_counts
# ) / (
#     positive_counts + negative_counts
# ) * positive_counts

# Active definition: per-edge difference between the two evidence arrays.
evidence = positive_counts - negative_counts

In [None]:
# Keep only edges whose end nodes co-occur in exactly `overlap` alignment columns.
overlap = 4
selected_edges = overlaps == overlap
# Optional intensity filter, currently disabled:
# int_filter = 1000
# logints = self_inet.get_ion_coordinates("FRAGMENT_LOGINT")
# selected_edges &= (logints[left_indices] < int_filter)  & (logints[right_indices] < int_filter) 

selected_left_indices = left_indices[selected_edges]
selected_right_indices = right_indices[selected_edges]
selected_logfc_deviations = logfc_deviations[selected_edges]

# Distinct finite evidence values among the selected edges.
selected_evidence = evidence[selected_edges]
selected_nan_values = ~np.isfinite(selected_evidence)
selected_evidence_values = np.unique(selected_evidence[~selected_nan_values])

In [None]:
%matplotlib notebook

# Colour scale spanning the evidence values present in the current edge selection.
color_mapper = matplotlib.cm.ScalarMappable(
    norm=matplotlib.colors.Normalize(
        vmin=np.min(selected_evidence_values),
        vmax=np.max(selected_evidence_values),
    ),
    cmap="RdYlGn"
)
percentiles = np.arange(101)

# Reference curve: pair every left node with a randomly drawn right node of the
# same selection (fixed seed, so the shuffle is repeatable).
random_right_indices = selected_right_indices.copy()
np.random.seed(100)
np.random.shuffle(random_right_indices)
random_logfc_deviation = np.abs(logfcs[selected_left_indices] - logfcs[random_right_indices])
nan_values = ~np.isfinite(random_logfc_deviation)
random_logfc_deviation = random_logfc_deviation[~nan_values]
# Percentiles of an empty array are undefined, so the reference is only drawn
# when at least one finite deviation exists.
has_random_reference = random_logfc_deviation.size > 0
if has_random_reference:
    plt.plot(
        np.percentile(random_logfc_deviation, percentiles),
        percentiles / 100,
        linestyle="dotted",
        c="black"
    )

# One empirical cumulative distribution per evidence value.
for evidence_value in selected_evidence_values:
    current = selected_evidence == evidence_value
    logfc_deviation = selected_logfc_deviations[current]
    nan_values = ~np.isfinite(logfc_deviation)
    logfc_deviation = logfc_deviation[~nan_values]
    if logfc_deviation.size == 0:
        # All edges with this evidence value lack a finite log fold change.
        continue
    plt.plot(
        np.percentile(logfc_deviation, percentiles),
        percentiles / 100,
        c=color_mapper.to_rgba(evidence_value)
    )

if has_random_reference:
    plt.axvline(
        np.median(random_logfc_deviation),
        linestyle="dotted",
        c="grey"
    )
# The mappable is not attached to any artist, so the axes to draw the colour bar
# next to must be named explicitly.
plt.colorbar(color_mapper, ax=plt.gca())
plt.xlabel("ABSOLUTE LOGFC DEVIATION ON EDGE")
# The y axis is the percentile rank divided by 100, i.e. a cumulative fraction.
plt.ylabel("Cumulative fraction")
plt.title("EDGE EVIDENCE BETWEEN FULLY REPRODUCIBLE NODES")

In [None]:
%matplotlib notebook

def _joint_hit_rate(left_flags, right_flags):
    """Share of edges flagged at both ends among the edges flagged at either end.

    Returns NaN (without a NumPy warning) when no edge is flagged at all.
    """
    either_count = np.sum(left_flags | right_flags)
    if either_count == 0:
        return np.nan
    return np.sum(left_flags & right_flags) / either_count


# Colour scale over the possible overlap counts (0 .. number of runs).
color_mapper = matplotlib.cm.ScalarMappable(
    norm=matplotlib.colors.Normalize(
        vmin=0,
        vmax=self_evi.run_count,
    ),
    cmap="RdYlGn"
)

# Edge selection of this cell. The threshold is defined here so that the line
# colour no longer depends on an `overlap` value left behind by another cell.
# (A per-overlap variant would loop `overlap` over range(1, self_evi.run_count)
# and select `overlaps == overlap` instead.)
minimum_overlap = 1
selected_edges = overlaps >= minimum_overlap
# selected_edges &= ecoli[left_indices] | ecoli[right_indices]

selected_left_indices = left_indices[selected_edges]
selected_right_indices = right_indices[selected_edges]
selected_evidence = evidence[selected_edges]
selected_nan_values = ~np.isfinite(selected_evidence)
selected_evidence_values = np.unique(selected_evidence[~selected_nan_values])

evidence_value_count = selected_evidence_values.shape[0]
ecoli_hit_rates = np.empty(evidence_value_count, np.float64)
yeast_hit_rates = np.empty(evidence_value_count, np.float64)
ecoli_random_rates = np.empty(evidence_value_count, np.float64)
yeast_random_rates = np.empty(evidence_value_count, np.float64)

# Per evidence value and species flag: the observed hit rate and the rate
# obtained after shuffling the right-hand flags (random baseline). The species
# are processed in the order ecoli, yeast for every evidence value.
for i, evidence_value in enumerate(selected_evidence_values):
    current = selected_evidence == evidence_value
    current_left_indices = selected_left_indices[current]
    current_right_indices = selected_right_indices[current]
    for flags, hit_rates, random_rates in (
        (ecoli, ecoli_hit_rates, ecoli_random_rates),
        (yeast, yeast_hit_rates, yeast_random_rates),
    ):
        left_flags = flags[current_left_indices]
        right_flags = flags[current_right_indices]
        hit_rates[i] = _joint_hit_rate(left_flags, right_flags)
        shuffled_right_flags = right_flags.copy()
        np.random.shuffle(shuffled_right_flags)
        random_rates[i] = _joint_hit_rate(left_flags, shuffled_right_flags)

# Solid = observed, dotted = shuffled baseline; the marker tells the species apart.
line_color = color_mapper.to_rgba(minimum_overlap)
for label, marker, hit_rates, random_rates in (
    ("ecoli", ".", ecoli_hit_rates, ecoli_random_rates),
    ("yeast", "x", yeast_hit_rates, yeast_random_rates),
):
    plt.plot(
        selected_evidence_values,
        hit_rates,
        marker=marker,
        c=line_color,
        label=f"{label} observed",
    )
    plt.plot(
        selected_evidence_values,
        random_rates,
        marker=marker,
        c=line_color,
        linestyle="dotted",
        label=f"{label} shuffled",
    )

plt.legend()
# The mappable is not attached to any artist, so the axes to draw the colour bar
# next to must be named explicitly.
plt.colorbar(color_mapper, ax=plt.gca())
plt.xlabel("EDGE EVIDENCE")
plt.ylabel("Relative frequency")