<a href="https://colab.research.google.com/github/yc386/orthrus_metaproteomics/blob/main/orthrus_v110/orthrus_v110_pt3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Orthrus pt3- [`Mokapot`](https://github.com/wfondrie/mokapot)**

Runtime recommendation:
1. Normal CPU
2. CPU, high RAM (50GB)
3. **TPU** runtime if RAM usage is expected to be extremely high (many PSMs)

**Ignore the prompt to click restart!** Just wait: the runtime will restart automatically.

In [None]:
#@title install dependencies, will automatically restart to sort out versions conflicts caused by Mokapot
# Install mokapot and xgboost (unpinned; pip resolves to whatever is current).
!pip install mokapot xgboost
# Replace the numpy that the step above pulled in with the pinned 1.24.1 build.
# NOTE(review): the recorded pip log reports that triqler 0.8.0 (a mokapot
# dependency) requires numpy>=2.0.0, so this pin is flagged as incompatible --
# confirm the pin is still the intended workaround.
!pip uninstall -y numpy
!pip install numpy==1.24.1
import os
# Send signal 9 to the current kernel process. Per the cell title, the runtime
# then restarts on its own so the freshly installed numpy is the one imported.
os.kill(os.getpid(), 9)

Collecting mokapot
  Downloading mokapot-0.10.0-py3-none-any.whl.metadata (4.5 kB)
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lxml>=4.6.2 (from mokapot)
  Downloading lxml-5.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting triqler>=0.6.2 (from mokapot)
  Downloading triqler-0.8.0-py3-none-any.whl.metadata (11 kB)
Collecting importlib-metadata>=5.1.0 (from mokapot)
  Downloading importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Collecting job-pool<0.3.0,>=0.2.6 (from triqler>=0.6.2->mokapot)
  Downloading job_pool-0.2.6-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.18.1 (from mokapot)
  Downloading numpy-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Found existing installation: numpy 2.1.3
Uninstalling numpy-2.1.3:
  Successfully uninstalled numpy-2.1.3
Collecting numpy==1.24.1
  Downloading numpy-1.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
triqler 0.8.0 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.1 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.1 which is incompatible.
tensorflow-tpu 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.1 which is incompatible.[0m[31m
[0mSuccessfully installed 

In [1]:
#@title Add inputs for `Mokapot` -> click `Runtime` -> `Run cell and below`
# peak_folder: root folder handed to the "Brew Mokapot" cell. In joint mode it
# is walked recursively for `.pin` files; otherwise each immediate sub-folder is
# searched for `.pin` files. Results are written back into the same folder(s).
# NOTE(review): the form text below mentions `.sage.tsv` results, but the Brew
# cell only looks for `.pin` files -- confirm which description is intended.
peak_folder = "" #@param {type:"string"}
#@markdown - a folder contains relevant `.sage.tsv` results
# joint_modelling: True -> one model is brewed from all `.pin` files found under
# peak_folder; False -> each sub-folder of peak_folder is brewed separately.
joint_modelling= True #@param {type:"boolean"}
#@markdown - a joint model for low abundance samples, unclick for a separate model per experiment
# default_Percolator: True -> mokapot.PercolatorModel(); False -> the XGBoost
# grid-search estimator (`xgb_mod`) wrapped in mokapot.Model.
default_Percolator=True #@param {type:"boolean"}
#@markdown - Python implementation of the Percolator SVM model


In [None]:
#@title Brew Mokapot

import mokapot
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import glob

"""
XGBoost schema from Fondrie & Noble (2021).
A non-linear XGBoost seems to be better for rescoring open search results.
"""

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import os


# Hyper-parameter search space explored for the XGBoost classifier:
# 3 values for each of 4 parameters.
grid = dict(
    scale_pos_weight=np.logspace(0, 2, 3),  # log-spaced: 1, 10, 100
    max_depth=[1, 3, 6],
    min_child_weight=[1, 10, 100],
    gamma=[0, 1, 10],
)


# Exhaustive grid search over `grid`, scored by ROC AUC with 3-fold
# cross-validation, running on a single worker.
xgb_mod = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=1,
)

"""Recursively find all .pin files in the given folder."""
def get_all_pin_files(folder_path):
    """Recursively collect the paths of all `.pin` files below a folder.

    Parameters
    ----------
    folder_path : str
        Directory to search. Every sub-directory is visited.

    Returns
    -------
    list of str
        Paths (each built by joining the visited directory and the file name)
        of every file whose name ends with ``.pin``, sorted so the order is
        the same on every run. Empty if nothing matches or the folder does
        not exist.
    """
    psm_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.endswith('.pin')
    ]
    # os.walk lists entries in an arbitrary, filesystem-dependent order;
    # sorting keeps the input order (and anything derived from it) reproducible.
    return sorted(psm_files)


def _brew_and_save(pin_input, dest_dir):
    """Read PIN input, brew it with the selected model and write the results.

    Parameters
    ----------
    pin_input : str or list of str
        A single `.pin` path, or several paths to be read together.
    dest_dir : str
        Directory passed to ``results.to_txt``.

    Returns
    -------
    tuple
        ``(psm_list, results, models, result_files)``: the object returned by
        ``mokapot.read_pin``, the two values returned by ``mokapot.brew``, and
        whatever ``results.to_txt`` returns.
    """
    # The form flag selects the model: Percolator SVM, or the XGBoost grid search.
    if default_Percolator:
        model = mokapot.PercolatorModel()
    else:
        model = mokapot.Model(xgb_mod)
    psms = mokapot.read_pin(pin_input)
    brewed, fitted = mokapot.brew(psms, model)
    written = brewed.to_txt(dest_dir)
    return psms, brewed, fitted, written


folder_path = peak_folder

if joint_modelling:
    # Joint mode: every .pin file found anywhere under peak_folder goes into
    # one model, and the results are written to peak_folder itself.
    psm_files = get_all_pin_files(peak_folder)

    if not psm_files:
        # Fail early with a clear message (e.g. peak_folder left empty in the
        # form) instead of passing an empty list on to mokapot.
        raise FileNotFoundError(
            f"No .pin files found under {peak_folder!r}; check `peak_folder`."
        )

    psm_list, results, models, result_files = _brew_and_save(psm_files, peak_folder)

else:
    # Separate mode: each immediate sub-folder is brewed on its own and gets
    # its own result files.
    big_folder = sorted(glob.glob(f"{folder_path}/*"))

    for folder in big_folder:
        if not os.path.isdir(folder):
            continue

        print(f"Processing folder: {folder}")
        # Sorted so that "the first .pin file" is the same file on every run.
        pin_files = sorted(glob.glob(f"{folder}/*.pin"))

        if not pin_files:
            print(f"No .pin files found in {folder}. Skipping...")
            continue

        # Only one .pin file per folder is used; say so when others are skipped.
        pin = pin_files[0]
        if len(pin_files) > 1:
            print(f"Found {len(pin_files)} .pin files in {folder}; using only {pin}.")

        psm_list, results, models, result_files = _brew_and_save(pin, folder)



  yield psms.apply(pd.to_numeric, errors="ignore")
