## Installation:

### Percolator:
- To install Percolator on Windows, download this file: https://github.com/percolator/percolator/releases/download/rel-3-06-01/percolator-v3-06.exe
- Run the downloaded file. During setup, make sure to select "add percolator to the system PATH for all users" when asked.

### ThermoRawFileParser:
- You need this if you want to read Thermo raw files.
- First, download this zip folder locally: https://github.com/compomics/ThermoRawFileParser/releases/download/v1.4.3/ThermoRawFileParser1.4.3.zip
- Extract the contents of the zip folder and note where they are saved; this location will be used later by Oktoberfest.

### Oktoberfest:
- Oktoberfest currently supports Python versions 3.9 and 3.10, so please install one of these Python versions.
- Install Oktoberfest using `pip install oktoberfest`

### Site Annotation:
- Install the site annotation package using `pip install psite-annotation`

## 1- Rescoring

In [None]:
from oktoberfest.runner import run_job
from oktoberfest import __version__ as version
import json
from tqdm.auto import tqdm

In [3]:
# The right-hand sides below are empty placeholders; fill them in before running this cell.
spectra =   # directory containing the mzML/raw files with the measured spectra
spectra_type =  # format the spectra are provided in ("mzml", "raw", "d")

search_results =                             # location of the search engine output
search_results_type =  # search engine type ('maxquant', 'MsFragger')

In [3]:
intensity_model = "Prosit_2024_intensity_cit"                              # model used for fragment intensity prediction, e.g. "some model"
retention_time_model = "Prosit_2024_irt_cit"                                   # model used for retention time prediction, e.g. "some model"
prediction_server = "koina.wilhelmlab.org:443"                             # Koina server that provides access to the specified models, e.g. "<url>:<port number>"

output_directory =                               # placeholder: specify your output directory here

In [5]:
 # Placeholder: local path to your ThermoRawFileParser.exe file, e.g. 'extracted_ThermoRawFileParser/ThermoRawFileParser.exe'
thermo_exe_directory= 

- Documentation for the different parameters in the config file can be found here:
https://oktoberfest.readthedocs.io/en/stable/config.html

In [9]:
# Nested sections of the rescoring configuration, built first so the
# top-level dictionary below stays flat and easy to scan.
rescoring_inputs = {
    "search_results": search_results,
    "search_results_type": search_results_type,
    "spectra": spectra,
    "spectra_type": spectra_type,
}
prediction_models = {
    "intensity": intensity_model,
    "irt": retention_time_model,
}
ptm_localization_options = {
    "unimod_id": 7,
    "possible_sites": ['R', 'N', 'Q'],
    "neutral_loss": True,
}

# Complete job description; the key order matches the order written to disk.
task_config_rescoring = {
    "type": "Rescoring",
    "tag": "",
    "inputs": rescoring_inputs,
    "output": output_directory,
    "models": prediction_models,
    "prediction_server": prediction_server,
    "ssl": True,
    "thermoExe": thermo_exe_directory,
    "numThreads": 1,
    "fdr_estimation_method": "percolator",
    "regressionMethod": "spline",
    "allFeatures": False,
    "pipeline": "cit",
    "ptm_localization": True,
    "ptmLocalizationOptions": ptm_localization_options,
}

# Serialise the configuration next to the notebook as JSON.
with open('./rescoring_config.json', 'w') as config_file:
    json.dump(task_config_rescoring, config_file)

In [None]:
# Start the Oktoberfest job using the JSON configuration file at ./rescoring_config.json.
run_job("./rescoring_config.json")

## 2- Site annotation

In [1]:
import pandas as pd
import psite_annotation as pa
import re

#### A. Extract SA scores from RESCORE TAB and combine with PROSIT-CIT PSMS Results

In [None]:
# Both input tables are read from the localize_mod percolator results folder.
rescore_tab_path = output_directory + "/results/percolator/localize_mod/rescore.tab"
percolator_psms_path = output_directory + "/results/percolator/localize_mod/rescore.percolator.psms.txt"

# Spectral angle per SpecId, taken from the rescore tab file.
combined_df = pd.read_csv(rescore_tab_path, sep='\t', usecols=["spectral_angle", "SpecId"])

# Prosit-Cit PSM results: strip the "._" / "_." flanks from the peptide
# sequence and rename the PSMId column to SpecId so it can serve as merge key.
df_prosit_psms = pd.read_csv(percolator_psms_path, sep='\t')
df_prosit_psms = df_prosit_psms.assign(
    peptide=df_prosit_psms['peptide'].str.replace(r'\._|_\.', '', regex=True)
).rename(columns={"PSMId": "SpecId"})

# Keep only PSMs at 1% FDR whose peptide carries R[UNIMOD:7].
within_fdr = df_prosit_psms['q-value'] <= 0.01
has_cit_arginine = df_prosit_psms['peptide'].str.contains(r"R\[UNIMOD:7\]")
df_prosit_psms = df_prosit_psms.loc[within_fdr & has_cit_arginine]

# Attach the spectral angle to every retained PSM (left join on SpecId).
merged_df = df_prosit_psms.merge(combined_df, on="SpecId", how="left")

#### B. Map to the cit site/sequence window using the corresponding fasta

In [53]:
Fasta_file_path = # Placeholder: add the path to the FASTA file that was used for the search

In [None]:
# Remove contaminant proteins, if any are present.
# The organism tag is the text after the last '_' in the part of proteinIds
# that precedes the first ':'; the most frequent tag is treated as the main
# organism and only rows whose proteinIds contain '_<tag>' are kept.
merged_df['Organism'] = merged_df['proteinIds'].apply(lambda x: x.split(':')[0].split('_')[-1])
organism_counts = merged_df['Organism'].value_counts()
if organism_counts.empty:
    # Fail with a clear message instead of an opaque IndexError on index[0].
    raise ValueError("merged_df contains no PSMs, so the main organism cannot be determined.")
main_organism = organism_counts.index[0]
# Literal (non-regex) match: the organism tag is data, not a pattern, so any
# regex metacharacters in it must not be interpreted.
merged_df = merged_df[merged_df['proteinIds'].str.contains('_' + main_organism, regex=False)]

# Select relevant columns. The explicit copy makes the column assignment below
# operate on an independent frame rather than on a selection of the previous
# one, which would otherwise trigger pandas' SettingWithCopyWarning.
merged_df = merged_df[['filename', 'peptide', 'proteinIds', 'spectral_angle']].copy()

# Reduce 'proteinIds' to the text between the last two '|' characters;
# values without two '|' characters are left unchanged.
merged_df['proteinIds'] = merged_df['proteinIds'].apply(lambda x: re.sub(r'^.*\|(.*?)\|.*$', r'\1', x))

# Pivot the DataFrame: one row per (peptide, protein), one column per file,
# cells hold the spectral angle (pivot_table aggregates duplicates with its
# default aggregation function).
dt = merged_df.pivot_table(index=['peptide', 'proteinIds'], columns='filename', values='spectral_angle').reset_index()
dt.rename(columns={'proteinIds': 'Proteins', 'peptide': 'Modified sequence'}, inplace=True)

# Strip every modification annotation except R[UNIMOD:7] from
# 'Modified sequence', so that only that one remains for the site mapping.
mod_replacements = {
    r"C\[UNIMOD:4\]": "C",
    r"M\[UNIMOD:35\]": "M",
    r"N\[UNIMOD:7\]": "N",
    r"Q\[UNIMOD:7\]": "Q"
}

# Apply all replacements
for pattern, replacement in mod_replacements.items():
    dt['Modified sequence'] = dt['Modified sequence'].str.replace(pattern, replacement, regex=True)

# Map the peptides onto the FASTA used for the search; R[UNIMOD:7] is passed
# to the annotation as the modified residue 'r'.
CustomFasta = Fasta_file_path
dt = pa.addPeptideAndPsitePositions(dt, CustomFasta, mod_dict={'R[UNIMOD:7]': 'r'})
# Drop rows for which no site position was annotated.
dt = dt[dt['Site positions'] != ""]

dt.to_csv(output_directory +"/Cit_rescore_site_mapping.txt", sep='\t', index=False)