In [2]:
import json
from pathlib import Path

import pandas as pd
from koinapy import Koina
from pyteomics import fasta, parser

In [3]:
# Read the list of model names, one per line.
# Blank / whitespace-only lines are skipped so they cannot end up as an
# empty model name in `models`.
with open("models.txt") as f:
    models = [line.strip() for line in f if line.strip()]

In [4]:
# --- Sampling configuration ---------------------------------------------
# Inclusive peptide-length window. Set both bounds to 7, 15 or 30 for a
# fixed-length set, or to 7 and 30 for the full length range.
MIN_PEPTIDE_LENGTH = 15
MAX_PEPTIDE_LENGTH = 15
N_SAMPLES = 100  # number of peptides written per model
SEED = 42        # fixed so Restart & Run All yields the same sample

# Digest every FASTA entry with the trypsin rule and no missed cleavages.
peptides = []
with fasta.read("20231117_UP000005640_9606.fasta") as f:
    for description, sequence in f:
        peptides.extend(
            parser.cleave(sequence, parser.expasy_rules['trypsin'], missed_cleavages=0)
        )

# De-duplicate; sorting makes the row order (and therefore the seeded sample
# below) independent of Python's per-process string hashing.
inputs = pd.DataFrame({'peptide_sequences': sorted(set(peptides))})
inputs['precursor_charges'] = 2
inputs['collision_energies'] = 25.0
inputs['instrument_types'] = "QE"
inputs['fragmentation_types'] = "HCD"

seq_lengths = inputs["peptide_sequences"].str.len()

# Drop peptides containing the letters X or U.
# NOTE(review): presumably these residues are not accepted by the target
# models -- confirm.
has_excluded_residue = inputs["peptide_sequences"].str.contains("[XU]")
keep_mask = seq_lengths.between(MIN_PEPTIDE_LENGTH, MAX_PEPTIDE_LENGTH) & ~has_excluded_residue
inputs = inputs[keep_mask]

# Cap the sample size at the number of available peptides so a narrow length
# window cannot raise "Cannot take a larger sample than population".
inputs = inputs.sample(n=min(N_SAMPLES, len(inputs)), random_state=SEED)

In [5]:
# Write one JSON payload per model to inputs/<model_name>.json.
# The output directory is created up front so the notebook also runs on a
# fresh checkout where it does not exist yet.
output_dir = Path("inputs")
output_dir.mkdir(parents=True, exist_ok=True)

for model_name in models:
    # One (n_peptides, 1) column array per input field the model declares.
    dict_inputs = {
        input_field: inputs[input_field].to_numpy().reshape(-1, 1)
        for input_field in Koina(model_name).model_inputs.keys()
    }

    # Models with "TMT" in their name get the UNIMOD:737 prefix prepended to
    # every peptide sequence.
    if 'TMT' in model_name:
        dict_inputs["peptide_sequences"] = "[UNIMOD:737]-" + dict_inputs["peptide_sequences"]

    # NOTE(review): `_Koina__slice_dict` is a name-mangled private helper of
    # koinapy and may change between releases; it is called here with a
    # batch size of 1 -- confirm it still exists when upgrading koinapy.
    # Each batch is rebuilt as a new dict of plain Python lists instead of
    # being mutated while it is iterated.
    jsonPayload = {
        "data": [
            {name: values.flatten().tolist() for name, values in batch.items()}
            for batch in Koina._Koina__slice_dict(dict_inputs, 1)
        ]
    }

    with open(output_dir / f'{model_name}.json', 'w') as file:
        json.dump(jsonPayload, file, indent=2)

I0000 00:00:1742311354.435337  229165 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
