In [1]:
from importlib import reload
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression statement in a cell instead of
# only the last one, so a single cell can display several results.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import logging

# Root-logger setup: records from DEBUG upwards, each prefixed with the
# timestamp, the logger name and the level name.
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)

# Load data

In [3]:
# Input locations: a tab-separated evidence table (-> maxquant_ss) and a
# pickled object (-> maxquant_offline).
maxquant_ss_path = "/cmnfs/proj/ORIGINS/data/brain/txt_ssDDA_LFQ_noMBR/evidence_fresh1_RT_transfer.txt"
maxquant_offline_path = (
    "/cmnfs/proj/ORIGINS/data/brain/txt_3x13Brainregions_MBR_LFQ_iBAQ/evidence_freshfrozen_modseq_charge_1_FilteredByClosestRT_transfer_RT_pred_filtered_withIso.pkl"
)

# read_table uses a tab separator by default; low_memory=False makes pandas
# process the file in one piece rather than in internal chunks.
maxquant_ss = pd.read_table(maxquant_ss_path, low_memory=False)

# NOTE(review): unpickling can execute arbitrary code; only load pickles from
# a trusted location.
maxquant_offline = pd.read_pickle(maxquant_offline_path)

In [4]:
# Collapse repeated (Modified sequence, Charge) entries in maxquant_ss: order
# the rows by descending Intensity, then keep the first row of each key, i.e.
# the most intense one.
precursor_key = ["Modified sequence", "Charge"]

logging.info("Entries before dropping duplicates: %d", len(maxquant_ss))
maxquant_ss = maxquant_ss.sort_values(by="Intensity", ascending=False)
maxquant_ss = maxquant_ss.drop_duplicates(subset=precursor_key, keep="first")
logging.info("Entries after dropping duplicates: %d", len(maxquant_ss))

2024-06-12 09:20:28,500 - root - INFO - Entries before dropping duplicates: 37448
2024-06-12 09:20:28,560 - root - INFO - Entries after dropping duplicates: 35151


In [5]:
# Attach the retention-time columns of maxquant_ss to every row of
# maxquant_offline, matching on (Modified sequence, Charge).
#   - how="left" keeps all maxquant_offline rows; rows without a match get NaN
#     in the columns coming from maxquant_ss.
#   - suffixes=("", "_ss") leaves maxquant_offline's column names untouched
#     and appends "_ss" to overlapping column names coming from maxquant_ss.
#   - validate="many_to_one" makes pandas raise a MergeError if maxquant_ss
#     contains repeated (Modified sequence, Charge) keys; without it such
#     repeats would silently duplicate maxquant_offline rows.
maxquant_offline_expRTrange = pd.merge(
    maxquant_offline,
    maxquant_ss[
        [
            "Modified sequence",
            "Charge",
            "Calibrated retention time",
            "Calibrated retention time start",
            "Calibrated retention time finish",
        ]
    ],
    on=["Modified sequence", "Charge"],
    suffixes=("", "_ss"),
    how="left",
    validate="many_to_one",
)

In [6]:
# Preview the merged frame; rows without a match in maxquant_ss show NaN in
# the three "_ss" retention-time columns.
# (numpy is already imported in the notebook's import cell, so it is not
# re-imported here.)
maxquant_offline_expRTrange.head()

Unnamed: 0,Sequence,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Acetyl (Protein N-term),Oxidation (M),Missed cleavages,Proteins,...,score,seq,tr,predicted_RT,RT_diff,IsoMZ,IsoAbundance,Calibrated retention time_ss,Calibrated retention time start_ss,Calibrated retention time finish_ss
0,AAAAAAAAAAGAAGGR,16,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_,,,1,0,0,Q86U42,...,81.099,AAAAAAAAAAGAAGGR,14.533,4.788956,9.744044,"[620.823823525445, 621.322340973045, 621.32550...","[0.5139849492138241, 0.035696594727316275, 0.2...",,,
1,AAAAAAAAEQQSSNGPVKK,19,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAEQQSSNGPVKK_,,,1,0,1,Q16585,...,100.07,AAAAAAAAEQQSSNGPVKK,9.2634,2.275833,6.987567,"[906.46649479848, 906.96501224608, 906.9681722...","[0.3737870030962582, 0.03279126900327132, 0.30...",,,
2,AAAAAAGAASGLPGPVAQGLK,21,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_,,,1,0,0,Q96P70,...,112.95,AAAAAAGAASGLPGPVAQGLK,18.447,12.156884,6.290116,"[895.992148446755, 896.4906658943551, 896.4938...","[0.3655935591351674, 0.03073612810857547, 0.31...",,,
3,AAAAAAGAGPEMVR,14,"Acetyl (Protein N-term),Oxidation (M)",_(Acetyl (Protein N-term))AAAAAAGAGPEM(Oxidati...,AAAAAAGAGPEM(1)VR,AAAAAAGAGPEM(140)VR,1,1,0,P28482,...,144.47,AAAAAAGAGPEMVR,10.368,3.991367,6.376633,"[650.819892482935, 651.3184099305349, 651.3215...","[0.47466145737226756, 0.02949549407920674, 0.2...",,,
4,AAAAAAGAGPEMVR,14,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAGPEMVR_,,,1,0,0,P28482,...,178.95,AAAAAAGAGPEMVR,13.092,6.385735,6.706265,"[642.822435172835, 643.320952620435, 643.32411...","[0.47581883447573764, 0.02956741356829228, 0.2...",12.552,12.493,12.631


In [7]:
# Snapshot before filling, for comparison with the frame shown at the end of
# this cell.
maxquant_offline_expRTrange.head()

# Half-width of the fallback window placed around predicted_RT when a row has
# no retention-time range from maxquant_ss (same units as predicted_RT).
RT_FALLBACK_HALF_WIDTH = 0.9

predicted_rt = maxquant_offline_expRTrange["predicted_RT"]

# Fallback per "_ss" column. predicted_RT is used as-is: a missing prediction
# stays NaN in all three columns instead of being turned into an artificial
# [-0.9, 0.9] window around 0 while the centre column remains NaN.
rt_fallbacks = {
    "Calibrated retention time start_ss": predicted_rt - RT_FALLBACK_HALF_WIDTH,
    "Calibrated retention time finish_ss": predicted_rt + RT_FALLBACK_HALF_WIDTH,
    "Calibrated retention time_ss": predicted_rt,
}

# Only NaN entries are replaced; values merged in from maxquant_ss are kept.
for rt_column, fallback in rt_fallbacks.items():
    maxquant_offline_expRTrange[rt_column] = maxquant_offline_expRTrange[
        rt_column
    ].fillna(fallback)

# Surface rows that still lack a retention-time value after the fallback.
n_unresolved = int(
    maxquant_offline_expRTrange[list(rt_fallbacks)].isna().any(axis=1).sum()
)
if n_unresolved:
    logging.warning(
        "%d entries have no retention time after falling back to predicted_RT",
        n_unresolved,
    )

maxquant_offline_expRTrange.head()

Unnamed: 0,Sequence,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Acetyl (Protein N-term),Oxidation (M),Missed cleavages,Proteins,...,score,seq,tr,predicted_RT,RT_diff,IsoMZ,IsoAbundance,Calibrated retention time_ss,Calibrated retention time start_ss,Calibrated retention time finish_ss
0,AAAAAAAAAAGAAGGR,16,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_,,,1,0,0,Q86U42,...,81.099,AAAAAAAAAAGAAGGR,14.533,4.788956,9.744044,"[620.823823525445, 621.322340973045, 621.32550...","[0.5139849492138241, 0.035696594727316275, 0.2...",,,
1,AAAAAAAAEQQSSNGPVKK,19,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAEQQSSNGPVKK_,,,1,0,1,Q16585,...,100.07,AAAAAAAAEQQSSNGPVKK,9.2634,2.275833,6.987567,"[906.46649479848, 906.96501224608, 906.9681722...","[0.3737870030962582, 0.03279126900327132, 0.30...",,,
2,AAAAAAGAASGLPGPVAQGLK,21,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_,,,1,0,0,Q96P70,...,112.95,AAAAAAGAASGLPGPVAQGLK,18.447,12.156884,6.290116,"[895.992148446755, 896.4906658943551, 896.4938...","[0.3655935591351674, 0.03073612810857547, 0.31...",,,
3,AAAAAAGAGPEMVR,14,"Acetyl (Protein N-term),Oxidation (M)",_(Acetyl (Protein N-term))AAAAAAGAGPEM(Oxidati...,AAAAAAGAGPEM(1)VR,AAAAAAGAGPEM(140)VR,1,1,0,P28482,...,144.47,AAAAAAGAGPEMVR,10.368,3.991367,6.376633,"[650.819892482935, 651.3184099305349, 651.3215...","[0.47466145737226756, 0.02949549407920674, 0.2...",,,
4,AAAAAAGAGPEMVR,14,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAGPEMVR_,,,1,0,0,P28482,...,178.95,AAAAAAGAGPEMVR,13.092,6.385735,6.706265,"[642.822435172835, 643.320952620435, 643.32411...","[0.47581883447573764, 0.02956741356829228, 0.2...",12.552,12.493,12.631


Unnamed: 0,Sequence,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Acetyl (Protein N-term),Oxidation (M),Missed cleavages,Proteins,...,score,seq,tr,predicted_RT,RT_diff,IsoMZ,IsoAbundance,Calibrated retention time_ss,Calibrated retention time start_ss,Calibrated retention time finish_ss
0,AAAAAAAAAAGAAGGR,16,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_,,,1,0,0,Q86U42,...,81.099,AAAAAAAAAAGAAGGR,14.533,4.788956,9.744044,"[620.823823525445, 621.322340973045, 621.32550...","[0.5139849492138241, 0.035696594727316275, 0.2...",4.788956,3.888956,5.688956
1,AAAAAAAAEQQSSNGPVKK,19,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAAEQQSSNGPVKK_,,,1,0,1,Q16585,...,100.07,AAAAAAAAEQQSSNGPVKK,9.2634,2.275833,6.987567,"[906.46649479848, 906.96501224608, 906.9681722...","[0.3737870030962582, 0.03279126900327132, 0.30...",2.275833,1.375833,3.175833
2,AAAAAAGAASGLPGPVAQGLK,21,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_,,,1,0,0,Q96P70,...,112.95,AAAAAAGAASGLPGPVAQGLK,18.447,12.156884,6.290116,"[895.992148446755, 896.4906658943551, 896.4938...","[0.3655935591351674, 0.03073612810857547, 0.31...",12.156884,11.256884,13.056884
3,AAAAAAGAGPEMVR,14,"Acetyl (Protein N-term),Oxidation (M)",_(Acetyl (Protein N-term))AAAAAAGAGPEM(Oxidati...,AAAAAAGAGPEM(1)VR,AAAAAAGAGPEM(140)VR,1,1,0,P28482,...,144.47,AAAAAAGAGPEMVR,10.368,3.991367,6.376633,"[650.819892482935, 651.3184099305349, 651.3215...","[0.47466145737226756, 0.02949549407920674, 0.2...",3.991367,3.091367,4.891367
4,AAAAAAGAGPEMVR,14,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAGAGPEMVR_,,,1,0,0,P28482,...,178.95,AAAAAAGAGPEMVR,13.092,6.385735,6.706265,"[642.822435172835, 643.320952620435, 643.32411...","[0.47581883447573764, 0.02956741356829228, 0.2...",12.552,12.493,12.631


In [8]:
# Write the merged frame, including its "_ss" retention-time columns, to disk
# as a pickle.
expRTrange_pkl_path = "/cmnfs/proj/ORIGINS/data/brain/txt_3x13Brainregions_MBR_LFQ_iBAQ/evidence_freshfrozen_modseq_charge_1_FilteredByClosestRT_transfer_RT_pred_filtered_withIso_expRTrange.pkl"
maxquant_offline_expRTrange.to_pickle(expRTrange_pkl_path)