In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import vizta

# Output directory for the figures saved by the plotting cells.
Path("figures").mkdir(exist_ok=True)
# Apply the plotting theme; the returned palette is reused for reference lines.
pal = vizta.mpl.set_theme(context="poster", style="talusbio")

# Model predictions (presumably the held-out test split, going by the file
# name -- confirm).
base_path = Path("../data/colab/ion-mobility-prediction")
pred_df = pd.read_parquet(base_path / "test.parquet")

# Reference table: derive a `Seq` column from `Modified_sequence` by rewriting
# the "(ac)" / "(ox)" tokens into bracketed modification names and dropping
# the "_" terminators, then expose its prediction column as `orig_pred`.
# The "_(ac)" replacement has to run before the bare "_" is stripped.
orig_df = (
    pd.read_csv("../data/ccs/meier_ccs.csv", index_col=0)
    .assign(
        Seq=lambda frame: (
            frame["Modified_sequence"]
            .str.replace("_(ac)", "[Acetyl]-", regex=False)
            .str.replace("M(ox)", "M[Oxidation]", regex=False)
            .str.replace("_", "", regex=False)
        )
    )
    .rename(columns={"label Prediction 0": "orig_pred"})
)

# No `on=` is passed, so pandas joins on every column name the two frames
# share. The right join keeps every row of the reference selection; rows
# without a match in `pred_df` get NaN in the `pred_df`-only columns.
pred_df = pd.merge(pred_df, orig_df[["Seq", "Charge", "orig_pred"]], how="right")
pred_df

In [None]:
# Measured vs. predicted CCS as a log-scaled hexbin density, with a dashed
# identity line for reference. Saved to figures/ccs_corr.png.
fig, axs = plt.subplots(1, 1, figsize=(13.02, 10.43))
ax = axs
# NOTE(review): the seaborn palette context changes the line/patch color
# cycle; hexbin coloring is driven by its `cmap` argument / the default image
# colormap. If viridis is required here, pass cmap="viridis" -- confirm.
with sns.color_palette("viridis"):
    im = ax.hexbin(
        pred_df["CCS"],
        pred_df["pred"],
        mincnt=1,  # leave empty hexagons undrawn
        gridsize=200,
        bins="log",  # color encodes log-scaled counts
    )
    ax.axis("equal")

    # Span the identity line across the combined extent of both axes.
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, zorder=0, linestyle="dashed", color=pal[0])
    # Raw strings: "\A" is not a valid Python escape sequence, so a plain
    # string literal triggers an invalid-escape warning on recent Pythons.
    # The text handed to matplotlib's mathtext is unchanged.
    ax.set_xlabel(r"Measured CCS ($\AA^2$)")
    ax.set_ylabel(r"Predicted CCS ($\AA^2$)")
    ax.grid(False)

# Save through the figure handle rather than pyplot's implicit current figure.
fig.savefig("figures/ccs_corr.png", dpi=300, transparent=True)
    
# Two-panel error summary, saved to figures/ccs.png:
#   left  - histogram of signed errors for this model's predictions
#   right - cumulative count of peptides by absolute error, for both models
fig, axs = plt.subplots(1, 2, figsize=(26, 10.43))

# --- Left panel ---------------------------------------------------------
ax = axs[0]
# Signed errors (measured - predicted); `err` is reused by later cells.
err = pred_df["CCS"] - pred_df["pred"]
orig_err = pred_df["CCS"] - pred_df["orig_pred"]
sns.histplot(err, ax=ax)
# Raw strings throughout: "\A" is an invalid escape in a normal literal.
ax.set_xlabel(r"Absolute Error ($\AA^2$)")
ax.set_ylabel("Number of Peptides")
ax.set_xlim(-200, 200)
# Dashed guides at +/-50, the threshold used in the summary cell.
ax.axvline(-50, linestyle="dashed", color=pal[0])
ax.axvline(50, linestyle="dashed", color=pal[0])

ax.grid(False)

# --- Right panel --------------------------------------------------------
ax = axs[1]
# NOTE(review): the "rmse" column and the "Maximum RMSE" axis label hold
# per-peptide absolute errors (|measured - predicted|), not an aggregated
# root-mean-square error. Names/labels are kept as-is; consider renaming.
pred_rmse = pd.DataFrame({"rmse": err.abs(), "Model": "Depthcharge\nTransformer"})
orig_rmse = pd.DataFrame({"rmse": orig_err.abs(), "Model": "Meier et al, 2021\nLSTM"})
# Both frames carry pred_df's index, so a plain concat would produce
# duplicate index labels; reset it so the long-form frame has a unique index.
rmse = pd.concat([pred_rmse, orig_rmse], ignore_index=True)
sns.ecdfplot(data=rmse, x="rmse", hue="Model", ax=ax, stat="count")
ax.set_xlabel(r"Maximum RMSE ($\AA^2$)")
ax.set_ylabel("Number of Peptides")
ax.grid(False)

fig.tight_layout()
fig.savefig("figures/ccs.png", dpi=300, transparent=True)

In [None]:
# Score only peptides that have an error value. `err` is NaN wherever the
# measured CCS or the prediction is missing, and `NaN < 50` evaluates to
# False, so keeping those rows would count them as misses and inflate the
# denominator. With no missing values the output is unchanged.
valid_err = err.dropna()
print(len(valid_err))
# Fraction of scored peptides whose error is within +/-50; the mean of the
# boolean mask is the same quantity as sum / len.
print((valid_err.abs() < 50).mean())

In [None]:
 # Summary statistics of the CCS column, shown as the cell's output.
 pred_df.loc[:, "CCS"].describe()