In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import accuracy_score


# Apply seaborn's default theme so every figure drawn below shares one look.
sns.set_theme()


def separate(data, col, into, sep="_", **kwargs):
    """Split a delimited string column into several named columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to split; it is left unmodified.
    col : str
        Name of the string column to split.
    into : sequence of str
        Names given to the split pieces, in positional order.
    sep : str, default "_"
        Delimiter handed to ``Series.str.split``.
    **kwargs
        Extra keyword arguments forwarded to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        A new frame with one column per piece added to ``data`` (an existing
        column with the same name is overwritten).
    """
    pieces = data.get(col).str.split(sep, expand=True, **kwargs)
    # The expanded pieces are labelled 0, 1, 2, ...; map them to the given names.
    named_pieces = pieces.rename(columns=dict(enumerate(into)))
    return data.assign(**named_pieces)

## Load the data

As in the previous notebook, we first load the normalised counts from the DESeq2 analysis:

In [None]:
# Read the normalised count table; the first CSV column becomes the row index.
# The columns are treated as sample IDs further down, where the table is
# transposed so that each sample becomes a row.
df_norm = pd.read_csv("../Output/DESeq2/normalised_counts.csv", index_col=0)

and load the MIC values, creating additional columns for the level of 
antibiotic resistance (ABR) to each antibiotic (0 for low ABR and 1 for high ABR):

In [None]:
# MIC cut-off: values strictly above it are labelled 1 (high), all others 0 (low).
mic_threshold = 30
# `strain` is read as text so identifiers such as "083.2" keep their exact
# spelling and can be matched against the string strain labels when merging.
df_mic = pd.read_csv("../Data/mic.csv", dtype={"strain": str}).assign(
    cza_mic_level=lambda mic: mic["cza_mic"].gt(mic_threshold).astype(int),
    mem_mic_level=lambda mic: mic["mem_mic"].gt(mic_threshold).astype(int),
)

## Leave-one-out models

To test the generalizability of the models, we performed a leave-one-strain-out analysis, 
where each of the models was trained on the data from five strains (45 samples) and tested 
on the data from the sixth strain, which was not included in the training (9 samples).

First, we transform the dataframe to the correct format:

In [None]:
# Turn samples into rows, split each sample ID into strain / condition /
# replicate, then attach the MIC values and resistance levels.
# The left merge keeps every sample row even if no MIC entry matches it.
df_norm_rot = separate(
    df_norm.T.reset_index(names="sample"),
    "sample",
    into=["strain", "condition", "replicate"],
    sep="_",
).merge(df_mic, how="left", on=["strain", "condition"])
df_norm_rot.head()

As in the previous notebook, we fit the models to the data, but this time each model leaves out the data from one strain:

In [None]:
# Strain identifiers; each one is held out of exactly one model.
strains = ["083.2", "090.3", "095.3", "678.3", "804.2", "816.3"]
# Annotation and label columns that must not be used as model features.
meta_cols = [
    "sample", "strain", "condition", "replicate",
    "cza_mic", "mem_mic", "cza_mic_level", "mem_mic_level",
]

models = {}   # held-out strain -> fitted PLS model
factors = {}  # held-out strain -> (x_scores, y_scores) of the training samples

for strain in strains:
    model = PLSRegression(n_components=2)
    models[strain] = model

    # Train on every sample that does NOT belong to the held-out strain.
    train_data = df_norm_rot.loc[df_norm_rot["strain"] != strain]
    x_train = train_data.drop(columns=meta_cols).to_numpy()
    y_train = train_data["mem_mic_level"].to_numpy()
    model.fit(x_train, y_train)

    # Passing the response as well makes transform return both score matrices.
    factors[strain] = model.transform(x_train, y_train)

We can combine all the factor values from each model in one dataframe:

In [None]:
# One frame per model: the X scores of its training samples next to their
# metadata, tagged with the strain that model left out. The metadata rows are
# re-selected with the same filter used for training and re-indexed from 0 so
# they line up positionally with the score rows.
df_pls = pd.concat(
    [
        pd.DataFrame(factors[strain][0], columns=["Factor1", "Factor2"])
        .join(
            df_norm_rot.loc[df_norm_rot["strain"] != strain, meta_cols]
            .reset_index(drop=True)
        )
        .assign(strain_left=strain)
        for strain in factors
    ],
    ignore_index=True,
)
df_pls

and plot those below:

In [None]:
# One panel per left-out strain; colour encodes the resistance level and the
# marker style encodes the condition.
g = sns.FacetGrid(
    data=df_pls.replace({"mem_mic_level": {0: "low", 1: "high"}}),
    col="strain_left",
    col_wrap=3,
)
g.map_dataframe(
    sns.scatterplot,
    x="Factor1",
    y="Factor2",
    hue="mem_mic_level",
    style="condition",
)
# Same fixed x-range on every panel so the models can be compared directly.
g.set(xlim=(-55, 55))
g.set_titles(col_template="{col_name}");

We calculate the accuracy scores for each model:

In [None]:
# Accuracy of each model on the one strain it never saw during training.
scores = {}
for strain, model in models.items():
    data = df_norm_rot.query("strain == @strain")
    x = data.drop(columns=meta_cols).to_numpy()
    y = data.mem_mic_level.to_numpy()
    # PLS regression returns continuous values, so threshold them at 0.5 (the
    # midpoint of the 0/1 labels) to obtain class predictions. `ravel` makes
    # this work whether predict returns shape (n,) or (n, 1), and the
    # vectorised comparison avoids reusing the name `y` as a loop variable.
    y_pred = (model.predict(x).ravel() > 0.5).astype(int)
    scores[strain] = accuracy_score(y, y_pred)

We convert the above dictionary to a dataframe:

In [None]:
# Tidy table with one row per model: the strain it left out and its accuracy.
df_accuracy = (
    pd.Series(scores, name="accuracy")
    .rename_axis("strain_left")
    .reset_index()
)
df_accuracy

In [None]:
# Distribution of held-out accuracies (box) with one point per left-out strain.
fig, ax = plt.subplots()
sns.boxplot(data=df_accuracy, y="accuracy", ax=ax)
sns.swarmplot(data=df_accuracy, y="accuracy", hue="strain_left", ax=ax)
# Move the strain legend towards the right-hand edge of the axes.
sns.move_legend(ax, loc="center left", bbox_to_anchor=(0.9, 0.5))

**Tasks:** 
- Can you determine what the top features have in common for these models?
- which features are in common between models?
- try to fit the models to the other antibiotic (CZA, using `cza_mic_level` as the response)