# File for E. coli sepsis

In [None]:
import pandas as pd

# Load the E. coli input table; its "UniProt" column is used later as the entity column.
ecoli_data = pd.read_csv("../BINN_elin/data/ecoli_data.csv", sep=",")
# ecoli_data.head()
ecoli_data

In [None]:
# Load the pathway table; the Network is later told to read its "source" and "target" columns.
pathways = pd.read_csv("../BINN_elin/data/pathway.csv", sep=",")
pathways

In [None]:
# Load the translation table that is later passed to Network as `mapping`.
# NOTE(review): this path is relative to the working directory, unlike the
# "../BINN_elin/data/" inputs loaded earlier — confirm both locations resolve.
translation = pd.read_csv("data/translation.csv", sep=",")
translation
#translation.shape

In [None]:
from  binn import Network
# Assemble the network from the input table, the pathway edges and the translation mapping.
network_ecoli = Network(
    input_data=ecoli_data,
    pathways=pathways,
    mapping=translation,
    input_data_column="UniProt",  # column of the input data that holds the entities
    source_column="source",  # edge columns as named in the pathways file
    target_column="target",
)

In [None]:
from binn import BINN

# Hyperparameters for the model, kept together so they are easy to tweak.
binn_settings = {
    "n_layers": 2,
    "dropout": 0.2,
    "validate": False,
    "device": "cpu",
    "learning_rate": 0.001,
}

# Build the BINN on top of the E. coli network and show its layers.
binn_ecoli = BINN(network=network_ecoli, **binn_settings)
binn_ecoli.layers

In [None]:
# Display the model's trainable_params attribute.
binn_ecoli.trainable_params

In [None]:
# Fetch the per-layer names and show the first entry of the first layer.
layers = binn_ecoli.layer_names
layers[0][0]

In [None]:
# Load the design matrix that is later handed to generate_data together with the protein matrix.
design_matrix_ecoli = pd.read_csv("data/ecoli_design_matrix.csv", sep=',')
design_matrix_ecoli

In [None]:
from docs.util_for_examples import fit_data_matrix_to_network_input, generate_data
import torch
from lightning.pytorch import Trainer



In [None]:
# Sanity check: print the first ten network inputs and the head of the raw input table.
print(network_ecoli.inputs[:10])
print(ecoli_data.head())


In [None]:
# Fit the raw table to the network's input features, keyed on the "UniProt" column.
# NOTE(review): exact reshaping/filtering is defined in docs.util_for_examples — not visible here.
protein_matrix_ecoli = fit_data_matrix_to_network_input(ecoli_data, features=network_ecoli.inputs, feature_column="UniProt")

protein_matrix_ecoli.head()

In [None]:


# Derive features X and labels y from the protein matrix and the design matrix.
X, y = generate_data(protein_matrix_ecoli, design_matrix=design_matrix_ecoli)

# Wrap both arrays as tensors on the model's device and batch them in a shuffled loader.
model_device = binn_ecoli.device
features = torch.tensor(X, dtype=torch.float32, device=model_device)
labels = torch.tensor(y, dtype=torch.int16, device=model_device)
dataset = torch.utils.data.TensorDataset(features, labels)
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=8)

# A Lightning Trainer is prepared as one way to train; the fit call is left disabled.
trainer = Trainer(log_every_n_steps=10, max_epochs=10)
# trainer.fit(binn_ecoli, dataloader)

In [None]:
from binn import Network, BINN
from binn.explainer import BINNExplainer

# Wrap the model in an explainer object.
explainer_ecoli = BINNExplainer(binn_ecoli)

In [None]:
# Both the test set and the background set are built from the full feature matrix X.
test_data = torch.Tensor(X)
background_data = torch.Tensor(X)

# n_iterations is reused further down when ranking the per-iteration value columns.
n_iterations = 20
n_epochs = 30

# Returns an importance table and a metrics mapping (iterated with .items() in the plotting cell).
# NOTE(review): presumably retrains the model n_iterations times for n_epochs each — confirm in the BINN docs.
importance_df_ecoli, metrics_ecoli = explainer_ecoli.explain_average(
    test_data, background_data, n_iterations, n_epochs, dataloader, fast_train=True
)
metrics_ecoli

In [None]:
import torch.nn.functional as F

# You can also train with a standard PyTorch train loop

# configure_optimizers() is indexed [0][0] to get at the optimizer object itself.
optimizer = binn_ecoli.configure_optimizers()[0][0]

num_epochs = 30

for epoch in range(num_epochs):
    binn_ecoli.train()
    total_loss = 0.0
    total_accuracy = 0.0

    for inputs, targets in dataloader:
        inputs = inputs.to(binn_ecoli.device)
        # cross_entropy needs int64 class indices. .long() keeps the tensor on
        # its current device, whereas .type(torch.LongTensor) always produces a
        # CPU tensor and would break as soon as the model runs on another device.
        targets = targets.to(binn_ecoli.device).long()
        optimizer.zero_grad()
        outputs = binn_ecoli(inputs).to(binn_ecoli.device)
        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Fraction of correct predictions in this batch, stored as a plain float
        # so the running total (and the printed average) is not a 0-d tensor.
        total_accuracy += (torch.argmax(outputs, dim=1) == targets).float().mean().item()

    # Averages are taken over batches, not over samples.
    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    print(f'Epoch {epoch}, Average Accuracy {avg_accuracy}, Average Loss: {avg_loss}')


In [None]:
from binn import BINNExplainer

# Re-create the explainer around the model after the manual training loop.
explainer_ecoli = BINNExplainer(binn_ecoli)

In [None]:
# Both the test set and the background set are built from the full feature matrix X.
test_data = torch.Tensor(X)
background_data = torch.Tensor(X)


# NOTE(review): this rebinds importance_df_ecoli, replacing the table returned by
# explain_average earlier — confirm which of the two the later cells should use.
importance_df_ecoli = explainer_ecoli.explain(test_data, background_data)

In [None]:
# Display the importance table.
importance_df_ecoli

# Start working from here!!

In [None]:
# Persist the importance table as a tab-separated file (despite the .csv extension), without the index.
importance_df_ecoli.to_csv('data/importance_df_ecoli_new.csv', sep = '\t', index=False)

In [None]:
import pandas as pd
# Reload the saved importance table (tab-separated) so the analysis can restart from this cell.
importance_df_ecoli = pd.read_csv('data/importance_df_ecoli_new.csv', sep = '\t')
#importance_df_ecoli = pd.read_csv('data/importance_df_ecoli.csv', sep = '\t')
importance_df_ecoli.shape

In [None]:
from binn import ImportanceNetwork

# Build the importance network from the table, using the "fan" normalisation method.
IG_ecoli = ImportanceNetwork(importance_df_ecoli, norm_method="fan")

#IG_ecoli.importance_df.sort_values("value", ascending=False).head()

In [None]:
# Draw the complete Sankey diagram for the unmapped importance network.
IG_ecoli.plot_complete_sankey(
    multiclass=False, node_cmap="coolwarm", edge_cmap="coolwarm"
)

In [None]:
# Read the id/name table and turn it into an id -> name lookup.
source_proteome = pd.read_csv("data/source.csv", sep=",")

source_mapping = source_proteome.set_index("id")["name"].to_dict()

# Make sure the "root" node maps to itself.
source_mapping["root"] = "root"


# source_mapping

In [None]:
# Build a display copy of the importance table with node ids replaced by names.
plot_df = importance_df_ecoli.copy()

# Flag rows whose source and target carry the same name. The comparison is
# vectorised, so it also works on an empty table (a row-wise apply would not
# return a Series there).
plot_df["copy"] = plot_df["source name"] == plot_df["target name"]

# Keep only the unflagged rows. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
plot_df = plot_df[~plot_df["copy"]].copy()

# Ids that are missing from source_mapping become NaN after .map().
plot_df["source name"] = plot_df["source name"].map(source_mapping)
plot_df["target name"] = plot_df["target name"].map(source_mapping)
plot_df

In [None]:
from binn import ImportanceNetwork
# Importance network built from the name-mapped table, using the "fan" normalisation method.
IG_plot = ImportanceNetwork(plot_df, norm_method="fan")

In [None]:
# Display the importance table held by the name-mapped network.
IG_plot.importance_df

In [None]:
# Draw the complete Sankey diagram for the name-mapped network and save it to ecoli_sankey.png.
IG_plot.plot_complete_sankey(
    multiclass=False, node_cmap="coolwarm", edge_cmap="coolwarm", savename='ecoli_sankey.png'
)

In [None]:
# Draw the Sankey diagram for the subgraph around a single query node.
# NOTE(review): confirm "A1E959_0" is a node name in IG_plot — the names in
# plot_df were remapped through source_mapping before IG_plot was built.
IG_plot.plot_subgraph_sankey(query_node="A1E959_0")

In [None]:
#IG_sps.importance_df.to_csv('data/imp_df_sps.csv', sep = '\t', index=False)

In [None]:
# Rank stability of every source node across the n_iterations importance columns.
# NOTE(review): needs columns value_0 .. value_{n_iterations-1} in importance_df_ecoli,
# plus n_iterations and binn_ecoli in scope — confirm for a table reloaded from CSV.
importance_df_copy = importance_df_ecoli.groupby(
    ["source name", "source layer", "target layer"], as_index=False
).mean(numeric_only=True)

mean_ranks, std_ranks, source_layer, sources = [], [], [], []

for layer in range(binn_ecoli.n_layers):
    in_layer = importance_df_copy["source layer"] == layer
    layer_df = importance_df_copy.loc[in_layer].copy()
    n_nodes = len(layer_df.index)

    for i in range(n_iterations):
        # Order by this iteration's value; the position in that order is the rank
        # (0 = highest value).
        layer_df = layer_df.sort_values(f"value_{i}", ascending=False)
        layer_df[f"rank_{i}"] = range(n_nodes)

    rank_cols = [c for c in layer_df.columns if c.startswith("rank")]
    ranks = layer_df[rank_cols]

    # Mean and standard deviation of the ranks, scaled by the layer size.
    mean_ranks.extend((ranks.mean(axis=1) / n_nodes).tolist())
    sources.extend(layer_df["source name"].tolist())
    std_ranks.extend((ranks.std(axis=1) / n_nodes).tolist())
    source_layer.extend(layer_df["source layer"].tolist())

plot_df_robust = pd.DataFrame(
    {
        "mean": mean_ranks,
        "std": std_ranks,
        "source layer": source_layer,
        "source": sources,
    }
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt



# Joint plot of rank spread (x) against scaled mean rank (y), coloured by source layer.
sns.jointplot(
    data=plot_df_robust,
    x="std",
    y="mean",
    hue="source layer",
    palette="coolwarm",
)

# plt.savefig("ecoli_robustness.png", dpi=500)


# Biomarkers

In [None]:
# Take the layer-0 rows, average them per source name and keep the ten names with the highest value.
first_layer_rows = IG_plot.importance_df.loc[IG_plot.importance_df["source layer"] == 0]
per_source_mean = first_layer_rows.groupby("source name").mean(numeric_only=True)
ranked_sources = per_source_mean.sort_values("value", ascending=False)
biomarkers = ranked_sources["source"].iloc[:10].index.tolist()

In [None]:
# Display the list of selected biomarker names.
#biomarkers.tofile('data/ecoli_biomarkers.csv', sep=',', format='%s')
biomarkers

In [None]:
import csv

# Store the biomarker names as a single comma-separated row.
with open('data/ecoli_biomarkers_new.csv', 'w', newline='') as out_file:
    csv.writer(out_file, delimiter=',').writerow(biomarkers)


# Accuracy and loss

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

# Stack the per-model training metrics into one long table, tagging every row
# with the key of the model it came from. New dicts are built for this, so the
# entries of metrics_ecoli themselves are left unmodified.
metric_frames = [
    pd.DataFrame({**metric, "model": [model] * len(metric["accuracies"])})
    for model, metric in metrics_ecoli.items()
]
all_metrics = pd.concat(metric_frames)
print(all_metrics)

# Loss on the top axis, accuracy on the bottom one, sharing the epoch axis.
fig, axs = plt.subplots(2, 1, figsize=(4, 6), sharex=True)
panels = (("losses", "Loss"), ("accuracies", "Accuracy"))
for ax, (column, label) in zip(axs.ravel(), panels):
    sns.lineplot(
        data=all_metrics,
        x="epoch",
        y=column,
        hue="model",
        ax=ax,
        palette="coolwarm",
        alpha=0.5,
    )
    sns.despine(ax=ax)
    # One line per model makes the legend unreadable, so drop it.
    ax.legend().remove()
    ax.set_ylabel(label)

axs[1].set_xlabel("Epoch")

# Run tight_layout only after the final labels are set, so the saved figure
# is laid out around the text that is actually drawn.
plt.tight_layout()
plt.savefig("ecoli_acc_loss.png", dpi=300)