# Notebook for severe vs. non-severe PNC

In [None]:
import pandas as pd

# Input tables used by the cells below (comma-separated, pandas' default delimiter):
#   pnc_data    - passed to Network as input_data
#   translation - passed to Network as mapping
#   pathways    - passed to Network as pathways
pnc_data = pd.read_csv("../BINN_elin/data/pnc_data.csv")
translation = pd.read_csv("../BINN_elin/data/translation.csv")
pathways = pd.read_csv("../BINN_elin/data/pathway.csv")



In [None]:
# Preview the first rows of the PNC input table.
pnc_data.head()

In [None]:
# Preview the first rows of the pathway table read from pathway.csv.
pathways.head()

In [None]:
# Preview the first rows of the translation table read from translation.csv.
translation.head()

In [None]:
from binn import Network

# Build the network from the three input tables.
# - input_data_column names the column of pnc_data that holds the input entities.
# - source_column / target_column are the edge columns defined by the pathways file.
network_pnc = Network(
    input_data=pnc_data,
    pathways=pathways,
    mapping=translation,
    input_data_column="UniProt",
    source_column="source",
    target_column="target",
)

In [None]:
from binn import BINN

# Model built on top of network_pnc; hyperparameters are fixed here for the
# whole notebook (the manual training loop and the explainer reuse this object).
binn_pnc = BINN(
    network=network_pnc,
    n_layers=2,
    dropout=0.2,
    validate=False,
    device="cpu",
    learning_rate=1e-3,
)

# Display the layers of the freshly built model.
binn_pnc.layers

In [None]:
# Display the model's trainable_params attribute.
binn_pnc.trainable_params

In [None]:
# Grab the model's layer names and show the first entry of the first layer.
layers = binn_pnc.layer_names
layers[0][0]

In [None]:
# Design matrix for the PNC samples; its "group" column is used further down
# to colour the heatmap and the UMAP scatter plot.
design_matrix_pnc = pd.read_csv("data/pnc_design_matrix.csv")

In [None]:
from docs.util_for_examples import fit_data_matrix_to_network_input, generate_data
import torch
from lightning.pytorch import Trainer



In [None]:
# Sanity check: show the first ten network inputs next to the head of the raw
# input table, so the identifiers can be compared by eye.
print(network_pnc.inputs[:10])
print(pnc_data.head())


In [None]:
# Fit the raw table to the network's input features, matching on the
# "UniProt" column.
protein_matrix_pnc = fit_data_matrix_to_network_input(
    pnc_data,
    features=network_pnc.inputs,
    feature_column="UniProt",
)

protein_matrix_pnc.head()

In [None]:


# Turn the protein matrix and design matrix into model inputs X and labels y.
X, y = generate_data(protein_matrix_pnc, design_matrix=design_matrix_pnc)

# Labels are stored as int64 (torch.long): that is the dtype cross-entropy
# expects for class-index targets, so neither the manual loop nor a Lightning
# training step has to cast them first. (They were int16 before.)
dataset = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.float32, device=binn_pnc.device),
    torch.tensor(y, dtype=torch.long, device=binn_pnc.device),
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# You can train using the Lightning Trainer
trainer = Trainer(max_epochs=10, log_every_n_steps=10)
# trainer.fit(binn_pnc, dataloader)

In [None]:
# Display the label array returned by generate_data.
y

In [None]:
from binn import BINNExplainer

# Explainer wrapping the PNC model; used below by explain_average() and explain().
explainer_pnc = BINNExplainer(binn_pnc)

In [None]:
# Every sample is used both as the data to explain and as the background set.
test_data = torch.Tensor(X)
background_data = torch.Tensor(X)

# n_iterations is also read by the robustness-plot cell, which loops over
# range(n_iterations) and expects one "value_{i}" column per iteration.
n_iterations = 20
n_epochs = 30

# NOTE(review): n_iterations and n_epochs are passed positionally; presumably
# they are the number of train/explain rounds and the epochs per round —
# confirm against BINNExplainer.explain_average.
# Returns the importance table and a metrics object that the loss/accuracy
# plot at the end of the notebook iterates over with .items().
importance_df_pnc, metrics_pnc = explainer_pnc.explain_average(
    test_data, background_data, n_iterations, n_epochs, dataloader, fast_train=True
)


In [None]:
# Display the metrics returned by explain_average.
metrics_pnc

In [None]:
import torch.nn.functional as F

# You can also train with a standard PyTorch train loop

# Reuse the optimizer the model configures for itself.
optimizer = binn_pnc.configure_optimizers()[0][0]

num_epochs = 30

for epoch in range(num_epochs):
    binn_pnc.train()
    total_loss = 0.0
    total_accuracy = 0.0

    for inputs, targets in dataloader:
        inputs = inputs.to(binn_pnc.device)
        # .long() converts the labels to int64 on the model's device;
        # .type(torch.LongTensor) would silently move them to the CPU and
        # break training on any other device.
        targets = targets.to(binn_pnc.device).long()

        optimizer.zero_grad()
        outputs = binn_pnc(inputs).to(binn_pnc.device)
        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Fraction of correct predictions in this batch, kept as a plain float
        # so the epoch summary prints a number rather than a tensor repr.
        predictions = torch.argmax(outputs, dim=1)
        total_accuracy += (predictions == targets).float().mean().item()

    # Averages are taken over batches, not over samples.
    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    print(f'Epoch {epoch}, Average Accuracy {avg_accuracy}, Average Loss: {avg_loss}')


In [None]:
# Explain samples 5-9 using samples 0-4 as the background set.
test_data = torch.Tensor(X[5:10])
background_data = torch.Tensor(X[0:5])


# NOTE: this rebinds importance_df_pnc, replacing the table returned earlier
# by explain_average().
# NOTE(review): the robustness-plot cell reads "value_{i}" columns; whether a
# single explain() call provides them cannot be told from here — confirm
# before running that cell after this one.
importance_df_pnc = explainer_pnc.explain(test_data, background_data)

In [None]:
# Display the current importance table.
importance_df_pnc

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Joint distribution of normalised rank mean vs. rank spread, coloured by layer.
# NOTE: this needs the plot_df built in the "Robustness plot" cell (columns
# "mean", "std", "source layer"); the name-mapped plot_df used for the Sankey
# plots shares the same variable name but has none of these columns.
sns.jointplot(
    data=plot_df,
    x="std",
    y="mean",
    hue="source layer",
    palette="coolwarm",
)

#plt.savefig("pnc_robustness.png", dpi=500)


# Start working from here: the next cells reload the saved importance table from disk

In [None]:
#importance_df_pnc.to_csv('data/importance_df_pnc_erik.csv', sep = '\t', index=False)

In [None]:
# pandas is imported again here, presumably so the notebook can be resumed
# from this cell in a fresh kernel.
import pandas as pd
# Reload the saved, tab-separated importance table.
importance_df_pnc = pd.read_csv('data/importance_df_pnc_erik.csv', sep = '\t')
# Alternative input file:
#importance_df_pnc = pd.read_csv('data/importance_df_pnc.csv', sep = '\t')
importance_df_pnc.head()

In [None]:
from binn import ImportanceNetwork

# Importance network built from the raw (unmapped) importance table, using the
# "fan" normalisation method.
IG_pnc = ImportanceNetwork(importance_df_pnc, norm_method="fan")

# Optional check of the highest-valued rows:
#IG_pnc.importance_df.sort_values("value", ascending=False).head()

In [None]:
# Sankey diagram of the complete network, using raw identifiers as node names.
IG_pnc.plot_complete_sankey(
    multiclass=False, node_cmap="coolwarm", edge_cmap="coolwarm"
)

In [None]:
# Lookup table from the "id" column of source.csv to its "name" column, used
# to relabel nodes before plotting.
source_proteome = pd.read_csv("data/source.csv")
source_mapping = source_proteome.set_index("id")["name"].to_dict()

# The root node has no entry in the file, so it maps to itself.
source_mapping["root"] = "root"

In [None]:
# Copy of the importance table with node identifiers replaced by readable
# names for plotting.
plot_df = importance_df_pnc.copy()
for name_col in ("source name", "target name"):
    # Series.map() yields NaN for every identifier missing from
    # source_mapping; fall back to the original identifier so such nodes keep
    # a usable label instead of silently becoming NaN.
    plot_df[name_col] = plot_df[name_col].map(source_mapping).fillna(plot_df[name_col])
plot_df.head()

In [None]:
from binn import ImportanceNetwork
# Importance network built from the name-mapped table (plot_df), using the
# same "fan" normalisation as IG_pnc.
IG_plot = ImportanceNetwork(plot_df, norm_method="fan")

In [None]:
# Preview the importance table held by the name-mapped network.
IG_plot.importance_df.head()

In [None]:
# Sankey diagram of the complete name-mapped network; savename is set to
# 'pnc_sankey_erik.png'.
IG_plot.plot_complete_sankey(
    multiclass=False, node_cmap="coolwarm", edge_cmap="coolwarm", savename='pnc_sankey_erik.png'
)

In [None]:
# Sankey diagram of the subgraph around a single query node.
# (The attribute dot was missing, so this line raised NameError.)
# NOTE(review): IG_plot is built from the name-mapped plot_df, so the query
# node must match a name as it appears after mapping — confirm "A1E959_0"
# exists in IG_plot.importance_df.
IG_plot.plot_subgraph_sankey(query_node="A1E959_0")

In [None]:
#IG_sps.importance_df.to_csv('data/imp_df_sps_new.csv', sep = '\t', index=False)

# Robustness plot

In [None]:
# Rank stability of each node across the explain_average iterations.
# Requires binn_pnc and n_iterations from the training cells above, and one
# "value_{i}" column per iteration in importance_df_pnc.

# Average rows that share the same source node and layer pair.
importance_df_copy = importance_df_pnc.groupby(
    ["source name", "source layer", "target layer"], as_index=False
).mean(numeric_only=True)

mean_ranks, std_ranks, source_layer, sources = [], [], [], []

for layer in range(binn_pnc.n_layers):
    layer_df = importance_df_copy.loc[importance_df_copy["source layer"] == layer].copy()
    n_nodes = len(layer_df.index)

    # Within the layer, rank 0 is the highest value of that iteration.
    for i in range(n_iterations):
        layer_df = layer_df.sort_values(f"value_{i}", ascending=False)
        layer_df[f"rank_{i}"] = range(n_nodes)

    rank_cols = [c for c in layer_df.columns if c.startswith("rank")]
    ranks = layer_df[rank_cols]

    # Dividing by the layer size makes ranks comparable between layers.
    mean_ranks.extend((ranks.mean(axis=1) / n_nodes).tolist())
    std_ranks.extend((ranks.std(axis=1) / n_nodes).tolist())
    sources.extend(layer_df["source name"].tolist())
    source_layer.extend(layer_df["source layer"].tolist())

# NOTE: rebinds plot_df (also used for the name-mapped importance table); the
# jointplot cell reads the "mean"/"std"/"source layer" columns built here.
plot_df = pd.DataFrame(
    {
        "mean": mean_ranks,
        "std": std_ranks,
        "source layer": source_layer,
        "source": sources,
    }
)

# Biomarkers

In [None]:
# Top 10 input-layer nodes (source layer 0) of the name-mapped network, ranked
# by their mean "value" per source name, highest first.
biomarkers = (
    IG_plot.importance_df.loc[IG_plot.importance_df["source layer"] == 0]
    .groupby("source name")
    .mean(numeric_only=True)
    .sort_values("value", ascending=False)["source"]
    .iloc[0:10]
    .index.tolist()
)

In [None]:
# Display the current biomarker list.
# Leftover export attempt, kept for reference:
#biomarkers.tofile('data/pnc_biomarkers.csv', sep=',', format='%s')
biomarkers

In [None]:
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Top 100 input-layer nodes (source layer 0) of the unmapped network, ranked by
# their mean "value" per source name. Raw identifiers are needed here because
# they are used to index protein_matrix_pnc.
biomarkers = (
    IG_pnc.importance_df.loc[IG_pnc.importance_df["source layer"] == 0]
    .groupby("source name")
    .mean(numeric_only=True)
    .sort_values("value", ascending=False)["source"]
    .iloc[0:100]
    .index.tolist()
)

# Rows for the selected biomarkers; missing values are filled with 0.
biomarker_data = protein_matrix_pnc.loc[biomarkers].fillna(0)
cols, index = biomarker_data.columns, biomarker_data.index

# StandardScaler works column-wise, so transpose to z-score each biomarker row
# across the columns, then transpose back.
biomarker_data = StandardScaler().fit_transform(biomarker_data.T).T

# 2-D embedding with one point per column of the matrix; consumed by the UMAP
# scatter-plot cell.
X_reduced = UMAP(random_state=42).fit_transform(biomarker_data.T)

# NOTE(review): assumes the rows of design_matrix_pnc are in the same order as
# the columns of protein_matrix_pnc — confirm.
row_colors = design_matrix_pnc["group"].map({1: "#398fcc", 2: "#c42412"}).to_numpy()

# Restore the labels that StandardScaler dropped.
biomarker_data = pd.DataFrame(biomarker_data, index=index, columns=cols)

g = sns.clustermap(
    biomarker_data,
    col_colors=row_colors,
    figsize=(5, 6),
    vmin=-2,
    vmax=2,
    cmap="coolwarm",
    cbar_kws={"label": "z-score"},
)
#g.ax_heatmap.set_xticks([])
#g.ax_heatmap.set_xticks([])

In [None]:
import csv

# Save the current biomarker list as one comma-separated row.
# newline='' lets the csv module control line endings itself.
with open('data/pnc_biomarkers_new.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile, delimiter=',').writerow(biomarkers)


In [None]:
# One row per embedded point: the two UMAP coordinates plus the design-matrix
# group, with the positional index exposed as an "index" column.
umap_df = (
    pd.DataFrame(X_reduced, columns=["UMAP1", "UMAP2"])
    .assign(group=design_matrix_pnc["group"].to_numpy())
    .reset_index()
)

plt.figure(figsize=(5, 4))
sns.scatterplot(
    data=umap_df,
    x="UMAP1",
    y="UMAP2",
    hue="group",
    palette=["#398fcc", "#c42412"],
)
plt.legend(frameon=False, title="Severity")
sns.despine()
plt.tight_layout()
# UMAP axes carry no interpretable units, so the ticks are hidden.
plt.xticks([])
plt.yticks([])
#plt.savefig("poster_biomarkers_umap.png", dpi=500)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Stack the per-run metrics (columns "epoch", "losses", "accuracies") into one
# long table, tagging every row with the run it came from. The frames are built
# with assign() so the dicts inside metrics_pnc are not modified in place.
all_metrics = pd.concat(
    [pd.DataFrame(metric).assign(model=model) for model, metric in metrics_pnc.items()]
)
print(all_metrics)

fig, axs = plt.subplots(2, 1, figsize=(4, 6), sharex=True)

# Top panel: loss per epoch; bottom panel: accuracy per epoch; one line per run.
for ax, column, label in zip(axs, ("losses", "accuracies"), ("Loss", "Accuracy")):
    sns.lineplot(
        data=all_metrics,
        x="epoch",
        y=column,
        hue="model",
        ax=ax,
        palette="coolwarm",
        alpha=0.5,
    )
    sns.despine(ax=ax)
    # Drop the per-run legend if seaborn drew one, without creating a new one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_ylabel(label)

axs[1].set_xlabel("Epoch")

# Lay out after the final labels are set so the saved figure accounts for them.
plt.tight_layout()
plt.savefig("pnc_acc_loss.png", dpi=300)