In [2]:
import numpy as np # for Array 
import pandas as pd # for DataFrames

import matplotlib.pyplot as plt # to plot

# Analysis MTL-Approach

In this analysis, we examine the drugs whose MSE differs most between the approaches (baseline, gene, pathway, combination) and investigate how these differences correlate with the number of interactions per drug.
- Does a high number of interactions influence the median MSE?
----

### Which drugs have the highest median MSE values?

In [None]:
# Load the MSE values of the baseline model and of the three naive approaches
# (gene, pathway, combination). The first row of every CSV is a header and is
# skipped.
task_mses = np.loadtxt("task_mses.csv", delimiter=",", skiprows=1)
task_mses_gene = np.loadtxt("task_mses_gene.csv", delimiter=",", skiprows=1)
task_mses_pathway = np.loadtxt("task_mses_pathway.csv", delimiter=",", skiprows=1)
task_mses_combination = np.loadtxt("task_mses_combination.csv", delimiter=",", skiprows=1)

# NOTE(review): absolute, machine-specific path -- adjust it when the notebook
# is run on another machine.
GDSC_DATASET_PATH = '/sybig/home/tmu/TUGDA/data/GDSCDA_fpkm_AUC_all_drugs.zip'
# Position of the first drug column in the GDSC table; every column from here
# on is used as a drug name (presumably the columns before it are features --
# TODO confirm).
FIRST_DRUG_COLUMN = 1780

# Get index over drug names
gdsc_dataset = pd.read_csv(GDSC_DATASET_PATH, index_col=0)
drug_list = gdsc_dataset.columns[FIRST_DRUG_COLUMN:]

# Fail early, with a readable message, if an MSE file does not hold exactly
# one value per drug; otherwise the drug names would be attached to the wrong
# values (or the DataFrame construction fails later with a less helpful error).
expected_shape = (len(drug_list),)
for mse_name, mse_values in [
    ("task_mses", task_mses),
    ("task_mses_gene", task_mses_gene),
    ("task_mses_pathway", task_mses_pathway),
    ("task_mses_combination", task_mses_combination),
]:
    assert np.atleast_1d(mse_values).shape == expected_shape, (
        f"{mse_name} has shape {np.shape(mse_values)}, expected {expected_shape}"
    )

In [None]:
# Collect the MSE values of all four approaches in one table (one row per
# drug) and use the drug name as the row index.
df_mse = pd.DataFrame(
    {
        "Drug": drug_list,
        "MSE_baseline": task_mses,
        "MSE_gene": task_mses_gene,
        "MSE_pathway": task_mses_pathway,
        "MSE_combination": task_mses_combination,
    }
).set_index("Drug")

In [None]:
# Rank the drugs by MSE, highest first.
# Sorting the frame by one column after another leaves only the last sort in
# effect, so the ranking column is named explicitly. "MSE_combination" is the
# last MSE column of the table; naming it also keeps the ranking stable when
# this cell is re-run after further columns have been added to df_mse.
df_mse = df_mse.sort_values(by="MSE_combination", ascending=False)

In [None]:
# Keep and show the first 15 rows of the sorted MSE table.
top15 = df_mse.iloc[:15]
print(top15)

In [None]:
# Number of top-N drugs per approach
top_n = 15

# Only the MSE columns take part in the ranking. Selecting them by prefix keeps
# this cell correct when it is re-run after other columns (e.g. interaction
# counts) have been added to df_mse.
mse_columns = [col for col in df_mse.columns if str(col).startswith("MSE_")]

# 1. Find the top-N drugs with the highest MSE for each approach
top_drug_sets = []
for col in mse_columns:
    top_drugs = df_mse[col].sort_values(ascending=False).head(top_n).index
    top_drug_sets.append(set(top_drugs))

# 2. Intersection of all top-N drug sets
common_top_drugs = set.intersection(*top_drug_sets)
print(f"Gemeinsame Top-{top_n}-Drugs über alle Ansätze: {len(common_top_drugs)} gefunden")

# 3. Restrict the DataFrame to these drugs.
#    .loc does not accept a set as an indexer (TypeError in pandas >= 2.0), so
#    a sorted list is passed instead.
df_common = df_mse.loc[sorted(common_top_drugs), mse_columns]

# 4. Order the drugs by a fixed criterion: mean MSE over all approaches
df_common = df_common.loc[df_common.mean(axis=1).sort_values(ascending=False).index]

# 5. Plot -- skipped when no drug is shared by all approaches, because there
#    is nothing to draw in that case.
if df_common.empty:
    print(f"No drug is in the top {top_n} of every approach; nothing to plot.")
else:
    ax = df_common.plot(kind="bar", figsize=(14, 6))
    ax.set_title(f"MSE comparison for common Top-{top_n} Drugs")
    ax.set_ylabel("MSE")
    ax.set_xlabel("Drug")
    plt.xticks(rotation=45, ha="right")
    ax.legend(title="Approach")
    plt.tight_layout()
    plt.show()


### How many interactions do the drugs with the highest MSE have?

In [4]:
# Read the gene-interaction and pathway matrices. The first CSV column becomes
# the row index and all values are cast to 32-bit floats.
dgi_matrix = (
    pd.read_csv("./data/global_gene_interaction_matrix.csv", index_col=0)
    .astype("float32")
)
pathway_matrix = (
    pd.read_csv("./data/drug_pathway_binary_matrix.csv", index_col=0)
    .astype("float32")
)

FileNotFoundError: [Errno 2] No such file or directory: './data/global_gene_interaction_matrix.csv'

In [None]:
# Row sums of the two matrices give the number of interactions per row
# (presumably one row per drug -- TODO confirm). Each count series is attached
# to the MSE table as a new column; pandas aligns it on the row index, so rows
# without a matching label end up as NaN.
dgi_interaction_counts = dgi_matrix.sum(axis="columns")
df_mse["DGI_interactions"] = dgi_interaction_counts

pathway_interaction_counts = pathway_matrix.sum(axis="columns")
df_mse["Pathway_interactions"] = pathway_interaction_counts

In [None]:
# Plots: number of interactions vs. MSE

In [None]:
# Scatter plot: number of gene interactions (x) against the MSE of the
# gene-based model (y), one point per row of df_mse.
# matplotlib.pyplot is already imported as plt in the import cell at the top
# of the notebook, so it is not imported again here.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df_mse["DGI_interactions"], df_mse["MSE_gene"], alpha=0.7)
ax.set_xlabel("Number of Gene Interactions (DGI)")
ax.set_ylabel("MSE (Gene-based Model)")
ax.set_title("Gene Interactions vs. MSE (MSE_gene)")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot: number of pathway assignments (x) against the MSE of the
# pathway-based model (y), one point per row of df_mse.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(
    df_mse["Pathway_interactions"],
    df_mse["MSE_pathway"],
    alpha=0.7,
    color="orange",
)
ax.set_xlabel("Number of Pathway Assignments")
ax.set_ylabel("MSE (Pathway-based Model)")
ax.set_title("Pathway Interactions vs. MSE (MSE_pathway)")
ax.grid(True)
fig.tight_layout()
plt.show()

# Analysis DA-Approach

In [3]:
# Read the AUC prediction tables of the baseline model and of the three naive
# approaches (gene level, pathway level, combination). The first CSV column
# becomes the row index of each table.
(preds_AUC, preds_AUC_gene, preds_AUC_pathway, preds_AUC_combination) = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './results/preds_AUC.csv',
        './results/preds_AUC_naiv_gene_level.csv',
        './results/preds_AUC_naiv_pathway_level.csv',
        './results/preds_AUC_naiv_combination.csv',
    )
]

FileNotFoundError: [Errno 2] No such file or directory: './results/preds_AUC.csv'

In [None]:
# TODO: add code for the correlation analysis