In [None]:
import numpy as np
import scipy.io
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import os
import pandas as pd


# Validation classification analysis

In [None]:
def load_mat_file(file_path, key):
    """Load a single variable from a MATLAB ``.mat`` file.

    Parameters
    ----------
    file_path : str
        Path to the ``.mat`` file.
    key : str
        Name of the stored variable to extract (for example ``"labels"`` or
        ``"predictions"``).

    Returns
    -------
    numpy.ndarray
        The stored variable with every length-1 dimension removed.

    Raises
    ------
    KeyError
        If ``key`` is not present in the file. The message lists the variables
        that are available so a typo is easy to spot.
    """
    data = scipy.io.loadmat(file_path)
    if key not in data:
        # Hide loadmat's bookkeeping entries ('__header__', '__version__', ...).
        available = sorted(name for name in data if not name.startswith('__'))
        raise KeyError(
            f"Variable {key!r} not found in {file_path!r}; "
            f"available variables: {available}"
        )
    # squeeze() drops length-1 dimensions, e.g. turns a (1, n) array into (n,).
    return data[key].squeeze()

def compute_metrics(true_labels, predicted_labels):
    """Score predictions against the ground-truth labels.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
    and F1 are macro-averaged (``average='macro'``).
    """
    overall_accuracy = accuracy_score(true_labels, predicted_labels)
    # The three remaining scorers share one signature, so evaluate them in order.
    macro_precision, macro_recall, macro_f1 = (
        scorer(true_labels, predicted_labels, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return overall_accuracy, macro_precision, macro_recall, macro_f1

def plot_confusion_matrix(true_labels, predicted_labels):
    """Plot a row-normalised confusion matrix with a percentage in every cell.

    Each row is divided by the number of true instances of that class, so the
    diagonal shows the fraction of each class that was classified correctly.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class of every sample.
    predicted_labels : array-like
        Predicted class of every sample.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Use the union of classes seen in either vector and pass it explicitly, so
    # the tick labels always line up with the rows/columns of the matrix even
    # when a class shows up only in the predictions.
    class_labels = np.unique(
        np.concatenate((np.ravel(true_labels), np.ravel(predicted_labels)))
    )
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

    # Normalise each row by its total; rows without any true instance stay at 0
    # instead of becoming NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    fig, ax = plt.subplots()
    cax = ax.matshow(cm_normalized, cmap=plt.cm.Blues)

    ax.set_title('Confusion Matrix (% Correct per Class)')
    fig.colorbar(cax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    tick_positions = np.arange(len(class_labels))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)
    # matshow puts the x ticks on top by default; move them below the matrix.
    ax.xaxis.set_ticks_position('bottom')

    # Annotate every cell; switch to white text on the dark end of the colormap
    # so the numbers stay legible.
    for (i, j), val in np.ndenumerate(cm_normalized):
        text_color = 'white' if val > 0.5 else 'black'
        ax.text(j, i, f"{val:.2%}", ha='center', va='center', color=text_color)

    plt.show()

In [None]:
# Results are read from <current working directory>/results/TumorSiteClassification/
working_dir = os.getcwd()
data_dir = f"{working_dir}/results/TumorSiteClassification/"

# Validation ground truth first, then the model predictions
true_labels, predictions = (
    load_mat_file(data_dir + mat_name, mat_key)
    for mat_name, mat_key in (
        ("Confusion_Val_Lab.mat", "labels"),
        ("Confusion_Val.mat", "predictions"),
    )
)

In [None]:
# Score the validation predictions and report each metric on its own line
accuracy, precision, recall, f1 = compute_metrics(true_labels, predictions)
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

# Visualise the same predictions as a confusion matrix
plot_confusion_matrix(true_labels, predictions)

# Gene Knock-Up and Knock-Down Analysis

In [None]:
# Knock-up / knock-down inference results, transposed right after loading
kup = load_mat_file(data_dir + "Knockup.mat", "Knockup").T
kdown = load_mat_file(data_dir + "Knockdown.mat", "Knockdown").T

# Table of the genes that were used
selected_genes_df = pd.read_csv(data_dir + 'selected_genes.csv')


In [None]:
def find_sample_changes(predictions, modifications):
    """Count, per column of ``modifications``, how many entries differ from
    the baseline prediction of the same sample.

    Parameters
    ----------
    predictions : array-like
        Baseline predicted class per sample. Plain sequences are accepted, and
        a scalar / 0-d array (a single sample) is treated as a length-1 vector.
    modifications : array-like
        Predicted classes after perturbation. Axis 0 is aligned with
        ``predictions`` (one row per sample); the callers in this notebook use
        one column per gene.

    Returns
    -------
    numpy.ndarray
        Number of rows whose value differs from the baseline, summed along
        axis 0 (one count per column).
    """
    # atleast_1d keeps the [:, None] indexing valid for a single sample, which
    # arrives as a 0-d array after a squeeze().
    baseline = np.atleast_1d(predictions)
    perturbed = np.asarray(modifications)

    # Boolean matrix: True where the perturbed prediction differs from baseline.
    changes = perturbed != baseline[:, None]
    return np.sum(changes, axis=0)


# Per-gene count of samples whose prediction changed under each perturbation
changes_kup = find_sample_changes(predictions, kup)
changes_kdown = find_sample_changes(predictions, kdown)

# Build the per-gene table in one chain: add the combined count, rank genes by
# it (largest first), turn the gene-name index into a column, and drop genes
# that never changed a prediction.
gene_effects_df = (
    pd.DataFrame(
        {
            'Affected_samples_kup': changes_kup,
            'Affected_samples_kdown': changes_kdown,
        },
        index=selected_genes_df['Gene Name'],
    )
    .assign(
        Total_affected_samples=lambda effects: (
            effects['Affected_samples_kup'] + effects['Affected_samples_kdown']
        )
    )
    .sort_values(by='Total_affected_samples', ascending=False)
    .reset_index()
    .loc[lambda effects: effects['Total_affected_samples'] != 0]
)

print(gene_effects_df)

In [None]:
# Keep only the genes whose perturbation changed the prediction of more than
# MIN_AFFECTED_SAMPLES samples (knock-up and knock-down counts combined).
MIN_AFFECTED_SAMPLES = 50
modules = gene_effects_df[gene_effects_df['Total_affected_samples'] > MIN_AFFECTED_SAMPLES]

# Histogram of the combined count across the selected genes
fig, ax = plt.subplots()
ax.hist(modules['Total_affected_samples'], bins=100, color='skyblue', edgecolor='black')

# Every histogram entry is one gene (one row of `modules`), so the y-axis counts
# genes rather than samples.
ax.set_xlabel('Total Affected Samples')
ax.set_ylabel(f'Genes (total {modules.shape[0]})')
ax.set_title('Histogram of Total Affected Samples')

plt.show()

In [None]:
# Resolve the expression-data folder, which sits next to the project folder.
project_dir_name = 'GCNN_paper_adaptation'
working_dir = os.getcwd()

# str.strip() removes a *set of characters* from both ends rather than a
# suffix, which corrupts paths that do not end with the project folder name.
# Remove the trailing folder name explicitly instead.
if working_dir.endswith(project_dir_name):
    working_dir = working_dir[:-len(project_dir_name)]

# NOTE: this rebinds `data_dir`, which earlier cells use for the results
# folder; re-run the earlier path setup before re-running those cells.
# os.path.join inserts a separator only when `working_dir` lacks a trailing one.
data_dir = os.path.join(working_dir, 'Thesis/data/PROTECTED_DATA/BGI_Expression_Data/')


## Load the datasets
# Transcriptomics data (first CSV column is used as the index)
transcriptomics_TumorOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_TumorOnly.csv'
transcriptomics_dataset = pd.read_csv(transcriptomics_TumorOnly_dir, index_col=0)

# Classification tags (first CSV column is used as the index)
labels_classification_dir = data_dir + 'TumourSite_for_TumorSamples_Classification.csv'
labels = pd.read_csv(labels_classification_dir, index_col=0)

In [None]:
# Display the gene subset selected earlier (Total_affected_samples > 50)
modules