In [None]:
pip install reval

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy import __version__ as scipy_version
import joblib
from sklearn import __version__ as sklearn_version
from umap import __version__ as umap_version
import matplotlib as mpl

In [None]:
# Number of structural statistics that precede the per-node degrees in each
# feature vector (3 for the full graph + 4 for its largest component).
_N_STRUCTURAL_FEATURES = 7


def _build_sample_network(reference_df, reference_pcc, sample, threshold):
    """Build the single-sample network (SSN) graph for one test sample.

    The sample is appended to the reference samples, the correlation matrix
    is recomputed, and every variable pair whose correlation changed by more
    than ``threshold`` (in absolute value) becomes a weighted edge.
    """
    # Same row conversion DataFrame._append performs internally, but through
    # the public API: one-row frame with dtypes re-inferred.
    sample_row = sample.to_frame().T.infer_objects()
    perturbed_pcc = pd.concat([reference_df, sample_row]).corr()
    differential = perturbed_pcc - reference_pcc

    labels = list(differential.columns)
    graph = nx.Graph()
    graph.add_nodes_from(labels)

    # Scan the strict upper triangle in row-major order. NaN entries compare
    # False against the threshold, so they never produce an edge.
    weights = differential.to_numpy()
    rows, cols = np.triu_indices(len(labels), k=1)
    upper = weights[rows, cols]
    keep = np.abs(upper) > threshold
    graph.add_weighted_edges_from(
        (labels[i], labels[j], w)
        for i, j, w in zip(rows[keep], cols[keep], upper[keep])
    )

    # Nodes are relabelled 0..n-1 in column order so that the degree features
    # can be addressed by position.
    return nx.convert_node_labels_to_integers(graph, first_label=0, ordering='default')


def _graph_feature_vector(graph, n_degree_features):
    """Return the structural feature vector of one integer-labelled graph.

    Layout: [nodes, edges, connected components] of the full graph, then
    [nodes, edges, average clustering, diameter] of its largest connected
    component, then the degree of nodes 0..n_degree_features-1 (0 when the
    node does not exist).
    """
    if graph.number_of_nodes() == 0:
        raise ValueError("Cannot compute graph features: the sample network has no nodes.")

    largest = graph.subgraph(max(nx.connected_components(graph), key=len))
    degrees = dict(graph.degree())

    return [
        graph.number_of_nodes(),
        graph.number_of_edges(),
        nx.number_connected_components(graph),
        largest.number_of_nodes(),
        largest.number_of_edges(),
        nx.average_clustering(largest),
        nx.diameter(largest),
        *(degrees.get(node, 0) for node in range(n_degree_features)),
    ]


def generate_graph2vec_embeddings_and_predict_clusters(
    input_file_path,
    reference_file_path,
    output_file_path,
    significance_threshold=0.05,
    n_degree_features=24,
    embeddings_file_path=None,
):
    """Embed each input sample as graph statistics and predict its cluster.

    For every row of the input workbook a single-sample network is built
    against the reference samples, summarised as a fixed-length vector of
    graph statistics (hand-crafted features, despite the function name), and
    passed through the pre-trained imputer -> scaler -> UMAP -> KMeans
    pipeline loaded from the current working directory.

    Parameters
    ----------
    input_file_path : str
        Excel file with one test sample per row; the first column is used
        as the sample identifier.
    reference_file_path : str
        Excel file with the reference samples, same layout.
    output_file_path : str
        Excel file that receives the ``Name`` and ``Cluster_Label`` columns.
    significance_threshold : float, default 0.05
        Minimum absolute change in correlation for an edge to be kept.
    n_degree_features : int, default 24
        Number of leading nodes whose degree is appended to the feature
        vector. Must match what the pickled models were fitted on.
    embeddings_file_path : str or None, default None
        If given, the feature table (``subject_key``, ``feature_1`` ...) is
        also saved to this Excel file.

    Returns
    -------
    None
        Results are written to ``output_file_path``.

    Raises
    ------
    ValueError
        If a sample's differential network has no nodes.

    Notes
    -----
    Samples are processed by position, so duplicate identifiers in the input
    each keep their own output row. ``Name`` is taken from a column called
    ``Name`` when the input has one, otherwise from the first (index) column.
    An input without rows produces an empty result file and the models are
    not loaded.
    """
    samples_df = pd.read_excel(input_file_path, index_col=0)
    reference_df = pd.read_excel(reference_file_path, index_col=0)

    # Resolve the display names from the data already in memory instead of
    # reading the workbook a second time.
    if 'Name' in samples_df.columns:
        sample_names = samples_df['Name'].tolist()
    else:
        sample_names = samples_df.index.tolist()

    # The reference correlation matrix does not depend on the test sample.
    reference_pcc = reference_df.corr()

    feature_rows = []
    for _, single_sample in samples_df.iterrows():
        sample_graph = _build_sample_network(
            reference_df, reference_pcc, single_sample, significance_threshold
        )
        feature_rows.append(_graph_feature_vector(sample_graph, n_degree_features))

    feature_columns = [
        f'feature_{i}' for i in range(1, _N_STRUCTURAL_FEATURES + n_degree_features + 1)
    ]
    df_embeddings = pd.DataFrame(feature_rows, columns=feature_columns)
    df_embeddings.insert(0, 'subject_key', samples_df.index.tolist())

    if embeddings_file_path is not None:
        df_embeddings.to_excel(embeddings_file_path, index=False)

    if len(df_embeddings) == 0:
        # Nothing to predict; still produce a well-formed result file.
        pd.DataFrame(columns=['Name', 'Cluster_Label']).to_excel(output_file_path, index=False)
        return

    # Load the trained models. joblib files are pickle-based: only load
    # model files that come from a trusted source.
    umap_model = joblib.load('umap_model.pkl')
    scaler = joblib.load('scaler_model.pkl')
    imputer = joblib.load('imputer_model.pkl')
    kmeans = joblib.load('kmeans_model.pkl')

    X_new = df_embeddings.drop(columns=['subject_key'])

    # Impute missing values, then scale, keeping the feature names.
    X_new_imputed = pd.DataFrame(imputer.transform(X_new), columns=X_new.columns, index=X_new.index)
    X_new_scaled = pd.DataFrame(scaler.transform(X_new_imputed), columns=X_new.columns, index=X_new.index)

    # Mapping from the raw KMeans cluster id to the reported label; ids not
    # listed here are reported unchanged.
    label_mapping = {0: 2, 1: 3, 2: 1, 3: 4}

    cluster_labels = []
    for sample_index, sample_row in X_new_scaled.iterrows():
        # NOTE(review): every iteration transforms the whole batch plus the
        # current sample and keeps only the last row, which is quadratic in
        # the number of samples. A single umap_model.transform(X_new_scaled)
        # is probably equivalent; kept as-is so the coordinates fed to KMeans
        # stay identical until that is validated against the trained model.
        combined_data = pd.concat([X_new_scaled, sample_row.to_frame().T], axis=0)
        umap_result = umap_model.transform(combined_data)
        sample_umap = pd.DataFrame(umap_result[-1:], columns=["UMAP1", "UMAP2"], index=[sample_index])
        raw_label = kmeans.predict(sample_umap[['UMAP1', 'UMAP2']])[0]
        cluster_labels.append(int(label_mapping.get(raw_label, raw_label)))

    # Save only the Name and Cluster_Label columns to the Excel file.
    results = pd.DataFrame({'Name': sample_names, 'Cluster_Label': cluster_labels})
    results.to_excel(output_file_path, index=False)

# Example run: replace the placeholder paths below with your own workbooks.
generate_graph2vec_embeddings_and_predict_clusters(
    input_file_path="/content/your_file.xlsx",
    reference_file_path="/content/reference_samples.xlsx",
    output_file_path="/content/final_result.xlsx",
)
