In [None]:
pip install reval

In [None]:
from pathlib import Path

import joblib
import matplotlib as mpl
import networkx as nx
import numpy as np
import pandas as pd
from scipy import __version__ as scipy_version
from sklearn import __version__ as sklearn_version
from umap import __version__ as umap_version

In [None]:
# Minimum absolute change in a pairwise Pearson correlation for an edge to be
# kept in a single-sample network.
_SSN_EDGE_THRESHOLD = 0.05

# Number of per-node degree entries (nodes 0..23) appended to every feature
# vector. NOTE(review): presumably this has to match the feature count the
# persisted imputer/scaler were fitted on -- confirm before changing.
_N_DEGREE_FEATURES = 24

# Translation from the raw KMeans cluster index to the label that is reported.
_CLUSTER_LABEL_MAPPING = {0: 2, 1: 3, 2: 1, 3: 4}


def _build_single_sample_network(reference_df, reference_pcc_matrix, single_sample, threshold):
    """Build the network describing how one sample perturbs the reference correlations.

    The sample is appended to the reference rows, the Pearson correlation
    matrix is recomputed, and every column pair whose correlation changed by
    more than ``threshold`` (in absolute value) becomes a weighted edge.

    Parameters
    ----------
    reference_df : pandas.DataFrame
        Reference samples (rows) by variables (columns).
    reference_pcc_matrix : pandas.DataFrame
        ``reference_df.corr()``, computed once by the caller.
    single_sample : pandas.Series
        One row of the input table.
    threshold : float
        Minimum absolute correlation change for an edge to be kept.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are relabelled to 0..k-1 in column order and whose
        edge ``weight`` is the correlation change.
    """
    # pd.concat is the public replacement for the private DataFrame._append;
    # infer_objects restores numeric dtypes if the transposed row is object.
    sample_row = single_sample.to_frame().T.infer_objects()
    perturbed_pcc_matrix = pd.concat([reference_df, sample_row]).corr()
    differential_pcc_matrix = perturbed_pcc_matrix - reference_pcc_matrix

    labels = list(differential_pcc_matrix.columns)
    delta = differential_pcc_matrix.to_numpy()
    # Strict upper triangle, row-major: the same (i, j) order as a nested
    # ``for i ... for j in range(i + 1, n)`` loop.
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    weights = delta[upper_rows, upper_cols]
    keep = np.abs(weights) > threshold  # NaN compares False, so it never forms an edge

    graph = nx.Graph()
    graph.add_nodes_from(labels)
    graph.add_weighted_edges_from(
        (labels[i], labels[j], weight)
        for i, j, weight in zip(
            upper_rows[keep].tolist(), upper_cols[keep].tolist(), weights[keep].tolist()
        )
    )
    return nx.convert_node_labels_to_integers(graph, first_label=0, ordering='default')


def _graph_feature_vector(graph, n_degree_features):
    """Summarise a graph as a flat list of topology features.

    Layout: [nodes, edges, connected components] of the whole graph, then
    [nodes, edges, average clustering, diameter] of its largest connected
    component, then the degree of nodes ``0..n_degree_features - 1`` (0 for
    node ids that are not present in the graph).
    """
    largest_component = graph.subgraph(max(nx.connected_components(graph), key=len))
    features = [
        graph.number_of_nodes(),
        graph.number_of_edges(),
        nx.number_connected_components(graph),
        largest_component.number_of_nodes(),
        largest_component.number_of_edges(),
        nx.average_clustering(largest_component),
        nx.diameter(largest_component),
    ]
    node_degrees = dict(graph.degree())
    features.extend(node_degrees.get(node, 0) for node in range(n_degree_features))
    return features


def generate_graph2vec_embeddings_and_predict_clusters(input_file_path, reference_file_path, output_file_path, model_dir="."):
    """Assign a cluster label to every sample of an Excel table and save the result.

    For each input row a single-sample network is built against the reference
    samples, summarised as a topology feature vector (hand-crafted graph
    statistics; no graph2vec model is involved despite the function name),
    passed through the persisted imputer, scaler and UMAP models, and finally
    labelled with the persisted KMeans model.

    Parameters
    ----------
    input_file_path : str or path-like
        Excel file with one sample per row. The first column is used as the
        row index and the sheet must contain a ``Name`` column.
    reference_file_path : str or path-like
        Excel file with the reference samples; the first column is the index.
    output_file_path : str or path-like
        Destination Excel file; receives the columns ``Name`` and
        ``Cluster_Label``.
    model_dir : str or path-like, optional
        Directory holding ``umap_model.pkl``, ``scaler_model.pkl``,
        ``imputer_model.pkl`` and ``kmeans_model.pkl``. Defaults to the
        current working directory.

    Raises
    ------
    ValueError
        If the input table has no rows or no columns.
    KeyError
        If the input sheet has no ``Name`` column.
    """
    samples_df = pd.read_excel(input_file_path, index_col=0)
    reference_df = pd.read_excel(reference_file_path, index_col=0)
    # Read the names up front so a missing column fails before the expensive work.
    sample_names = pd.read_excel(input_file_path)['Name']
    if samples_df.empty:
        raise ValueError(f"No samples to process in {input_file_path!r}")

    # The reference correlations do not depend on the sample: compute them once.
    reference_pcc_matrix = reference_df.corr()

    # Rows are accumulated in lists rather than dicts keyed by sample id, so
    # repeated ids cannot collapse rows and shift the positional Name alignment.
    feature_rows = []
    for _, single_sample in samples_df.iterrows():
        sample_graph = _build_single_sample_network(
            reference_df, reference_pcc_matrix, single_sample, _SSN_EDGE_THRESHOLD
        )
        feature_rows.append(_graph_feature_vector(sample_graph, _N_DEGREE_FEATURES))

    feature_columns = [f'feature_{i}' for i in range(1, len(feature_rows[0]) + 1)]
    X_new = pd.DataFrame(feature_rows, columns=feature_columns)

    # SECURITY: joblib.load unpickles arbitrary objects -- only point model_dir
    # at trusted files.
    model_dir = Path(model_dir)
    umap_model = joblib.load(model_dir / 'umap_model.pkl')
    scaler = joblib.load(model_dir / 'scaler_model.pkl')
    imputer = joblib.load(model_dir / 'imputer_model.pkl')
    kmeans = joblib.load(model_dir / 'kmeans_model.pkl')

    X_new_imputed = pd.DataFrame(imputer.transform(X_new), columns=X_new.columns, index=X_new.index)
    X_new_scaled = pd.DataFrame(scaler.transform(X_new_imputed), columns=X_new.columns, index=X_new.index)

    # NOTE(review): every iteration re-transforms the whole batch plus one
    # duplicated row and keeps only the last embedding, which is quadratic in
    # the number of samples. It is kept as-is because the embedding produced by
    # UMAP.transform may depend on the batch it is computed in -- confirm that
    # a single batched transform gives the same labels before simplifying.
    cluster_labels = []
    for sample_index, sample_row in X_new_scaled.iterrows():
        combined_data = pd.concat([X_new_scaled, sample_row.to_frame().T], axis=0)
        umap_result = umap_model.transform(combined_data)
        new_sample_umap_result = pd.DataFrame(umap_result[-1:], columns=["UMAP1", "UMAP2"], index=[sample_index])
        raw_label = int(kmeans.predict(new_sample_umap_result[['UMAP1', 'UMAP2']])[0])
        cluster_labels.append(_CLUSTER_LABEL_MAPPING.get(raw_label, raw_label))

    # Names and labels are paired by row position in the input sheet.
    result = pd.DataFrame({'Name': sample_names.to_numpy(), 'Cluster_Label': cluster_labels})
    result.to_excel(output_file_path, index=False)


# Example usage: label the samples in your_file.xlsx against the reference
# samples and write the Name / Cluster_Label table to final_result.xlsx.
generate_graph2vec_embeddings_and_predict_clusters(
    input_file_path="/content/your_file.xlsx",
    reference_file_path="/content/reference_samples.xlsx",
    output_file_path="/content/final_result.xlsx",
)