In [1]:
# Import standard libraries
import os
import ast
import glob
import pickle
import platform
import copy
from timeit import default_timer as timer

# Import third-party libraries
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import proplot as pplt
import umap
import seaborn as sn
import tensorflow as tf
from tensorflow.keras import layers, Model, callbacks
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical

from vendi_score import vendi

import sklearn.manifold as skma
import sklearn.metrics as skm
import sklearn.decomposition as skd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import pdist, squareform
from spektral.layers import GINConvBatch, GlobalAttentionPool, GlobalMaxPool, GlobalAttnSumPool

# Import local modules
from topo_sim.model import KLDivergenceLayer, Sampling

# Configuration for file paths.
# The defaults below are the locations originally hard-coded in this notebook.
# Set TOPO_DATA_DIR / TOPO_PLOT_DIR / TOPO_WEIGHT_DIR in the environment to
# point the notebook somewhere else without editing this cell.
DATA_DIR = os.environ.get('TOPO_DATA_DIR', '/home/sj0161/complex_polymer/complex_polymer/temp/')
PLOT_DIR = os.environ.get('TOPO_PLOT_DIR', '../fig/')
WEIGHT_DIR = os.environ.get('TOPO_WEIGHT_DIR', '/scratch/gpfs/sj0161/20230829/')

# Set plot configurations
pplt.rc['figure.facecolor'] = 'white'

# Initialize color cycle: the colors of the 'default' cycle followed by the
# colors of the '538' cycle, in iteration order.
colors1 = pplt.Cycle('default')
colors2 = pplt.Cycle('538')
COLORS = [entry['color'] for cycle in (colors1, colors2) for entry in cycle]

# Handle warnings
# NOTE(review): this silences every warning for the rest of the session,
# including ones that may matter; consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Some constants
LATENT_DIM = 8

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
2023-11-16 12:39:54.910654: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-16 12:39:55.015032: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_data(data_dir, fold, n_fold=5, if_validation=False):
    """
    Load the pickled dataset and return one stratified train/(valid)/test split.

    Args:
        data_dir (str): Path of the pickle file to read. Despite its name, the
            value is passed directly to ``open``, so it must be a file path.
        fold (int): Index of the fold to be used as the test set. Must satisfy
            ``0 <= fold < n_fold``.
        n_fold (int, optional): Number of folds to split the data into. Default is 5.
        if_validation (bool, optional): Whether to include a validation set. Default is False.

    Returns:
        tuple: ``(train, test, names, scaler, label_encoder)`` or, when
               ``if_validation`` is true, ``(train, valid, test, names, scaler,
               label_encoder)``. Each of ``train`` / ``valid`` / ``test`` is a
               5-tuple ``(x, y, topo_class, topo_desc, graph)`` where
               ``topo_class`` is label-encoded and ``topo_desc`` is standardized.

    Raises:
        ValueError: If ``fold`` is outside ``[0, n_fold)``. Previously such a
            value silently selected the last fold.
    """
    if not 0 <= fold < n_fold:
        raise ValueError(f'fold must satisfy 0 <= fold < n_fold ({n_fold}), got {fold}')

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_dir, 'rb') as handle:
        x, y, topo_desc, topo_class, poly_param, graph = [pickle.load(handle) for _ in range(6)]

    # x: graph feature
    # y: rg2 value
    # topo_desc: topological descriptors
    # topo_class: topology classes
    # poly_param: polymer generation parameters
    # graph: networkx objects

    # preprocessing
    y = y[..., 0]

    SCALER = StandardScaler()
    topo_desc = SCALER.fit_transform(topo_desc)

    topo_class[topo_class == 'astar'] = 'star'
    # NaNs remaining after scaling are replaced by the sentinel -2.
    # NOTE(review): the sentinel is applied to already-standardized values, so
    # it is not guaranteed to lie outside the data range — confirm intent.
    topo_desc = np.where(np.isnan(topo_desc), -2, topo_desc) # only node assortativity has 0, should be [-1, 1]

    le = LabelEncoder()
    topo_class = le.fit_transform(topo_class)
    NAMES = le.classes_

    # random shuffle
    # Every array is shuffled with a freshly seeded RandomState(0); rows stay
    # aligned across arrays only if they all have the same length
    # (assumed — TODO confirm).
    x = np.random.RandomState(0).permutation(x)
    y = np.random.RandomState(0).permutation(y)
    topo_class = np.random.RandomState(0).permutation(topo_class)
    topo_desc = np.random.RandomState(0).permutation(topo_desc)
    poly_param = np.random.RandomState(0).permutation(poly_param)
    graph = np.random.RandomState(0).permutation(graph)

    # we just use one fold for testing: advance the splitter to the requested
    # fold and only then index the data (earlier folds are never materialized).
    skf = StratifiedKFold(n_splits=n_fold)
    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(x, topo_class)):
        if fold_idx == fold:
            break

    fields = [x, y, topo_desc, topo_class, graph]
    x_train, y_train, l_train, c_train, graph_train = [data[train_idx] for data in fields]
    x_test, y_test, l_test, c_test, graph_test = [data[test_idx] for data in fields]

    if if_validation:
        # The validation set is the first stratified fold of the training portion.
        skf2 = StratifiedKFold(n_splits=n_fold)
        train_idx2, valid_idx = next(iter(skf2.split(x_train, c_train)))
        train_fields = [x_train, y_train, l_train, c_train, graph_train]
        x_valid, y_valid, l_valid, c_valid, graph_valid = (
            [data[valid_idx] for data in train_fields])
        x_train, y_train, l_train, c_train, graph_train = (
            [data[train_idx2] for data in train_fields])

        print(f'Train: {len(x_train)} Valid: {len(x_valid)} Test: {len(x_test)}')
        return ((x_train, y_train, c_train, l_train, graph_train),
                (x_valid, y_valid, c_valid, l_valid, graph_valid),
                (x_test, y_test, c_test, l_test, graph_test),
                NAMES, SCALER, le)

    print(f'Train: {len(x_train)} Test: {len(x_test)}')
    return ((x_train, y_train, c_train, l_train, graph_train),
            (x_test, y_test, c_test, l_test, graph_test),
            NAMES, SCALER, le)

    
def graph_to_lap_spec(graphs, pad_len=100):
    """
    Convert graphs into fixed-length, zero-padded Laplacian spectra.

    Args:
        graphs (iterable): Graphs accepted by ``nx.laplacian_spectrum``.
        pad_len (int, optional): Length of every output row. Default is 100.

    Returns:
        np.ndarray: Float array of shape ``(number of graphs, pad_len)``. Row
            ``i`` holds the spectrum of the ``i``-th graph followed by zeros.
            An empty input yields an array of shape ``(0, pad_len)``.

    Raises:
        ValueError: If a spectrum has more than ``pad_len`` entries.
    """
    graphs = list(graphs)
    lap_spec_data = np.zeros((len(graphs), pad_len))
    for row, G in enumerate(graphs):
        lap_spec = nx.laplacian_spectrum(G)
        if len(lap_spec) > pad_len:
            raise ValueError(
                f'graph {row} has a spectrum of length {len(lap_spec)}, '
                f'which exceeds pad_len={pad_len}')
        # Leading entries take the spectrum; the tail stays zero-padded.
        lap_spec_data[row, :len(lap_spec)] = lap_spec
    return lap_spec_data

### Vendi score evaluation for the whole dataset

In [3]:
# Fold 0 is the test set; a validation split is taken from the training portion.
rg2_pickle = os.path.join(DATA_DIR, 'rg2.pickle')
train_split, valid_split, test_split, NAMES, SCALER, LE = load_data(
    rg2_pickle, fold=0, if_validation=True)

# Each split unpacks as (x, y, topology class, topology descriptors, graphs).
x_train, y_train, c_train, l_train, graph_train = train_split
x_valid, y_valid, c_valid, l_valid, graph_valid = valid_split
x_test, y_test, c_test, l_test, graph_test = test_split

# All graphs, in train -> valid -> test order.
graph_all = np.concatenate((graph_train, graph_valid, graph_test))

Train: 858 Valid: 215 Test: 269


In [4]:
# convert all graphs into graph eigen spectra (train -> valid -> test order)
graph_total = [graph_train, graph_valid, graph_test]

# Reuse the shared helper rather than repeating its zero-padding loop inline.
lap_spec_data = graph_to_lap_spec([G for graphs in graph_total for G in graphs])

# Persist the spectra so they can be reloaded without recomputing them.
with open("../result/lap_spec_data.pickle", "wb") as handle:
    pickle.dump(lap_spec_data, handle)

In [5]:
# Reload the Laplacian spectra written to disk by the previous cell.
with open("../result/lap_spec_data.pickle", "rb") as spec_file:
    lap_spec_data = pickle.load(spec_file)

In [6]:
# Vendi score of the padded Laplacian spectra held in lap_spec_data.
dataset_vendi = vendi.score_dual(lap_spec_data)
print(f"Dataset Vendi Score: {dataset_vendi:0.4f}")

Dataset Vendi Score: 2.0968


### Vendi score evaluation for the latent space

In [7]:
# Pickled latent-space arrays to score, one Vendi score per file.
files = [
    "../result/latent_space_desc_gnn_cnn.pickle",
    "../result/latent_space_gnn_cnn.pickle",
    "../result/latent_space_desc_dnn_cnn.pickle",
]

for latent_path in files:
    with open(latent_path, "rb") as latent_file:
        latent_data = pickle.load(latent_file)
    # Print the file name first so the score below can be attributed to it.
    print(latent_path)
    latent_vendi = vendi.score_dual(latent_data)
    print(f"Dataset Vendi Score: {latent_vendi:0.4f} \n")

../result/latent_space_desc_gnn_cnn.pickle
Dataset Vendi Score: 7.3225 

../result/latent_space_gnn_cnn.pickle
Dataset Vendi Score: 7.4370 

../result/latent_space_desc_dnn_cnn.pickle
Dataset Vendi Score: 7.0863 



In [8]:
# Second set of pickled latent-space arrays; same scoring loop as above.
files = [
    "../result/latent_space_False_False.pickle",
    "../result/latent_space_False_True.pickle",
    "../result/latent_space_True_False.pickle",
]

for latent_path in files:
    with open(latent_path, "rb") as latent_file:
        latent_data = pickle.load(latent_file)
    # Print the file name first so the score below can be attributed to it.
    print(latent_path)
    latent_vendi = vendi.score_dual(latent_data)
    print(f"Dataset Vendi Score: {latent_vendi:0.4f} \n")

../result/latent_space_False_False.pickle
Dataset Vendi Score: 5.8532 

../result/latent_space_False_True.pickle
Dataset Vendi Score: 6.3171 

../result/latent_space_True_False.pickle
Dataset Vendi Score: 5.3128 



### Vendi score evaluation for the random generation based on different models

In [9]:
# Score the generated samples stored in the desc_gnn_cnn result file.
with open("../result/no_valid_random_gen_desc_gnn_cnn.pickle", "rb") as gen_file:
    gen_data = pickle.load(gen_file)

# Element 2 of every record is passed on as a graph.
gen_clean_graph = [gen_data[idx][2] for idx in range(len(gen_data))]

# NOTE: this rebinds lap_spec_data, replacing the dataset spectra loaded earlier.
lap_spec_data = graph_to_lap_spec(gen_clean_graph)

gen_vendi = vendi.score_dual(lap_spec_data)
print(f"Dataset Vendi Score: {gen_vendi:0.4f}")

Dataset Vendi Score: 5.0684


In [10]:
# Score the generated samples stored in the gnn_cnn result file.
with open("../result/no_valid_random_gen_gnn_cnn.pickle", "rb") as gen_file:
    gen_data = pickle.load(gen_file)

# Element 2 of every record is passed on as a graph.
gen_clean_graph = [gen_data[idx][2] for idx in range(len(gen_data))]

# NOTE: this rebinds lap_spec_data to the spectra of the generated graphs.
lap_spec_data = graph_to_lap_spec(gen_clean_graph)

gen_vendi = vendi.score_dual(lap_spec_data)
print(f"Dataset Vendi Score: {gen_vendi:0.4f}")

Dataset Vendi Score: 4.9580


In [11]:
# Score the generated samples stored in the desc_dnn_cnn result file.
with open("../result/no_valid_random_gen_desc_dnn_cnn.pickle", "rb") as gen_file:
    gen_data = pickle.load(gen_file)

# Element 2 of every record is passed on as a graph.
gen_clean_graph = [gen_data[idx][2] for idx in range(len(gen_data))]

# NOTE: this rebinds lap_spec_data to the spectra of the generated graphs.
lap_spec_data = graph_to_lap_spec(gen_clean_graph)

gen_vendi = vendi.score_dual(lap_spec_data)
print(f"Dataset Vendi Score: {gen_vendi:0.4f}")

Dataset Vendi Score: 4.3305


In [None]:
# NOTE(review): this unexecuted cell reads the same desc_gnn_cnn file and runs
# the same steps as cell In [9]; it looks like a leftover copy — confirm
# whether it is still needed.
with open("../result/no_valid_random_gen_desc_gnn_cnn.pickle", "rb") as gen_file:
    gen_data = pickle.load(gen_file)

# Element 2 of every record is passed on as a graph.
gen_clean_graph = [gen_data[idx][2] for idx in range(len(gen_data))]

lap_spec_data = graph_to_lap_spec(gen_clean_graph)

gen_vendi = vendi.score_dual(lap_spec_data)
print(f"Dataset Vendi Score: {gen_vendi:0.4f}")