In [None]:
import os
import sys
import concurrent.futures

# Working directory of the notebook and the directory one level above it
current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)

# Make the modules under <parent>/utilities importable
utilities_dir = f'{root_dir}/utilities'
sys.path.append(utilities_dir)

from utility import *
from utility_prompt import *

from ogb.graphproppred import DglGraphPropPredDataset
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit

import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Descriptors

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed
%load_ext autoreload
%autoreload 2

# Load Dataset

In [None]:
# Dataset to run on; one of: ESOL, FreeSolv, BBBP, BACE
dataset_name = "ESOL"

# The normalised name appears both in the OGB dataset id and in the on-disk folder name
ogb_suffix = normalize_dataset_name(dataset_name)

# Load dataset (later cells use both `dataset` and `datasets` for the same object)
dataset = DglGraphPropPredDataset(name=f"ogbg-mol{ogb_suffix}", root=f'{root_dir}/datasets/')
datasets = dataset

# Load SMILES strings from the mapping file stored next to the dataset
smiles_csv_path = f"{root_dir}/datasets/ogbg_mol{ogb_suffix}/mapping/mol.csv.gz"
df = pd.read_csv(smiles_csv_path, compression='gzip')
SMILES = df["smiles"].tolist()

# Flatten the label container into a 1-D array of targets
flat_labels = datasets.labels.reshape(-1).tolist()
y = np.array(flat_labels)
num_data = len(y)
print(f'Num of graphs: {num_data}')

# Step 1: Concept Generation

In [None]:
# Number of concepts requested from the LLM
num_properties = 20

# Both step-1 prompts are read with the same model / step / directory settings
step_1_prompt_options = dict(llm_model='GPT-3.5 turbo', step_idx=1, dir=root_dir)
system_prompt_step_1 = read_prompt(dataset_name, is_system=True, **step_1_prompt_options)
individual_prompt_template_step_1 = read_prompt(dataset_name, is_individual=True, **step_1_prompt_options)

# Fill the requested number of concepts into the individual prompt
individual_prompt_step_1 = individual_prompt_template_step_1.format(num_properties=num_properties)

In [None]:
# Ask the LLM for the initial concept list
concept_response = get_completion_gpt(individual_prompt_step_1, system_prompt_step_1)

# Assumes each reply line starts with a two-character bullet such as "- " -- TODO confirm
# against the step-1 prompt. Blank lines (e.g. a trailing newline) are skipped so they do
# not turn into empty concepts, and surrounding whitespace / "\r" is removed.
concept_list = []
for line in concept_response.splitlines():
    concept = line.strip()[2:].strip().lower()
    if concept:
        concept_list.append(concept)
print('- '+'\n- '.join(concept_list))

# Step 2: Concept Labeling

## Strategy 1: Direct LLM Prompting

In [56]:
# Both step-2 prompts for direct LLM labeling share the same model / step / directory settings
step_2_prompt_options = dict(llm_model='GPT-3.5 turbo', step_idx=2, dir=root_dir)
system_prompt_step_2 = read_prompt(dataset_name, is_individual=False, **step_2_prompt_options)
individual_prompt_step_2 = read_prompt(dataset_name, is_individual=True, **step_2_prompt_options)

def worker(smiles):
    """Query the LLM for the concept values of a single compound.

    Args:
        smiles: SMILES string substituted into the step-2 prompt as ``compound_name``.

    Returns:
        The completion returned by ``get_completion_gpt``, or ``None`` when building
        the prompt or the request raised an exception, so that one failed compound
        does not abort the whole batch.
    """
    try:
        prompt = individual_prompt_step_2.format(
            compound_name=smiles,
            property_list='- '+'\n- '.join(concept_list),
        )
        return get_completion_gpt(prompt, system_prompt_step_2)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt / SystemExit still
        # propagate, and report the failure instead of hiding it.
        print(f'LLM request failed for {smiles!r}: {exc!r}', file=sys.stderr)
        return None

In [57]:
# Fan the per-compound LLM requests out over a small thread pool.
# Please lower max_workers if OpenAI rate limits are exceeded.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # map() yields the responses in the same order as SMILES
    concept_values = list(executor.map(worker, SMILES))

In [59]:
# Parse the raw LLM responses into one record per compound
parsed_concept_values = []
for values in concept_values:
    if values is None:
        # worker() returns None when a request failed. Keep a placeholder row of
        # missing values so the rows stay aligned with SMILES / y.
        # Assumes parse_entry returns a mapping keyed by concept name -- TODO confirm.
        parsed_concept_values.append({concept: None for concept in concept_list})
    else:
        parsed_concept_values.append(parse_entry(values.split('\n'), concept_list))
concept_values = pd.DataFrame(parsed_concept_values)

## Strategy 2: Function Code Generation

In [None]:
# All prompts of the function-generation strategy are read with the same strategy / directory
func_prompt_options = dict(strategy='func', dir=root_dir)

# Step 1 prompt, then the system and individual prompts of step 2
function_prompt_natural_language = read_prompt(dataset_name, is_system=False, step_idx=1, **func_prompt_options)
system_prompt_function_generation = read_prompt(dataset_name, is_system=True, step_idx=2, **func_prompt_options)
function_prompt_function_generation = read_prompt(dataset_name, is_system=False, step_idx=2, **func_prompt_options)

In [None]:
# Generate one function per concept.
# Note: func_list stays empty here; this loop only hands the results to write_function.
func_list = []
llm_model = 'gpt-4-1106-preview'
for concept in concept_list:
    # First request: the reply is stored as the natural-language description
    description_prompt = function_prompt_natural_language.format(property_name=concept)
    natural_language_description = get_completion_gpt(prompt=description_prompt, model=llm_model)

    # Second request: replay the first exchange as history and ask for the function
    generation_prompt = function_prompt_function_generation.format(property_name=concept)
    function_output = get_completion_gpt(
        prompt=generation_prompt,
        system_prompt=system_prompt_function_generation,
        history=[description_prompt, natural_language_description],
        model=llm_model,
    )

    write_function(
        dataset=normalize_dataset_name(dataset_name),
        llm_model=llm_model,
        property_name=concept,
        description=natural_language_description,
        function_output=function_output,
    )
    print(f'Generated function for {concept}')

In [None]:
# Prepare per-graph inputs for the generated functions: a dense adjacency matrix
# plus the converted node and edge feature vectors
num_graphs = len(dataset.graphs)

adjs, node_features_graphs, edge_features_graphs = [], [], []

for graph_idx in range(num_graphs):
    graph = dataset.graphs[graph_idx]
    adjs.append(graph.adj().to_dense())
    node_features_graphs.append(list(map(atom_feature_vector_to_dict_full_name, graph.ndata['feat'])))
    edge_features_graphs.append(list(map(bond_feature_vector_to_dict_full_name, graph.edata['feat'])))

Load the generated functions into the list `func_list` before running the next cell: one function per concept, in the same order as `concept_list`.

In [None]:
# Evaluate every concept function on every graph (one record per graph).
# zip() stops at the shorter input, so func_list must hold one function per
# concept, in the same order as concept_list.
concept_values = pd.DataFrame([
    {
        concept: func(adjs[i], node_features_graphs[i], edge_features_graphs[i])
        for concept, func in zip(concept_list, func_list)
    }
    for i in range(num_graphs)
])

## Strategy 3: External Tool Calling

In [70]:
# Prompts for the external-tool strategy (these replace the step-2 prompts of Strategy 1)
tool_prompt_options = dict(strategy='tool', dir=root_dir)
system_prompt_step_2 = read_prompt(dataset_name, is_system=True, **tool_prompt_options)
individual_prompt_step_2 = read_prompt(dataset_name, is_system=False, **tool_prompt_options)

In [None]:
# Send the bulleted concept list to the LLM and parse the reply line by line
tool_prompt = individual_prompt_step_2.format(property_list='- '+'\n- '.join(concept_list))
tool_response = get_completion_gpt(tool_prompt, system_prompt_step_2, model="gpt-4-1106-preview")
concept_tool_list = parse_tools_list(tool_response.split('\n'))
concept_tool_list

In [96]:
# Warnings emitted while this cell runs may be ignored
parsed_concept_values = [
    get_rdkit_values(rdkit.Chem.MolFromSmiles(smiles), concept_tool_list)
    for smiles in SMILES
]
# Drop every column that contains a missing value.
# NOTE(review): Step 3 maps selected column positions back to names via concept_list;
# after dropping columns those positions may no longer line up -- please verify.
concept_values = pd.DataFrame(parsed_concept_values).dropna(axis=1)

# Step 3: CM Fitting and Concept Selection

In [97]:
# Feature matrix built from the concept values (one row per molecule)
X = concept_values.to_numpy()

In [101]:
# Use the train / valid / test split that ships with the dataset
split_idx = datasets.get_idx_split()

train_idx, valid_idx, test_idx = (
    split_idx[part].numpy() for part in ('train', 'valid', 'test')
)

# Slice features and targets with the same index arrays
X_train, X_valid, X_test = (X[idx] for idx in (train_idx, valid_idx, test_idx))
y_train, y_valid, y_test = (y[idx] for idx in (train_idx, valid_idx, test_idx))

In [None]:
# Data cleaning: run fill_nones_with_column_average on each split separately.
# NOTE(review): each split is filled from its own data rather than from training
# statistics -- confirm this is intended.
(
    (X_train_clean, rows_filled_train),
    (X_valid_clean, rows_filled_valid),
    (X_test_clean, rows_filled_test),
) = [fill_nones_with_column_average(split) for split in (X_train, X_valid, X_test)]

## Linear Regression / Logistic Regression

In [104]:
# Decide the task type from the normalised dataset name
normalized_name = normalize_dataset_name(dataset_name)
is_regression_task = normalized_name in ('esol', 'freesolv')
is_classification_task = normalized_name in ('bbbp', 'bace')

In [None]:
# Feature scaling: fit the scaler on the training split only, then apply it to every split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_clean)
X_valid_scaled = scaler.transform(X_valid_clean)
X_test_scaled = scaler.transform(X_test_clean)

# Use linear regression + AIC feature selection for regression tasks (ESOL, FreeSolv)
# Use logistic regression + RFE feature selection for classification tasks (BBBP, BACE)
if is_regression_task:
    # Feature selection (best_concepts is used below as column indices)
    best_aic, best_concepts = select_features_AIC(X_train_scaled, y_train, X_valid_scaled, y_valid)
    best_concept_names = [concept_list[i] for i in best_concepts]
    # Keep the original concept order: a plain set difference has no stable order,
    # and this list is later inserted into the refinement prompt.
    selected_names = set(best_concept_names)
    non_best_concept_names = [name for name in dict.fromkeys(concept_list) if name not in selected_names]
    print(f'Best AIC: {best_aic}')
    print("Best features by AIC:", end="\n  * ")
    # Unpack the list so that `sep` puts every feature on its own bullet line
    print(*best_concept_names, sep="\n  * ")

    # Use selected features
    X_train_subset = X_train_scaled[:, best_concepts]
    X_valid_subset = X_valid_scaled[:, best_concepts]
    X_test_subset = X_test_scaled[:, best_concepts]

    # Fit and predict
    model = LinearRegression()
    model.fit(X_train_subset, y_train)

    y_pred_test = model.predict(X_test_subset)
    print("Test Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred_test)))

elif is_classification_task:
    # Feature selection (best_concepts is used below as column indices)
    best_auc, best_concepts = select_features_RFE(X_train_scaled, y_train, X_valid_scaled, y_valid)
    best_concept_names = [concept_list[i] for i in best_concepts]
    # Keep the original concept order: a plain set difference has no stable order,
    # and this list is later inserted into the refinement prompt.
    selected_names = set(best_concept_names)
    non_best_concept_names = [name for name in dict.fromkeys(concept_list) if name not in selected_names]
    print(f'Best AUC-ROC: {best_auc}')
    print("Best features by RFE:", end="\n  * ")
    # Unpack the list so that `sep` puts every feature on its own bullet line
    print(*best_concept_names, sep="\n  * ")

    # Use selected features
    X_train_subset = X_train_scaled[:, best_concepts]
    X_valid_subset = X_valid_scaled[:, best_concepts]
    X_test_subset = X_test_scaled[:, best_concepts]

    # Fit and predict
    model = LogisticRegression()
    model.fit(X_train_subset, y_train)

    # Hard class predictions are kept for inspection
    y_pred_test = model.predict(X_test_subset)
    # ROC-AUC needs a continuous score: use the predicted probability of the
    # positive class (second column) instead of thresholded 0/1 labels.
    y_score_test = model.predict_proba(X_test_subset)[:, 1]
    print(f'Test AUC-ROC: {roc_auc_score(y_test, y_score_test)}')
else:
    print("Warning: invalid dataset")

## MLP

In [None]:
# Regularisation strengths (alpha) explored by the grid search
alpha_values = [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30, 50, 80]

# Pick the estimator class for the task; both tasks share the same network settings
if is_regression_task:
    mlp_class = MLPRegressor
elif is_classification_task:
    mlp_class = MLPClassifier
else:
    # Without this guard `mlp` would be undefined (or stale from an earlier run)
    raise ValueError(f'Invalid dataset for MLP fitting: {dataset_name!r}')
mlp_settings = dict(hidden_layer_sizes=(200,), max_iter=5000, solver="adam", learning_rate_init=1e-4)
mlp = mlp_class(**mlp_settings)

# Combine training and validation sets and create a PredefinedSplit:
# -1 keeps a row out of every validation fold (training rows), 0 puts it in fold 0 (validation rows)
X_combined = np.vstack((X_train_scaled, X_valid_scaled))
y_combined = np.hstack((y_train, y_valid))
split_index = [-1]*len(X_train_scaled) + [0]*len(X_valid_scaled)
pds = PredefinedSplit(test_fold=split_index)

# Set up GridSearchCV
param_grid = {'alpha': alpha_values}
grid_search = GridSearchCV(mlp, param_grid, cv=pds)

# Fit grid search
grid_search.fit(X_combined, y_combined)

# Best alpha value
best_alpha = grid_search.best_params_['alpha']

# Refit on the training split only with the selected alpha, then evaluate on the test split
mlp = mlp_class(alpha=best_alpha, **mlp_settings)
mlp.fit(X_train_scaled, y_train)

if is_regression_task:
    y_pred = mlp.predict(X_test_scaled)
    print("Test: Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))
else:
    # Hard class predictions are kept for inspection
    y_pred_test = mlp.predict(X_test_scaled)
    # ROC-AUC needs a continuous score: use the predicted probability of the
    # positive class (second column) instead of thresholded 0/1 labels.
    y_score_test = mlp.predict_proba(X_test_scaled)[:, 1]
    print(f'Test AUC-ROC: {roc_auc_score(y_test, y_score_test)}')

# Iterative Concept Refinement

In [None]:
# Read the individual prompt template for step 3
individual_prompt_step_3 = read_prompt(
    dataset_name,
    llm_model='GPT-3.5 turbo',
    step_idx=3,
    is_individual=True,
    dir=root_dir,
)

In [None]:
# Build the refinement prompt from the selected and the non-selected concepts
refinement_prompt = individual_prompt_step_3.format(
    best_features='\n - '+'\n - '.join(best_concept_names)+'\n',
    non_best_features=non_best_concept_names,
    num_features=len(non_best_concept_names),
)
new_concept_response = get_completion_gpt(refinement_prompt, system_prompt_step_1)

# Assumes each reply line starts with a two-character bullet such as "- " -- TODO confirm
# against the step-3 prompt. Blank lines (e.g. a trailing newline) are skipped so they do
# not turn into empty concepts, and surrounding whitespace / "\r" is removed.
new_concept_list = []
for line in new_concept_response.splitlines():
    concept = line.strip()[2:].strip().lower()
    if concept:
        new_concept_list.append(concept)
print('- '+'\n- '.join(new_concept_list))

Please repeat Step 2 (Concept Labeling) to generate concept values for the new concepts, then re-run Step 3 ...