In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

In [5]:
# Read the data from the CSV file
d_qml_raw = pd.read_csv('../data/raw/QAOA-Layer-Exploration_runs.csv', index_col=0)

In [6]:
d_qml_raw.head()
# Unpack the params column into separate columns
d_qml = pd.concat([d_qml_raw.drop(['params'], axis=1), d_qml_raw['params'].apply(eval).apply(pd.Series)], axis=1)

In [7]:
graph_features = [
    'acyclic',
    'algebraic_connectivity',
    'average_distance',
    'bipartite',
    'clique_number',
    'coefficient_of_variation',
    'connected',
    'density',
    'diameter',
    'edge_connectivity',
    'entropy',
    'eulerian',
    'first_quartile',
    'interquartile_range',
    'is_distance_regular',
    'kurtosis_weight',
    'laplacian_largest_eigenvalue',
    'laplacian_second_largest_eigenvalue',
    'maximum_degree',
    'maximum_weighted_degree',
    'mean_weight',
    'median_weight',
    'minimum_degree',
    'minimum_dominating_set',
    'minimum_weighted_degree',
    'number_of_components',
    'number_of_cut_vertices',
    'number_of_edges',
    'number_of_minimal_odd_cycles',
    'number_of_orbits',
    'number_of_vertices',
    'planar',
    'radius',
    'range_weight',
    'ratio_of_two_largest_laplacian_eigenvaleus',
    'regular',
    'skewness_weight',
    'smallest_eigenvalue',
    'std_dev_weight',
    'third_quartile',
    'variance_weight',
    'vertex_connectivity',
    'weighted_average_clustering',
    'weighted_average_shortest_path_length',
    'weighted_diameter',
    'weighted_radius'
]
# select the graph features
d_features = d_qml[['run_id'] + graph_features] 

In [11]:
import json
# Read the QAOA optimal parameters for the graphs based on the run_id
def get_optimal_params(run_id, depth=2):
    # find the file with the optimal parameters
    filepath = f'../optimized_parameters/{run_id}_optimized_parameters.json'
    # read the optimal parameters as json
    with open(filepath, 'r') as f:
        optimal_params = json.load(f)
    # return the optimal parameters for a given depth
    optimal_params = optimal_params[depth-1]
    return optimal_params

get_optimal_params("5de57b42357f4015a2b713a16bcec195", 2)


{'layer': 2,
 'gamma': [-82.62166411505994, -5.285405489169279],
 'beta': [27.140365056353623, 8.222685413023193]}

In [12]:
# drop duplicates
d_features = d_features.drop_duplicates()

In [13]:
d_features['optimal_params'] = d_features['run_id'].apply(lambda x: get_optimal_params(x, 2))

In [14]:
print(f"Conducting feature engineering on {d_features.shape[0]} samples with {d_features.shape[1]} features")
# Extract the features (all columns except 'optimal_params')
X = d_features.drop(columns=['run_id', 'optimal_params'])
# Remove features that are constant
X = X.loc[:, X.apply(pd.Series.nunique) != 1]
print(f"Removed constant feature, {X.shape[1]} features remaining")
bool_columns = X.select_dtypes(include=['bool']).columns
# Convert `bool` columns to `int`
X[bool_columns] = X[bool_columns].astype(int)

Conducting feature engineering on 1914 samples with 48 features
Removed constant feature, 42 features remaining


In [15]:
# Create y as the optimal parameters
y = d_features['optimal_params']
# Extract the gamma and beta parameters
y = pd.DataFrame(y.tolist(), columns=['gamma', 'beta'])
# Unlist the optimal parameters into separate columns with `gamma_i` and `beta_i` as column names
y = pd.concat([y['gamma'].apply(pd.Series).add_prefix('gamma_'), y['beta'].apply(pd.Series).add_prefix('beta_')], axis=1)
# 

In [16]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
X_train.shape, y_train.shape

((1531, 42), (1531, 4))