In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG
from rdkit.Chem import rdFingerprintGenerator
from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector
from qiskit.circuit import Parameter, ParameterVector
from qiskit_machine_learning.algorithms.classifiers import NeuralNetworkClassifier, VQC
from qiskit_machine_learning.algorithms.regressors import NeuralNetworkRegressor, VQR
from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
from qiskit_machine_learning.utils.loss_functions.loss_functions import CrossEntropyLoss
from qiskit_algorithms.optimizers import COBYLA, L_BFGS_B
from qiskit.circuit.library import Initialize
import matplotlib.pyplot as plt
import deepchem as dc
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import os

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!


Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Skipped loading some Jax models, missing a dependency. No module named 'haiku'


In [2]:
# Featurise the BACE molecules as RDKit fingerprints, report molecules whose
# fingerprints collide, and split the dataset by scaffold.
num_qubits = 12
fp_size = 2**num_qubits  # fingerprint length in bits
fpgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=fp_size)
# Alternative generators that were tried:
# fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=5, includeChirality=True, fpSize=2**num_qubits)
# fpgen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2**num_qubits, includeChirality=True)


def _fingerprint_matrix(smiles_list, generator, size):
    """Build a (len(smiles_list), size) float array holding one fingerprint per SMILES.

    Args:
        smiles_list: sequence of SMILES strings.
        generator: RDKit fingerprint generator exposing ``GetFingerprint``.
        size: number of bits produced by ``generator``.

    Returns:
        2-D numpy array whose row ``k`` is the bit vector of ``smiles_list[k]``.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    matrix = np.zeros([len(smiles_list), size])
    for row, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of inside GetFingerprint.
            raise ValueError(f'RDKit could not parse SMILES at position {row}: {smiles!r}')
        bits = np.zeros((0, ))  # resized in place by ConvertToNumpyArray
        DataStructs.ConvertToNumpyArray(generator.GetFingerprint(mol), bits)
        matrix[row] = bits  # raw bit vector, no normalisation applied
    return matrix


df = pd.read_csv('bace.csv')
df_smiles = df['mol'].to_numpy()
df_classes = df['Class'].to_numpy()
df_classes = df_classes + 1  # shift every class label up by one
df_morgan = _fingerprint_matrix(df_smiles, fpgen, fp_size)

# Report groups of molecules that share an identical fingerprint: first the
# shape of the array of repeated fingerprints, then the row indices per group.
unq, count = np.unique(df_morgan, axis=0, return_counts=True)
repeated_groups = unq[count > 1]
print(repeated_groups.shape)
for repeated_group in repeated_groups:
    repeated_idx = np.argwhere(np.all(df_morgan == repeated_group, axis=1))
    print(repeated_idx.ravel())

# The class labels are stored as X and the SMILES as ids so that the scaffold
# splitter can read the structures from ids and the labels travel with them.
dataset = dc.data.DiskDataset.from_numpy(X=df_classes,ids=df_smiles)
scaffoldsplitter = dc.splits.ScaffoldSplitter()
train, valid, test = scaffoldsplitter.train_valid_test_split(dataset)
train_classes = train.X
valid_classes = valid.X
test_classes = test.X
train_smiles = train.ids
valid_smiles = valid.ids
test_smiles = test.ids
train_num = len(train_smiles)
valid_num = len(valid_smiles)
test_num = len(test_smiles)

# Re-featurise each split in the order returned by the splitter.
df_train_morgan = _fingerprint_matrix(train_smiles, fpgen, fp_size)
df_valid_morgan = _fingerprint_matrix(valid_smiles, fpgen, fp_size)
df_test_morgan = _fingerprint_matrix(test_smiles, fpgen, fp_size)

(46, 4096)
[ 181 1488]
[ 136 1025]
[121 939]
[  3 581]
[1369 1475]
[ 646 1034]
[ 71 638]
[128 991]
[109 866]
[ 139 1043]
[ 108 1436]
[ 161 1284]
[127 984]
[123 232]
[ 164 1303]
[1248 1362]
[ 152 1225]
[ 137 1030]
[ 590 1050]
[463 684 959]
[  35 1143]
[ 955 1214]
[ 15  32 158 458]
[ 672 1020]
[298 843]
[ 22 336]
[326 357]
[124 286]
[687 689]
[  50 1130]
[  51 1241]
[ 385 1217]
[ 383 1173]
[ 932 1412]
[ 626 1413]
[ 122 1411]
[ 60 126]
[ 468 1005]
[ 986 1294]
[ 37 140]
[ 97 781]
[1394 1474]
[ 796 1341]
[39 68]
[284 370]
[ 369 1009]


In [16]:
# Project the fingerprints onto 2**reduced_qubits principal components,
# L2-normalise every sample, and export the non-zero entries of each split
# together with their 1-based (sample, component) indices.
reduced_qubits = 2
df_combined = np.concatenate([df_train_morgan, df_valid_morgan, df_test_morgan], axis=0)
# NOTE(review): the PCA is fitted on train+valid+test together, so valid/test
# samples influence the projection applied to the training set. Confirm this
# is intended; fitting on the training split only would avoid the leakage.
pca = PCA(n_components=2**reduced_qubits, random_state=53,  svd_solver='full')
pca_combined = pca.fit_transform(df_combined)
pca_combined = normalize(pca_combined, norm='l2', axis=1)
pca_train = pca_combined[:train_num]
pca_valid = pca_combined[train_num:train_num+valid_num]
pca_test = pca_combined[train_num+valid_num:]


def _nonzero_entries(matrix):
    """Return the non-zero values of ``matrix`` and their 1-based positions.

    Args:
        matrix: 2-D numpy array.

    Returns:
        (values, positions): ``values`` is a 1-D array of the non-zero entries
        in row-major order; ``positions`` is an integer array of shape (k, 2)
        holding the matching (row + 1, column + 1) pairs. Both are empty when
        ``matrix`` has no non-zero entry.
    """
    rows, cols = np.nonzero(matrix)  # row-major, same order as a nested i/j loop
    return matrix[rows, cols], np.column_stack([rows, cols]).astype(int) + 1


df_pca_con_train_morgan, df_pca_new_con_train_indices = _nonzero_entries(pca_train)
df_pca_con_valid_morgan, df_pca_new_con_valid_indices = _nonzero_entries(pca_valid)
df_pca_con_test_morgan, df_pca_new_con_test_indices = _nonzero_entries(pca_test)
# Flat [i0, j0, i1, j1, ...] views of the same 1-based indices.
df_pca_con_train_indices = df_pca_new_con_train_indices.ravel()
df_pca_con_valid_indices = df_pca_new_con_valid_indices.ravel()
df_pca_con_test_indices = df_pca_new_con_test_indices.ravel()
# All-zero single-column input written alongside each split.
df_train_input = np.zeros([len(train_smiles),1])
df_valid_input = np.zeros([len(valid_smiles),1])
df_test_input = np.zeros([len(test_smiles),1])

# Output root defaults to the original location; set QML_DATA_ROOT to write
# somewhere else without editing the notebook.
qml_data_root = os.environ.get('QML_DATA_ROOT', '/Users/choyboy/Documents/Python/QML')
newpath = f'{qml_data_root}/bace_dataset_topol_split/bace_pca_{num_qubits}q{reduced_qubits}'
os.makedirs(newpath, exist_ok=True)

# Four files per split, written in train / valid / test order.
for split_name, split_values, split_indices, split_classes, split_input in (
    ('train', df_pca_con_train_morgan, df_pca_new_con_train_indices, train_classes, df_train_input),
    ('valid', df_pca_con_valid_morgan, df_pca_new_con_valid_indices, valid_classes, df_valid_input),
    ('test', df_pca_con_test_morgan, df_pca_new_con_test_indices, test_classes, df_test_input),
):
    np.savetxt(f'{newpath}/qc_pca_{split_name}_initial_bace_{num_qubits}q{reduced_qubits}', split_values, delimiter='\t')
    np.savetxt(f'{newpath}/qc_pca_{split_name}_initial_index_bace_{num_qubits}q{reduced_qubits}', split_indices.astype(int), fmt='%s', delimiter='\t')
    np.savetxt(f'{newpath}/qc_pca_{split_name}_classes_bace_{num_qubits}q{reduced_qubits}', split_classes, fmt='%s', delimiter='\t')
    np.savetxt(f'{newpath}/qc_pca_{split_name}_input_bace_{num_qubits}q{reduced_qubits}', split_input, delimiter='\t')

In [3]:
# Project the fingerprints onto reduced_qubits principal components (one per
# qubit), L2-normalise every sample, scale by pi, and export each split.
# This cell rebinds pca / pca_train / pca_valid / pca_test; any other cell
# that defines those names must be re-run before its values are used again.
reduced_qubits = 2
df_combined = np.concatenate([df_train_morgan, df_valid_morgan, df_test_morgan], axis=0)
# NOTE(review): the PCA is fitted on train+valid+test together, so valid/test
# samples influence the projection applied to the training set. Confirm this
# is intended; fitting on the training split only would avoid the leakage.
pca = PCA(n_components=reduced_qubits, random_state=53,  svd_solver='full')
pca_combined = pca.fit_transform(df_combined)
pca_combined = normalize(pca_combined, norm='l2', axis=1)
# Unit-norm rows have entries in [-1, 1], so the scaled values lie in
# [-pi, pi] (presumably used as rotation angles -- confirm against the consumer).
pca_train = pca_combined[:train_num] * np.pi
pca_valid = pca_combined[train_num:train_num+valid_num] * np.pi
pca_test = pca_combined[train_num+valid_num:] * np.pi

# Output root defaults to the original location; set QML_DATA_ROOT to write
# somewhere else without editing the notebook.
qml_data_root = os.environ.get('QML_DATA_ROOT', '/Users/choyboy/Documents/Python/QML')
newpath = f'{qml_data_root}/bace_dataset_topol_angle_split/bace_pca_{num_qubits}q{reduced_qubits}'
os.makedirs(newpath, exist_ok=True)

# Two files per split (labels, then inputs), in train / valid / test order.
for split_name, split_classes, split_input in (
    ('train', train_classes, pca_train),
    ('valid', valid_classes, pca_valid),
    ('test', test_classes, pca_test),
):
    np.savetxt(f'{newpath}/qc_pca_{split_name}_classes_bace_{num_qubits}q{reduced_qubits}', split_classes, fmt='%s', delimiter='\t')
    np.savetxt(f'{newpath}/qc_pca_{split_name}_input_bace_{num_qubits}q{reduced_qubits}', split_input, delimiter='\t')