In [12]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG
from rdkit.Chem import rdFingerprintGenerator
import matplotlib.pyplot as plt
import deepchem as dc
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import os

In [14]:
# Fingerprint length is tied to the qubit count: 2**num_qubits bits per molecule.
num_qubits = 12
fpgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2**num_qubits)

# Load the dataset; 'mol' holds the SMILES strings and 'Class' the labels.
df = pd.read_csv('bace.csv')
df_smiles = df['mol'].to_numpy()
# Shift every class label up by one (presumably 0/1 -> 1/2; confirm against the consumer).
df_classes = df['Class'].to_numpy() + 1

# One row of fingerprint bits per molecule, in the same order as df_smiles.
df_fp = np.zeros([len(df_smiles), 2**num_qubits])
for i, smiles in enumerate(df_smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = fpgen.GetFingerprint(mol)
    # ConvertToNumpyArray resizes the target array to the fingerprint length.
    array = np.zeros((0, ))
    DataStructs.ConvertToNumpyArray(fp, array)
    df_fp[i] = array

# Report how many distinct fingerprints are shared by more than one molecule.
unq, count = np.unique(df_fp, axis=0, return_counts=True)
repeated_groups = unq[count > 1]
print(repeated_groups.shape)
# Uncomment to list the row indices belonging to each duplicated fingerprint:
# for repeated_group in repeated_groups:
#     repeated_idx = np.argwhere(np.all(df_fp == repeated_group, axis=1))
#     print(repeated_idx.ravel())

def smiles_to_fp_matrix(smiles_list, generator, fp_size):
    """Build a dense fingerprint matrix for a sequence of SMILES strings.

    Args:
        smiles_list: sequence of SMILES strings, one per molecule.
        generator: RDKit fingerprint generator exposing ``GetFingerprint(mol)``.
        fp_size: number of bits per fingerprint (the width of the returned matrix).

    Returns:
        A float array of shape ``(len(smiles_list), fp_size)`` whose row ``k``
        holds the fingerprint bits of ``smiles_list[k]``.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    matrix = np.zeros([len(smiles_list), fp_size])
    for row, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail with a
            # readable message instead of an opaque error inside GetFingerprint.
            raise ValueError(f'RDKit could not parse SMILES at position {row}: {smiles!r}')
        # ConvertToNumpyArray resizes the target array to the fingerprint length.
        bits = np.zeros((0, ))
        DataStructs.ConvertToNumpyArray(generator.GetFingerprint(mol), bits)
        matrix[row] = bits
    return matrix


# The class labels are stored in the X slot only so that they travel with their
# SMILES ids through the split; the fingerprints are recomputed per split below.
dataset = dc.data.DiskDataset.from_numpy(X=df_classes,ids=df_smiles)
scaffoldsplitter = dc.splits.ScaffoldSplitter()
train, valid, test = scaffoldsplitter.train_valid_test_split(dataset)

train_classes = train.X
valid_classes = valid.X
test_classes = test.X

train_smiles = train.ids
valid_smiles = valid.ids
test_smiles = test.ids

train_num = len(train_smiles)
valid_num = len(valid_smiles)
test_num = len(test_smiles)
# Every molecule must land in exactly one split.
assert train_num + valid_num + test_num == len(df_smiles), 'split sizes do not add up'

# One fingerprint matrix per split, rows ordered like the corresponding *_smiles array.
df_train_fp = smiles_to_fp_matrix(train_smiles, fpgen, 2**num_qubits)
df_valid_fp = smiles_to_fp_matrix(valid_smiles, fpgen, 2**num_qubits)
df_test_fp = smiles_to_fp_matrix(test_smiles, fpgen, 2**num_qubits)

(46, 4096)


In [11]:
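# NOTE(review): this entire cell is commented out and does not run. It appears to be an
# earlier export variant that keeps 2**reduced_qubits PCA components and saves only the
# non-zero entries together with their 1-based [row, column] indices.
# reduced_qubits = 3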
# df_combined = np.concatenate([df_train_fp, df_valid_fp, df_test_fp], axis=0)
# pca = PCA(n_components=2**reduced_qubits)
# pca_combined = pca.fit_transform(df_combined)
# pca_combined = normalize(pca_combined, norm='l2', axis=1)
# pca_train = pca_combined[:train_num]
# pca_valid = pca_combined[train_num:train_num+valid_num]
# pca_test = pca_combined[train_num+valid_num:]

# n = 0
# for i in range(len(train_smiles)):
#     for j in range(2**reduced_qubits):
#         if pca_train[i,j] != 0:
#             if n == 0:
#                 df_pca_con_train_fp = pca_train[i,j]
#                 df_pca_con_train_indices = np.array([i,j], dtype=int)
#                 n = n + 1
#             else:
#                 df_pca_con_train_fp = np.append(df_pca_con_train_fp, pca_train[i,j])
#                 df_pca_con_train_indices = np.append(df_pca_con_train_indices, np.array([i,j]))
# n = 0
# for i in range(len(valid_smiles)):
#     for j in range(2**reduced_qubits):
#         if pca_valid[i,j] != 0:
#             if n == 0:
#                 df_pca_con_valid_fp = pca_valid[i,j]
#                 df_pca_con_valid_indices = np.array([i,j], dtype=int)
#                 n = n + 1
#             else:
#                 df_pca_con_valid_fp = np.append(df_pca_con_valid_fp, pca_valid[i,j])
#                 df_pca_con_valid_indices = np.append(df_pca_con_valid_indices, np.array([i,j]))
# n = 0
# for i in range(len(test_smiles)):
#     for j in range(2**reduced_qubits):
#         if pca_test[i,j] != 0:
#             if n == 0:
#                 df_pca_con_test_fp = pca_test[i,j]
#                 df_pca_con_test_indices = np.array([i,j], dtype=int)
#                 n = n + 1
#             else:
#                 df_pca_con_test_fp = np.append(df_pca_con_test_fp, pca_test[i,j])
#                 df_pca_con_test_indices = np.append(df_pca_con_test_indices, np.array([i,j]))



# df_pca_con_train_fp = df_pca_con_train_fp.T
# df_pca_con_valid_fp = df_pca_con_valid_fp.T
# df_pca_con_test_fp = df_pca_con_test_fp.T
# df_pca_con_train_indices = df_pca_con_train_indices + 1
# df_pca_con_valid_indices = df_pca_con_valid_indices + 1
# df_pca_con_test_indices = df_pca_con_test_indices + 1
# df_pca_new_con_train_indices = np.reshape(df_pca_con_train_indices, (-1,2))
# df_pca_new_con_valid_indices = np.reshape(df_pca_con_valid_indices, (-1,2))
# df_pca_new_con_test_indices = np.reshape(df_pca_con_test_indices, (-1,2))
# df_train_input = np.zeros([len(train_smiles),1])
# df_valid_input = np.zeros([len(valid_smiles),1])
# df_test_input = np.zeros([len(test_smiles),1])

# newpath = f'/home/choyboy/Documents/Python/cuda/bace_dataset_topol_split/bace_pca_{num_qubits}q{reduced_qubits}'
# if not os.path.exists(newpath):
#     os.makedirs(newpath)

# np.savetxt(f'{newpath}/qc_pca_train_initial_bace_{num_qubits}q{reduced_qubits}', df_pca_con_train_fp, delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_train_initial_index_bace_{num_qubits}q{reduced_qubits}', df_pca_new_con_train_indices.astype(int), fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_train_classes_bace_{num_qubits}q{reduced_qubits}', train_classes, fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_train_input_bace_{num_qubits}q{reduced_qubits}', df_train_input, delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_valid_initial_bace_{num_qubits}q{reduced_qubits}', df_pca_con_valid_fp, delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_valid_initial_index_bace_{num_qubits}q{reduced_qubits}', df_pca_new_con_valid_indices.astype(int), fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_valid_classes_bace_{num_qubits}q{reduced_qubits}', valid_classes, fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_valid_input_bace_{num_qubits}q{reduced_qubits}', df_valid_input, delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_test_initial_bace_{num_qubits}q{reduced_qubits}', df_pca_con_test_fp, delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_test_initial_index_bace_{num_qubits}q{reduced_qubits}', df_pca_new_con_test_indices.astype(int), fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_test_classes_bace_{num_qubits}q{reduced_qubits}', test_classes, fmt='%s', delimiter='\t')
# np.savetxt(f'{newpath}/qc_pca_test_input_bace_{num_qubits}q{reduced_qubits}', df_test_input, delimiter='\t')

In [11]:
# Reduce each fingerprint to `reduced_qubits` PCA components, L2-normalise every row
# and scale by pi, then write the per-split features and class labels as
# tab-separated text files.
reduced_qubits = 2

# Fit the projection on the training split only, so that no information from the
# validation/test molecules leaks into the preprocessing. random_state is pinned
# because svd_solver='auto' may select the randomized solver for a matrix this
# large, which would otherwise make the exported features differ between runs.
pca = PCA(n_components=reduced_qubits, random_state=0)
pca.fit(df_train_fp)

# Rows are stacked train -> valid -> test; the slices below rely on that order.
df_combined = np.concatenate([df_train_fp, df_valid_fp, df_test_fp], axis=0)
pca_combined = pca.transform(df_combined)
pca_combined = normalize(pca_combined, norm='l2', axis=1)
pca_train = pca_combined[:train_num] * np.pi
pca_valid = pca_combined[train_num:train_num+valid_num] * np.pi
pca_test = pca_combined[train_num+valid_num:] * np.pi
assert len(pca_test) == test_num, 'test slice does not match the test split size'

# NOTE(review): absolute, user-specific output location. Kept unchanged because
# downstream readers may depend on it; consider moving it to a configurable base dir.
newpath = f'/home/choyboy/Documents/Python/cuda/bace_dataset_topol_angle_split/bace_pca_{num_qubits}q{reduced_qubits}'
os.makedirs(newpath, exist_ok=True)

# Two files per split: the class labels and the scaled PCA features.
for split_name, split_classes, split_input in (
    ('train', train_classes, pca_train),
    ('valid', valid_classes, pca_valid),
    ('test', test_classes, pca_test),
):
    np.savetxt(f'{newpath}/qc_pca_{split_name}_classes_bace_{num_qubits}q{reduced_qubits}', split_classes, fmt='%s', delimiter='\t')
    np.savetxt(f'{newpath}/qc_pca_{split_name}_input_bace_{num_qubits}q{reduced_qubits}', split_input, delimiter='\t')