In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under /content/drive/MyDrive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install rdkit-pypi


Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
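# NOTE: superseded draft, kept for reference only. It wrote 2048-bit
# fingerprints per molecule to a CSV file; the active implementation is the
# chunked .npz cell that follows.
# import pandas as pd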
# from rdkit import Chem
# from rdkit.Chem import AllChem

# # Load data
# data = pd.read_csv('/content/drive/MyDrive/BTP/chemprop_data.csv')

# # Function to get Morgan fingerprint as a list of bits
# def get_morgan_fingerprint_bits(smiles, radius=2, nBits=2048):
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
#         return list(map(int, fingerprint.ToBitString()))
#     else:
#         return [0] * nBits  # Or handle missing values differently if desired

# # Generate fingerprints for each SMILES column
# data['fingerprint_1'] = data['canonical_smiles_1'].apply(lambda x: get_morgan_fingerprint_bits(x))
# data['fingerprint_2'] = data['canonical_smiles_2'].apply(lambda x: get_morgan_fingerprint_bits(x))

# # Expand fingerprints into individual columns for Chemprop
# fingerprint_1_df = pd.DataFrame(data['fingerprint_1'].tolist(), index=data.index)
# fingerprint_1_df.columns = [f'fingerprint_1_bit_{i}' for i in range(fingerprint_1_df.shape[1])]

# fingerprint_2_df = pd.DataFrame(data['fingerprint_2'].tolist(), index=data.index)
# fingerprint_2_df.columns = [f'fingerprint_2_bit_{i}' for i in range(fingerprint_2_df.shape[1])]

# # Drop the original list columns and join expanded bit columns
# data = data.drop(columns=['fingerprint_1', 'fingerprint_2'])
# descriptor_data = pd.concat([data, fingerprint_1_df, fingerprint_2_df], axis=1)

# # Save descriptor file for Chemprop
# descriptor_data.to_csv('/content/drive/MyDrive/BTP/descriptors_for_chemprop.csv', index=False)

# print(descriptor_data.head())

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# Function to get Morgan fingerprint as a list of bits
def get_morgan_fingerprint_bits(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint of a SMILES string as a list of 0/1 ints.

    Args:
        smiles: SMILES string of the molecule. Missing values (``None``,
            ``NaN`` or any other non-string, e.g. from an empty CSV cell)
            are treated the same way as an unparsable SMILES.
        radius: Morgan fingerprint radius passed to RDKit.
        nBits: Length of the returned bit list.

    Returns:
        A list of ``nBits`` ints, each 0 or 1. An all-zero list is returned
        when ``smiles`` is missing or RDKit cannot parse it, so the output
        stays aligned one-to-one with the input rows.
    """
    # Empty CSV cells arrive from pandas as float NaN; RDKit's MolFromSmiles
    # raises on non-string input instead of returning None, so guard first.
    if not isinstance(smiles, str):
        return [0] * nBits

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0] * nBits  # Handle invalid SMILES with all-zeros

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return list(map(int, fingerprint.ToBitString()))

# Process data in chunks and save as npz file.
# Each chunk is converted straight into a compact uint8 array (the bits are
# only 0/1, so uint8 is lossless) instead of accumulating nested Python lists
# of ints for the whole dataset, which needs far more memory than the final
# array. Cast with .astype(...) after loading if a wider dtype is required.
all_fingerprints = []  # One (rows_in_chunk, 2 * nBits) uint8 array per chunk

# The reader is used as a context manager so the CSV file handle is closed
# once all chunks are consumed (or if an error interrupts the loop).
with pd.read_csv('/content/drive/MyDrive/BTP/chemprop_data.csv', chunksize=1000) as data:
    for chunk in data:
        if chunk.empty:
            continue

        # Calculate fingerprints for each SMILES column in the chunk
        fingerprints_1 = np.array(
            [get_morgan_fingerprint_bits(smiles) for smiles in chunk['canonical_smiles_1']],
            dtype=np.uint8,
        )
        fingerprints_2 = np.array(
            [get_morgan_fingerprint_bits(smiles) for smiles in chunk['canonical_smiles_2']],
            dtype=np.uint8,
        )

        # Combine fingerprints: in every row the bits of molecule 1 are
        # followed by the bits of molecule 2
        all_fingerprints.append(np.hstack([fingerprints_1, fingerprints_2]))

if not all_fingerprints:
    raise ValueError("No rows found in chemprop_data.csv; nothing to save.")

# Convert all fingerprints to a single numpy array (row order matches the CSV)
fingerprints_array = np.concatenate(all_fingerprints, axis=0)

# Save to npz file
np.savez_compressed('/content/drive/MyDrive/BTP/descriptors_for_chemprop.npz', arr_0=fingerprints_array)

print("Descriptor data saved in Chemprop-compatible .npz format.")




Descriptor data saved in Chemprop-compatible .npz format.


In [None]:
import numpy as np

# Load the .npz file; the context manager closes the underlying archive
# once the array has been read into memory
with np.load('/content/drive/MyDrive/BTP/descriptors_for_chemprop.npz') as npz_file:
    # View the keys in the .npz file
    print("Keys in the .npz file:", npz_file.files)

    # The descriptors are saved under the key 'arr_0'. Archives written with
    # another key name (e.g. 'fingerprints') are still readable through the
    # fallback to the first stored array.
    array_key = 'arr_0' if 'arr_0' in npz_file.files else npz_file.files[0]
    fingerprints_array = npz_file[array_key]

print("Shape of fingerprints array:", fingerprints_array.shape)

# Display the first few rows of the array
print("First few rows of fingerprints data:")
print(fingerprints_array[:5])


Keys in the .npz file: ['fingerprints']
Shape of fingerprints array: (170254, 2048)
First few rows of fingerprints data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


CSV to NPZ

In [None]:
import numpy as np
import pandas as pd

# Load the descriptors from the CSV
descriptors_df = pd.read_csv('/content/drive/MyDrive/BTP/descriptors_train.csv')

# Coerce every column to numeric; values that cannot be parsed become NaN
coerced_descriptors_df = descriptors_df.apply(pd.to_numeric, errors='coerce')

# Keep only the columns that contain no NaN after coercion. This removes
# text columns, but also any numeric column with a single missing value.
numeric_descriptors_df = coerced_descriptors_df.dropna(axis=1, how='any')

# Report what was removed so a change in feature width does not go unnoticed
dropped_columns = [
    column for column in coerced_descriptors_df.columns
    if column not in numeric_descriptors_df.columns
]
if dropped_columns:
    print(
        f"Dropped {len(dropped_columns)} column(s) with non-numeric or missing values: "
        f"{dropped_columns}"
    )

# An array without any feature column is never a useful descriptor file
if numeric_descriptors_df.shape[1] == 0:
    raise ValueError("No fully numeric columns left in descriptors_train.csv; nothing to save.")

# Convert to NumPy array
descriptors_array = numeric_descriptors_df.to_numpy(dtype=np.float32)  # Ensure float type

# Save using np.savez to match the format expected by Chemprop
np.savez('/content/drive/MyDrive/BTP/descriptors_train.npz', arr_0=descriptors_array)
