In [None]:
# Install the Weights & Biases client (quiet mode) into this runtime.
!pip install wandb -qqq
import wandb
# Log this runtime in to Weights & Biases.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msasou_moussa[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# RDKit wheels from PyPI; provides the `rdkit` package whose `Chem` module is imported further down.
!pip install rdkit-pypi -qqq

In [None]:
import torch

def format_pytorch_version(version):
  """Return the part of a torch version string that precedes the first '+'.

  A string without '+' is returned unchanged.
  """
  release, _plus, _local = version.partition('+')
  return release

# Version of the installed torch, reduced to the text before any '+' suffix
# so it can be interpolated into the wheel-index URL below.
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Turn a CUDA version string into the tag used in PyG wheel-index URLs.

  Args:
    version: CUDA version such as '11.3', or None. `torch.version.cuda` is
      None for a CPU-only torch build.

  Returns:
    'cu' followed by the version with the dots removed (e.g. 'cu113'), or
    'cpu' when `version` is None so that the CPU wheel index is selected
    instead of crashing on `None.replace`.
  """
  if version is None:
    return 'cpu'
  return 'cu' + version.replace('.', '')

# CUDA version reported by the installed torch build, converted to the tag
# interpolated into the wheel-index URL below.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

# Install the compiled PyG companion packages from the wheel index that matches
# the {TORCH}+{CUDA} combination computed above, then torch-geometric itself.
!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive; the dataset roots used further down
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from rdkit import Chem
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data, Batch, download_url
import numpy as np
import os.path as osp
from torch_geometric.loader import DataLoader
from tqdm import tqdm

In [None]:
def get_bin_feature(r):
    '''
    Build the edge index of the fully connected graph over the reactant atoms.

    Nodes are identified by ``molAtomMapNumber - 1``, so every atom must carry
    an atom-map number.  Every ordered pair ``(i, j)`` (self-pairs included)
    is emitted in row-major order of the sorted node ids.  When the map
    numbers are exactly ``1..n`` this means column ``i * n + j`` describes the
    same atom pair as row ``i * n + j`` of the flattened label matrix returned
    by ``get_bond_label``; iterating in RDKit atom order instead would only
    line up if the SMILES happened to list atoms in map-number order.

    Args:
        r: atom-mapped reactant SMILES string.

    Returns:
        Contiguous ``torch.long`` tensor of shape ``[2, n_atoms * n_atoms]``
        (``[2, 0]`` for an empty molecule).

    Raises:
        ValueError: if RDKit cannot parse ``r``.
    '''
    rmol = Chem.MolFromSmiles(r)
    if rmol is None:
        raise ValueError(f"Could not parse reactant SMILES: {r!r}")

    # One property lookup per atom instead of one per atom pair.
    node_ids = sorted(atom.GetIntProp('molAtomMapNumber') - 1
                      for atom in rmol.GetAtoms())

    pairs = [(i, j) for i in node_ids for j in node_ids]

    # The explicit (len(pairs), 2) view keeps the empty case well defined,
    # where a "-1" dimension would be ambiguous.
    index = torch.tensor(pairs, dtype=torch.long).view(len(pairs), 2)

    return index.t().contiguous()


In [None]:
# Maps a bond order to its channel index in the label vector.
bo_to_index = {
    0.0: 0,   # channel used for "no bond" between an atom pair
    1: 1,
    2: 2,
    3: 3,
    1.5: 4,
}
# Number of label channels.
nbos = len(bo_to_index)
INVALID_BOND = -1

def get_bond_label(react, product):
    '''
    Build one-hot bond-order labels for every ordered pair of reactant atoms.

    Atoms are indexed by ``molAtomMapNumber - 1`` in BOTH molecules, so an
    atom keeps the same row/column whether it is looked up in the reactants
    or in the product.  (RDKit atom indices are local to each molecule and
    cannot be used to address the reactant-sized label matrix.)

    Every reactant pair starts in channel 0 ("no bond"); each bond found in
    the product then moves its pair, in both directions, to the channel given
    by ``bo_to_index`` for that bond order.

    Args:
        react: atom-mapped reactant SMILES.
        product: atom-mapped product SMILES.

    Returns:
        ``numpy`` float array of shape ``[n_atoms * n_atoms, nbos]`` where
        row ``i * n_atoms + j`` is the label of the pair ``(i, j)``.

    Raises:
        ValueError: if either SMILES string cannot be parsed.
        KeyError: if a product bond order is not a key of ``bo_to_index``.
    '''
    rmol = Chem.MolFromSmiles(react)
    if rmol is None:
        raise ValueError(f"Could not parse reactant SMILES: {react!r}")
    pmol = Chem.MolFromSmiles(product)
    if pmol is None:
        raise ValueError(f"Could not parse product SMILES: {product!r}")

    n_atoms = rmol.GetNumAtoms()
    rmap = np.zeros((n_atoms, n_atoms, nbos))

    # Mark every pair of mapped reactant atoms as "no bond" in one vectorised
    # assignment (one property lookup per atom rather than per pair).
    node_ids = np.array(
        [atom.GetIntProp('molAtomMapNumber') - 1 for atom in rmol.GetAtoms()],
        dtype=np.int64,
    )
    rmap[node_ids[:, None], node_ids[None, :], 0] = 1

    for bond in pmol.GetBonds():
        # Address the pair by atom-map number, matching the reactant indexing.
        i = bond.GetBeginAtom().GetIntProp('molAtomMapNumber') - 1
        j = bond.GetEndAtom().GetIntProp('molAtomMapNumber') - 1
        z = bo_to_index[float(bond.GetBondTypeAsDouble())]
        rmap[i, j, 0] = rmap[j, i, 0] = 0
        rmap[i, j, z] = rmap[j, i, z] = 1

    return np.reshape(rmap, (n_atoms * n_atoms, nbos))

In [None]:
class MoleculeDataset(Dataset):
    """
    PyG dataset of atom-mapped reactions, one saved ``Data`` object per reaction.

    Every non-blank line of the raw file must start with a reaction SMILES
    whose first ``>``-separated part is the reactants and whose last part is
    the product; any further whitespace-separated fields on the line are
    ignored.  Nodes are indexed by ``molAtomMapNumber - 1``.

    Sample ``i`` is stored as ``data_{i}.pt`` in ``processed_dir``.  Because
    ``processed_file_names`` lists those files, processing is skipped when
    they all exist already: delete the processed directory to force the
    dataset to be rebuilt (e.g. after changing the featurisation).
    """

    # Number of reactions in the raw file; filled in lazily by _num_samples()
    # (or by process()) so the file is not re-read on every len() call.
    _sample_count = None

    def __init__(self, root, filename, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        filename = Name of the raw reaction file.
        """
        # Must be set before the base constructor runs, because the base
        # constructor reads raw_file_names / processed_file_names.
        self.filename = filename
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Name of the raw reaction file."""
        return self.filename

    @property
    def processed_file_names(self):
        """One ``data_{i}.pt`` entry per reaction in the raw file."""
        return [f'data_{idx}.pt' for idx in range(self._num_samples())]

    def download(self):
        """Nothing to download: the raw file is expected to be in place."""
        pass

    def _num_samples(self):
        """Count the non-blank lines of the raw file (one reaction each)."""
        if self._sample_count is None:
            with open(self.raw_paths[0], "r") as raw_file:
                self._sample_count = sum(1 for line in raw_file if line.strip())
        return self._sample_count

    def process(self):
        """
        Convert every reaction of the raw file into a ``Data`` object and
        save it as ``data_{idx}.pt`` in ``processed_dir``.

        Raises:
            ValueError: if a line's first field contains no ``>`` separator.
        """
        idx = 0
        # Context manager so the raw file is closed even if a line fails.
        with open(self.raw_paths[0], "r") as raw_file:
            for line_no, line in enumerate(tqdm(raw_file, desc="Processing reactions"), start=1):
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue

                parts = fields[0].split('>')
                if len(parts) < 2:
                    raise ValueError(
                        f"Line {line_no}: expected a reaction SMILES with '>' separators, "
                        f"got {fields[0]!r}"
                    )
                # First part = reactants, last part = product.  This works for
                # "a>b", "a>>b" and "a>reagents>b" alike, whereas index 1 would
                # pick the (possibly empty) reagent field of "a>>b".
                react, product = parts[0], parts[-1]

                edits = get_bond_label(react, product)
                edge_index = get_bin_feature(react)

                mol_obj = Chem.MolFromSmiles(react)
                # Get node features
                node_feats = self._get_node_features(mol_obj)
                # Get edge features
                edge_weight = self._get_edge_weights(mol_obj)
                # Get labels info
                label = self._get_labels(edits)

                # Create data object
                data = Data(x=node_feats,
                            edge_index=edge_index,
                            edge_weight=edge_weight,
                            y=label,
                            smiles=react
                            )

                torch.save(data, osp.join(self.processed_dir, f'data_{idx}.pt'))
                idx += 1

        self._sample_count = idx

    def _get_node_features(self, mol):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]

        Row ``molAtomMapNumber - 1`` holds the 7 features of that atom.
        """
        n_atoms = mol.GetNumAtoms()
        all_node_feats = np.zeros((n_atoms, 7))

        for atom in mol.GetAtoms():
            node_feats = [
                atom.GetAtomicNum(),        # Feature 1: Atomic number
                atom.GetDegree(),           # Feature 2: Atom degree
                atom.GetExplicitValence(),  # Feature 3: Explicit Valence
                atom.GetImplicitValence(),  # Feature 4: Implicit Valence
                atom.GetFormalCharge(),     # Feature 5: Formal charge
                atom.GetIsAromatic(),       # Feature 6: Aromaticity
                atom.IsInRing(),            # Feature 7: In Ring
            ]
            # Store the features at the atom's map-number position.
            all_node_feats[atom.GetIntProp('molAtomMapNumber') - 1] = node_feats

        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_weights(self, mol):
        """
        Return the symmetric bond-order matrix of ``mol``.

        The result has shape [Number of Nodes, Number of Nodes]; entry
        ``[i, j]`` is the bond order between the atoms with map numbers
        ``i + 1`` and ``j + 1`` (0 when they are not bonded).  Map numbers are
        used, not RDKit atom indices, so rows/columns agree with the node
        features, and both directions of a bond are filled.

        NOTE(review): this is a dense matrix rather than one weight per
        edge_index column - confirm that downstream code expects this layout.
        """
        n_atoms = mol.GetNumAtoms()
        edge_weight = np.zeros((n_atoms, n_atoms))

        for bond in mol.GetBonds():
            i = bond.GetBeginAtom().GetIntProp('molAtomMapNumber') - 1
            j = bond.GetEndAtom().GetIntProp('molAtomMapNumber') - 1
            edge_weight[i, j] = edge_weight[j, i] = bond.GetBondTypeAsDouble()

        return torch.tensor(edge_weight)

    def _get_labels(self, e):
        """Convert the label array ``e`` into a tensor."""
        label = np.asarray(e)
        return torch.tensor(label)

    def len(self):
        """Number of reactions in the dataset."""
        return self._num_samples()

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        data = torch.load(osp.join(self.processed_dir,
                                   f'data_{idx}.pt'))
        return data



In [None]:
# Build the dataset rooted at .../dataset/USPTO_Test from the raw file "Test.txt".
# NOTE(review): this assignment rebinds the name `Dataset`, which was imported
# from torch_geometric.data and is the base class of MoleculeDataset. After this
# cell has run, re-executing the class-definition cell in the same kernel would
# use this instance as the base class. Consider a different variable name once
# the downstream cells that read `Dataset` have been checked.
Dataset= MoleculeDataset(root="/content/drive/MyDrive/dataset/USPTO_Test",filename="Test.txt")

Processing...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
data n°35000 saved
data n°35001 saved
data n°35002 saved
data n°35003 saved
data n°35004 saved
data n°35005 saved
data n°35006 saved
data n°35007 saved
data n°35008 saved
data n°35009 saved
data n°35010 saved
data n°35011 saved
data n°35012 saved
data n°35013 saved
data n°35014 saved
data n°35015 saved
data n°35016 saved
data n°35017 saved
data n°35018 saved
data n°35019 saved
data n°35020 saved
data n°35021 saved
data n°35022 saved
data n°35023 saved
data n°35024 saved
data n°35025 saved
data n°35026 saved
data n°35027 saved
data n°35028 saved
data n°35029 saved
data n°35030 saved
data n°35031 saved
data n°35032 saved
data n°35033 saved
data n°35034 saved
data n°35035 saved
data n°35036 saved
data n°35037 saved
data n°35038 saved
data n°35039 saved
data n°35040 saved
data n°35041 saved
data n°35042 saved
data n°35043 saved
data n°35044 saved
data n°35045 saved
data n°35046 saved
data n°35047 saved
data n°35048 saved
data

Done!


In [None]:
# Second dataset instance, rooted at .../dataset/essay and built from the raw file "test.txt".
Test_Dataset= MoleculeDataset(root="/content/drive/MyDrive/dataset/essay",filename="test.txt")

Processing...


data n°0 saved
data n°1 saved
data n°2 saved
data n°3 saved
data n°4 saved
data n°5 saved
data n°6 saved
data n°7 saved
data n°8 saved
data n°9 saved
data n°10 saved
data n°11 saved
data n°12 saved
data n°13 saved
data n°14 saved
data n°15 saved
data n°16 saved
data n°17 saved
data n°18 saved
data n°19 saved
data n°20 saved
data n°21 saved
data n°22 saved
data n°23 saved
data n°24 saved
data n°25 saved
data n°26 saved
data n°27 saved
data n°28 saved
data n°29 saved
data n°30 saved
data n°31 saved
data n°32 saved
data n°33 saved
data n°34 saved
data n°35 saved
data n°36 saved
data n°37 saved
data n°38 saved
data n°39 saved
data n°40 saved
data n°41 saved
data n°42 saved
data n°43 saved
data n°44 saved
data n°45 saved
data n°46 saved
data n°47 saved
data n°48 saved
data n°49 saved
data n°50 saved
data n°51 saved
data n°52 saved
data n°53 saved
data n°54 saved
data n°55 saved
data n°56 saved
data n°57 saved
data n°58 saved
data n°59 saved
data n°60 saved
data n°61 saved
data n°62 saved
da

Done!
