# 00_core.ipynb

## Base classes for RXN fingerprints

In [2]:
# hide
from nbdev.showdoc import *

In [3]:
# export
from abc import ABC, abstractmethod
from typing import List

### Container

In [4]:
# export
class RXNFingerprintContainer(ABC):
    """
    Base class for RXN Fingerprint Container
    - Should facilitate the creation of a LSHForest
    - Query nearest neighbors

    Concrete subclasses must implement `add_finferprint`, `save`, `load`
    and `query_nearest_neighbors`.
    """

    @abstractmethod
    def add_finferprint(self, fingerprint: List, aux_info: List):
        """
        Add fingerprint to container

        The name is misspelled, but it is the hook that existing subclasses
        override, so it is kept as the abstract method. Callers should prefer
        the correctly spelled `add_fingerprint`.

        Args:
            fingerprint: the fingerprint to store.
            aux_info: auxiliary information stored alongside the fingerprint.
        """

    def add_fingerprint(self, fingerprint: List, aux_info: List):
        """
        Correctly spelled entry point for adding a fingerprint.

        Delegates to `add_finferprint` and returns whatever it returns, so
        subclasses only ever need to implement the abstract hook.
        """
        return self.add_finferprint(fingerprint, aux_info)

    @abstractmethod
    def save(self, save_path: str):
        """
        Save fingerprints to file

        Args:
            save_path: location the container is written to.
        """

    @abstractmethod
    def load(self, load_path: str):
        """
        Load fingerprints from file

        Args:
            load_path: location the container is read from.
        """

    @abstractmethod
    def query_nearest_neighbors(self, fingerprint):
        """
        Query nearest neighbors

        Args:
            fingerprint: the fingerprint whose neighbors are looked up.
        """
# To use this class, create a concrete subclass that implements the abstract methods, e.g.
# `class MyRXNFingerprintContainer(RXNFingerprintContainer):`

### FingerprintGenerator

In [6]:
# export
class FingerprintGenerator(ABC):
    """
    Abstract interface for objects that turn reaction SMILES into fingerprints.

    Subclasses have to provide both the single-reaction and the batched
    conversion.
    """

    @abstractmethod
    def convert(self, rxn_smiles: str) -> List[float]:
        """
        Compute the fingerprint of one reaction.

        Args:
            rxn_smiles: the reaction SMILES string to convert.

        Returns:
            The fingerprint as a list of floats.
        """

    @abstractmethod
    def convert_batch(self, rxn_smiles_batch: List[str]) -> List:
        """
        Compute the fingerprints of several reactions at once.

        Args:
            rxn_smiles_batch: the reaction SMILES strings to convert.

        Returns:
            A list holding the fingerprints of the batch.
        """

# 01_tokenization.ipynb

> Tokenization and SMILES utils

In [7]:
# hide 
%load_ext autoreload
%autoreload 2
# By running this code at the beginning of your Jupyter Notebook, 
# you'll ensure that any changes you make to your Python modules will be reflected in your notebook without requiring manual reloading. 
# This can save time and effort during development, especially when working on iterative tasks.

In [8]:
# hide
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

# use IpythonConsole for pretty drawings
from rdkit.Chem.Draw import IPythonConsole

In [12]:
# export
import collections
import logging
import os
import re
import numpy as np
from rdkit import Chem

import pkg_resources

from typing import List

from transformers import BertTokenizer

## Reaction SMILES tokenizer

We use the same tokenizer as the one used for http://rxnmapper.ai.

It is an instance of the Hugging Face Transformers `BertTokenizer`.

In [13]:
#export
# Regular expression describing a single SMILES token.
# Its one capture group is an alternation of:
#   - "%(NNN)" groups with exactly three digits and "%NN" groups with two digits,
#   - bracketed expressions "[...]",
#   - the symbols B/Br, C/Cl, N, O, S, P, F, I and the lowercase b, c, n, o, s, p,
#   - punctuation: | ( ) . = # - + \ / : ~ @ ? * $ as well as ">" and ">>",
#   - single digits.
# NOTE(review): presumably consumed by the tokenizer defined elsewhere in this
# file -- confirm against its usage.
SMI_REGEX_PATTERN =  r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"

def get_default_tokenizer():
    """
    Build a `SmilesTokenizer` from the vocabulary file bundled with `rxnfp`.

    The vocabulary path is resolved through `pkg_resources` inside the
    `rxnfp` package, and the tokenizer is created with lower-casing disabled.

    Returns:
        The `SmilesTokenizer` instance built from the default vocabulary.
    """
    vocab_file = pkg_resources.resource_filename(
        "rxnfp", "models/transformers/bert_ft_10k_25s/vocab.txt"
    )
    tokenizer = SmilesTokenizer(vocab_file, do_lower_case=False)
    return tokenizer
