In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import argparse
from loguru import logger

import joblib
import tqdm

from datacat4ml.const import SPLIT_DATA_DIR

In [6]:
"""
Extract assay features like LSA. #Yu?
A list of GPCR assay ids is provided, and a Numpy dense matrix is created such that its i-th row contains the features of the i-th assay in the initial list.

For FS-Mol, we use the following columns:
`python encode_assay.py --assay_path=assay_info.parquet --encoding=clip --gpu=0 --columns \
assay_type_description description assay_category assay_cell_type assay_chembl_id assay_classification assay_organism assay_parameters assay_strain assay_subcellular_fraction assay_tax_id assay_test_type assay_tissue assay_type bao_format bao_label cell_chembl_id confidence_description confidence_score document_chembl_id relationship_description relationship_type src_assay_id src_id target_chembl_id tissue_chembl_id variant_sequence \
--suffix=all`
"""

'\nExtract assay features like LSA. #Yu?\nA list of GPCR assay ids is provided, and a Numpy dense matrix is created such that its i-th row contains the features of the i-th assay in the initial list.\n\nFor FS-Mol, we use the following columns:\n`python encode_assay.py --assay_path=assay_info.parquet --encoding=clip --gpu=0 --columns assay_type_description description assay_category assay_cell_type assay_chembl_id assay_classification assay_organism assay_parameters assay_strain assay_subcellular_fraction assay_tax_id assay_test_type assay_tissue assay_type bao_format bao_label cell_chembl_id confidence_description confidence_score document_chembl_id relationship_description relationship_type src_assay_id src_id target_chembl_id tissue_chembl_id variant_sequence --suffix=all`\n'

# clip_encode

In [7]:
#======================== clip_encode ========================
def clip_encode(list_of_assay_descriptions, gpu=0, batch_size=2048, truncate=True, verbose=True):
    """
    Encode a list of assay descriptions with the text encoder of a pretrained CLIP model.

    The CLIP model ("ViT-B/32") is loaded on every call, so this function is meant to be
    called once with the complete list rather than once per description.

    Params
    ------
    list_of_assay_descriptions: list of strings
        List of assay descriptions to be encoded. Must not be empty.
    gpu: int
        Index of the CUDA device to use for the CLIP model. Only used when CUDA is available,
        otherwise the model runs on the CPU.
    batch_size: int
        Number of descriptions that are tokenized and encoded together. Must be at least 1.
    truncate: bool
        default: True
        Forwarded to `clip.tokenize`. Whether to truncate the assay descriptions to 77 tokens
        (truncated from the end, because the beginning of a sentence often contains the most
        relevant information), the default setting for CLIP.
    verbose: bool
        default: True
        Whether to show a tqdm progress bar over the batches.

    Returns
    -------
    numpy.ndarray
        float32 dense matrix with shape (n_assays, n_components).
        n_components is the size of the vector representation of each assay description.

    Raises
    ------
    ValueError
        If `list_of_assay_descriptions` is empty or `batch_size` is smaller than 1.
    """
    # Validate before the (expensive) model load; without these checks the same situations
    # only surface later as an obscure ValueError from `range` / `np.concatenate`.
    n_descriptions = len(list_of_assay_descriptions)
    if n_descriptions == 0:
        raise ValueError('clip_encode received an empty list of assay descriptions.')
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}.')

    import torch
    import clip
    device = f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu'
    logger.info(f'Load CLIP model on {device}.')
    model, _preprocess = clip.load("ViT-B/32", device=device)  # adopted from clip repo; the image preprocessing is not needed

    logger.info('Encode assay descriptions using CLIP.')
    encoded_batches = []
    with torch.no_grad():  # inference only, no gradients needed
        batch_starts = range(0, n_descriptions, batch_size)
        for start in tqdm.tqdm(batch_starts, desc='Encode assay descriptions', disable=not verbose):
            # slicing past the end of a list is safe, so the last (shorter) batch needs no special case
            batch = list_of_assay_descriptions[start:start + batch_size]
            tokenized_text = clip.tokenize(batch, truncate=truncate).to(device)
            batch_features = model.encode_text(tokenized_text)
            # `.cpu()`: move the tensor to host memory; `.detach()`: drop any gradient tracking; `.numpy()`: convert to a numpy array
            encoded_batches.append(batch_features.cpu().detach().numpy())

    text_features = np.concatenate(encoded_batches, axis=0)
    return text_features.astype(np.float32)

# lsa_encode

In [None]:
#======================== lsa_encode ========================
class Tokenizer:
    """
    Custom tokenizer combining ideas from `sklearn documentation`_ and from `this post`_.
    Requires the `nltk` package. Using part-of-speech (POS) tags makes it quite slow, but results seem cleaner.

    Each token of a document is POS-tagged on its own (without sentence context) and then
    lemmatized with WordNet using that tag. The tag found for a word is cached per instance,
    so a repeated word is only tagged once.

    Instances can be pickled: a module object (the `nltk` handle kept on the instance) cannot be
    pickled, so only the POS cache is stored and the NLTK objects are rebuilt on load. This matters
    because the tokenizer is stored inside the TfidfVectorizer that `lsa_fit` saves with joblib.

    .. _`sklearn documentation`: https://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
    .. _`this post`: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """

    def __init__(self):
        import nltk
        from nltk.corpus import wordnet  # lazy corpus handle; importing it does not load any data yet
        self.nltk = nltk
        self.wordnet = wordnet

        # Probe every resource independently. Previously the POS tagger was only downloaded when
        # WordNet was missing, so a machine with WordNet but without the tagger failed on first use.
        # Both the classic and the newer resource ids are requested, because recent NLTK releases
        # look for `punkt_tab` / `averaged_perceptron_tagger_eng` instead of the older names.
        self._ensure_resource(lambda: nltk.word_tokenize('test'),
                              ('punkt', 'punkt_tab'))  # splits input text into words
        self._ensure_resource(lambda: wordnet.ADJ,
                              ('wordnet',))  # lexical database used for lemmatization
        self._ensure_resource(lambda: nltk.pos_tag(['test']),
                              ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'))  # assigns POS tags

        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        # first letter of a Penn Treebank tag -> WordNet POS constant
        self.tag2pos = {
            'J': self.wordnet.ADJ,
            'N': self.wordnet.NOUN,
            'V': self.wordnet.VERB,
            'R': self.wordnet.ADV,
        }
        # word -> WordNet POS; tagging a single word is deterministic, so the result can be reused
        self._pos_cache = {}

    def _ensure_resource(self, probe, resource_ids):
        """
        Run `probe`; if it raises LookupError (NLTK's signal for missing data), download
        every id in `resource_ids` and run the probe again so that a still-missing resource
        fails here rather than in the middle of tokenization.
        """
        try:
            probe()
        except LookupError:
            logger.info(f'Download nltk data: {", ".join(resource_ids)}.')
            for resource_id in resource_ids:
                self.nltk.download(resource_id)
            probe()

    def __getstate__(self):
        """Return a picklable state: only the POS cache, never the nltk module or corpus objects."""
        # The returned dict is never empty, so `__setstate__` is always invoked on load.
        return {'_pos_cache': dict(getattr(self, '_pos_cache', {}))}

    def __setstate__(self, state):
        """Rebuild the NLTK-backed attributes and restore the POS cache."""
        self.__init__()
        self._pos_cache.update(state.get('_pos_cache', {}))

    def _get_wordnet_pos(self, word):
        """
        Map part of speech (POS) tag to first character lemmatize() accepts.
        If POS tag does not exist in `tag2pos`, return `wordnet.NOUN`.
        """
        pos = self._pos_cache.get(word)
        if pos is None:
            tag = self.nltk.pos_tag([word])[0][1]
            # `tag[:1]` instead of `tag[0]` so that an empty tag falls back to NOUN instead of raising
            pos = self.tag2pos.get(tag[:1].upper(), self.wordnet.NOUN)
            self._pos_cache[word] = pos
        return pos

    def __call__(self, doc):
        """Yield the lemma of every token in `doc`."""
        for word in self.nltk.word_tokenize(doc):
            pos = self._get_wordnet_pos(word)
            yield self.lemmatizer.lemmatize(word, pos=pos)

def lsa_fit(list_of_assay_descriptions, model_save_path='./data/models/lsa.joblib', n_components=355, verbose=True, random_state=None): # TODO(Yu): change the default of model_save_path later
    """
    Fit an LSA model (TF-IDF followed by a sklearn TruncatedSVD) on a list of assay
    descriptions and save the fitted pipeline in joblib format.

    Params
    ------
    list_of_assay_descriptions: list of strings
        List of assay descriptions the model is fitted on.
    model_save_path: str
        Path to save the fitted sklearn LSA model in joblib format.
        Missing parent directories are created.
    n_components: int
        Number of components to use for the TruncatedSVD model.
    verbose: bool
        Whether to log the SVD-fitting and model-saving steps.
    random_state: int, RandomState instance or None
        default: None
        Forwarded to TruncatedSVD. Pass an int to make the fitted model reproducible;
        None keeps the previous (unseeded) behavior.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline with the steps 'tfidf' and 'svd'.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import Pipeline
    logger.info('Set up and fit-transform a sklearn TfidfVectorizer.')
    tok = Tokenizer()

    tfidf = TfidfVectorizer(
        strip_accents='unicode',
        analyzer='word',
        tokenizer=tok,
        stop_words='english',
        max_df=0.95,         # float: ignore terms that occur in more than 95% of the descriptions
        min_df=1 / 10000,    # float: ignore terms that occur in less than 0.01% of the descriptions
        dtype=np.float32
    )

    features = tfidf.fit_transform(list_of_assay_descriptions)
    logger.info(f'tfidf.vocabulary size: {len(tfidf.vocabulary_)}')
    if verbose:
        # f-string instead of `'{n_components}'.format(n_components)`, which raised a KeyError
        # because the named placeholder was given a positional argument
        logger.info(f'Fit a sklearn TruncatedSVD model with {n_components} components.')
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(features)

    # both steps are already fitted; the pipeline only bundles them for saving / transforming
    model = Pipeline([('tfidf', tfidf), ('svd', svd)])
    if verbose:
        logger.info('Save the fitted model.')
    # make sure the target directory exists before dumping the model
    model_save_path = Path(model_save_path)
    model_save_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_save_path)

    return model

def lsa_encode(list_of_assay_descriptions, lsa_path='', verbose=True):
    """
    Transform assay descriptions into LSA features with a previously fitted model.

    Params
    ------
    list_of_assay_descriptions: list of strings
        Assay descriptions to be encoded.
    lsa_path: str
        Location of the fitted sklearn LSA model (joblib format) that is loaded for encoding.
    verbose: bool
        When True, the loading and the encoding step are each announced via the logger.

    Returns
    -------
    numpy.ndarray
        Numpy dense matrix with shape (n_assays, n_components), where n_components
        is determined by the loaded model.
    """
    if verbose:
        logger.info('Load a fitted LSA model.')
    fitted_pipeline = joblib.load(lsa_path)

    if verbose:
        logger.info('Encode assay descriptions using LSA.')
    return fitted_pipeline.transform(list_of_assay_descriptions)

In [None]:
def build_arg_parser():
    """
    Build the command-line parser of the assay-encoding script.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing the assay path, the columns to concatenate, the encoding type
        and the encoder-specific options (LSA model path / size, GPU, batch size).
    """
    # `description=` must be passed by keyword: the first positional argument of ArgumentParser is `prog`.
    parser = argparse.ArgumentParser(
        description='Compute features for a collection of GPCR assay descriptions.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--assay_path', default='assay_info.parquet', help='Path to a parquet file with assay index to AID for which to extract features.')
    parser.add_argument('-c', '--columns', nargs='+', help='Columns to use for the assay description. default: title and subtitle', default=['title', 'subtitle'])
    parser.add_argument('--suffix', help='Suffix to add to the output file.', default=None)
    parser.add_argument('--encoding', help='Encoding-type to use for the assay descriptions. Available are text, clip, lsa, default:lsa', default='lsa') # TODO(Yu): change the default value
    parser.add_argument('--lsa_path', help='Path to a fitted sklearn TfidfVectorizer+LSA model in joblib format, or where to save it if not present.', default='./data/models/lsa.joblib') # TODO(Yu): change the default value
    # argparse %-formats help strings, so a literal percent sign has to be written as `%%`;
    # a bare `%` makes `--help` raise instead of printing the help.
    parser.add_argument('--train_set_size', help='The ratio of assay descriptions for training the model to the whole dataset. range: 0-1, default: first 80%%', default=0.8, type=float)
    parser.add_argument('--gpu', help='GPU number to use for a GPU-based encoding, if any. Default:0', default=0, type=int)
    parser.add_argument('--batch_size', help='Batch size to use for a GPU-based encoding. default: 2048', default=2048, type=int) # TODO(Yu): why default 2048
    parser.add_argument('--n_components', help='Number of components to use for the TruncatedSVD model. default:355', default=355, type=int) # TODO(Yu): why default 355
    return parser


def main(argv=None):
    """
    Encode the assay descriptions of a parquet file and save the features as `.npy`.

    The selected columns of every row are joined into one "column:value ..." string, encoded
    with the requested encoder, and written next to the input file as
    `assay_features_<encoding>[_<suffix>].npy`.

    Params
    ------
    argv: list of strings or None
        Command-line arguments; None lets argparse read `sys.argv`.

    Raises
    ------
    ValueError
        If requested columns are missing, the table has no rows, `--train_set_size` is not
        in (0, 1] when an LSA model has to be fitted, or the encoding is unknown.
    NotImplementedError
        For the 'biobert' encoding.
    """
    args = build_arg_parser().parse_args(argv)

    df = pd.read_parquet(args.assay_path)
    path = Path(args.assay_path)

    # check if all columns are present
    if not all([c in df.columns for c in args.columns]):
        raise ValueError(f'Columns {args.columns} not found in the assay dataframe. Available columns: {df.columns}')
    if len(df) == 0:
        # fail early with a clear message instead of an IndexError further down
        raise ValueError(f'The assay dataframe loaded from {args.assay_path} contains no rows.')
    # fill NaN with empty string, then convert all columns to string
    df[args.columns] = df[args.columns].fillna('').astype(str)

    # one text per assay: "col1:val1 col2:val2 ..."
    list_of_assay_descriptions = df[args.columns].apply(
        lambda x: ' '.join([f"{col}:{val}" for col, val in x.items()]), axis=1).tolist()

    logger.info(f'example assay description: {list_of_assay_descriptions[0]}')

    if args.encoding == 'text':
        features = np.array(list_of_assay_descriptions)
    elif args.encoding == 'lsa':
        logger.info('Encode assay descriptions using LSA')
        # fit and save a model only if none exists at --lsa_path; an existing model is reused as is
        # (in that case --n_components and --train_set_size have no effect)
        if not Path(args.lsa_path).is_file():
            logger.info('Fit a sklearn TfidfVectorizer model on training data.')
            logger.info(f'Save the fitted LSA-model to {args.lsa_path}, load it later using the argument --lsa_path')

            # TODO: custom fit depending on training-set size
            if not 0 < args.train_set_size <= 1:
                raise ValueError(f'--train_set_size must be in the range (0, 1], got {args.train_set_size}.')
            n_total = len(list_of_assay_descriptions)
            n_train = int(n_total * args.train_set_size)
            if n_train < 1:
                raise ValueError(f'--train_set_size={args.train_set_size} selects no training descriptions out of {n_total}.')
            logger.info(f'Fit on {n_train} train assay descriptions, {n_train/n_total*100:.2f}% of the data.')
            # the model is fitted on the FIRST n_train descriptions; --n_components is now forwarded
            # (it used to be parsed but silently ignored)
            lsa_fit(list_of_assay_descriptions[:n_train], model_save_path=args.lsa_path, n_components=args.n_components)

        features = lsa_encode(list_of_assay_descriptions, args.lsa_path)

    elif args.encoding == 'clip':
        features = clip_encode(list_of_assay_descriptions, gpu=args.gpu, verbose=True, batch_size=args.batch_size)
    elif args.encoding == 'biobert': # TODO(Yu): remove it later
        raise NotImplementedError('Biobert encoding not implemented yet.')
    else:
        raise ValueError(f'Encoding {args.encoding} not implemented')

    fn = path.with_name(f'assay_features_{args.encoding}{"_"+args.suffix if args.suffix else ""}.npy')
    np.save(fn, features)
    logger.info(f'Saved assay features to {fn}')


if __name__ == '__main__':
    main()

# Test on the fly

In [9]:
# Read the assay table stored under SPLIT_DATA_DIR/fsmol_alike/MHDsFold.
path = os.path.join(SPLIT_DATA_DIR, *('fsmol_alike', 'MHDsFold', 'assay_info.parquet'))
df = pd.read_parquet(path)

In [None]:
# Columns that are concatenated into the textual description of an assay.
columns = ['assay_idx', 'assay_chembl_id', 'assay_description']
# Replace NaN by the empty string first, then cast every selected column to str.
df[columns] = df[columns].fillna('').astype(str)
df[columns]

Unnamed: 0,assay_idx,assay_chembl_id,assay_description
0,0,CHEMBL832882,Inhibitory concentration against human MCH-R1-...
1,1,CHEMBL832411,Inhibition of melanin concentrating hormone re...
2,2,CHEMBL832852,Tested for MCH-1 induced [Ca2+] release from C...
3,3,CHEMBL835733,Concentration required to inhibit 50% of melan...
4,4,CHEMBL835734,Concentration required to inhibit 50% of melan...
...,...,...,...
18287,18287,CHEMBL5055009,"Displacement of [3H]U69,593 from human kappa o..."
18288,18288,CHEMBL753328,Inhibitory activity against Opioid receptor ka...
18289,18289,CHEMBL4829174,Inhibition of kappa opioid receptor (unknown o...
18290,18290,CHEMBL5058731,OPRK1 Eurofins SafetyScan (Receptor)


In [13]:
# Turn every row into a single string of space-separated "column:value" pairs.
def _row_to_description(row):
    return ' '.join(f"{col}:{val}" for col, val in row.items())

list_of_assay_descriptions = df[columns].apply(_row_to_description, axis=1).tolist()
list_of_assay_descriptions

['assay_idx:0 assay_chembl_id:CHEMBL832882 assay_description:Inhibitory concentration against human MCH-R1-stimulated [Ca2+] influx',
 'assay_idx:1 assay_chembl_id:CHEMBL832411 assay_description:Inhibition of melanin concentrating hormone receptor 1-mediated [Ca2+] release in human neuronal IMR-32 cells',
 'assay_idx:2 assay_chembl_id:CHEMBL832852 assay_description:Tested for MCH-1 induced [Ca2+] release from CHO cells transfected with human MCH-1R',
 'assay_idx:3 assay_chembl_id:CHEMBL835733 assay_description:Concentration required to inhibit 50% of melanin-concentrating hormone induced [Ca2+] flux in IMR-32 cells measured by using a fluorometric imaging plate reader',
 'assay_idx:4 assay_chembl_id:CHEMBL835734 assay_description:Concentration required to inhibit 50% of melanin-concentrating hormone induced [Ca2+] flux in IMR-32 cells measured by using a fluorometric imaging plate reader',
 'assay_idx:5 assay_chembl_id:CHEMBL832647 assay_description:Antagonistic activity against human 