# Setup

In [1]:
# Standard library imports
import os
import math
import nest_asyncio
nest_asyncio.apply()

# Third-party imports
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoTokenizer
import lmdeploy
from lmdeploy import GenerationConfig, PytorchEngineConfig

# Local imports

# Constants
CUR_DIR = os.path.dirname(os.path.realpath('__file__'))
MODEL_NAME = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# Set device
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

# Get model and move to GPU
pipe = lmdeploy.pipeline(
    MODEL_NAME,
    backend_config=PytorchEngineConfig(
        device_type='cuda'
    )
)

# Set generation configuration
gen_config = GenerationConfig(temperature=0, max_new_tokens=1024)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

# Check if Paper's Have Exact Value from Chembl

In [None]:
df = pd.read_json(f'{CUR_DIR}/data/regression.json')
print(df.head())

In [None]:
temp_df = df.sample(n=1)
value = temp_df['value'].values[0]
doi = temp_df['doi'].values[0]
print('Value: ', value)
print('DOI: ', doi)

txt_path = f'/data/rbg/users/vincentf/data_uncertainty/c340_txt/{doi.replace("/", "_")}.txt'
with open(txt_path, 'r', encoding='utf-8') as file:
    text = file.read()

response = pipe([
    f'''    
    You are tasked with pulling data from the literature provided. 
    The text will contain key experimental details like assay type, inhibitors, and conditions.
    We only want the text pertraining to the IC50 assay of digestive enzyme Cytochrome P450 3A4 (CYP3A4).
    
    The output should only include details present in the text; do not infer or add anything.

    There is no need to add references.

    INPUT:
    {text}
    
    OUTPUT:
    '''
])

In [None]:
print(response[0].text)

# Chunk Data and See if It Contains Assay Information

In [9]:
# Select random input
df = pd.read_json(f'{CUR_DIR}/data/regression.json')
df['assay_summary'] = None

for index, row in df.iterrows():
    try:
        value = row['value']
        doi = row['doi']
        print('Value: ', value)
        print('DOI: ', doi)
        print('Description: ', row['description'], '\n')

        
        # Pull chembl description to directly compare
        
        # Get input text
        txt_path = f'/data/rbg/users/vincentf/data_uncertainty/c340_txt/{doi.replace("/", "_")}.txt'
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        response = pipe([
            f'''   
            Tell me:
                1. The experimental conditions of the assay (e.g., substrate, substrate concentration, probe type, incubation time, buffer conditions).
                2. The method used to determine the IC50 value (e.g., type of assay, detection method).
                3. Any other relevant details about how the IC50 was measured.
            
            For the following text:
            {text}
            
            Given the Cytochrome P450 (CYP) 3A4 IC50 measurement that produced the result:
            {value} uM (might be mentioned in different units).            
            '''
        ])
        row['assay_summary'] = response[0].text
        print('Response: ', response[0].text, '\n')
        break
    except Exception as e:
        pass

# Add text extractions to the JSON


Value:  5000.0
DOI:  10.1021/acs.jmedchem.5b01146
Description:  Inhibition of C-terminal four-histidine tagged human CYP3A4delta3 to 24 residues expressed in Escherichia coli assessed as 7-benzyloxy-4-(trifluoromethyl)coumarin O-debenzylation pretreated 2 mins followed by NADPH addition by fluorometric method in presence of rat cytochrome P450 reductase 

Value:  4000.0
DOI:  10.1021/acs.jmedchem.5b01146
Description:  Inhibition of C-terminal four-histidine tagged human CYP3A4delta3 to 24 residues expressed in Escherichia coli assessed as 7-benzyloxy-4-(trifluoromethyl)coumarin O-debenzylation pretreated 2 mins followed by NADPH addition by fluorometric method in presence of rat cytochrome P450 reductase 

Value:  1000.0
DOI:  10.1021/acs.jmedchem.5b01146
Description:  Inhibition of C-terminal four-histidine tagged human CYP3A4delta3 to 24 residues expressed in Escherichia coli assessed as 7-benzyloxy-4-(trifluoromethyl)coumarin O-debenzylation pretreated 2 mins followed by NADPH addit

In [None]:
# Load the texts

# Get text embeddings (llama, bert)

# Add embeddings to chemprop regression model & evaluate performance

In [45]:
# Select random input
df = pd.read_json(f'{CUR_DIR}/data/regression.json')

# Try until document is retrieved

    try:        
        temp_df = df.sample(n=1)
        value = temp_df['value'].values[0]
        doi = temp_df['doi'].values[0]
        print('Value: ', value)
        print('DOI: ', doi)
        
        # Get input text
        txt_path = f'/data/rbg/users/vincentf/data_uncertainty/c340_txt/{doi.replace("/", "_")}.txt'
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        response = pipe([
            f'''   
            For the following text:
            {text}
            
            Given the Cytochrome P450 (CYP) 3A4 IC50 measurement that produced the result:
            {value} uM (might be mentioned in different units).
        
            I am looking for only the following information:
            1. The experimental conditions of the assay (e.g., substrate concentration, probe type, incubation time, buffer conditions).
            2. The method used to determine the IC50 value (e.g., type of assay, detection method).
            3. Any other relevant details about how the IC50 was measured.
            '''
        ])
        error_happened = False
print(response[0].text)

IndentationError: unexpected indent (2687463195.py, line 5)

In [3]:
# Select random input
df = pd.read_json(f'{CUR_DIR}/data/regression.json')
temp_df = df.sample(n=1)
value = temp_df['value'].values[0]
doi = temp_df['doi'].values[0]
value = 0.11
doi = '10.1016/j.bmcl.2009.07.118'
print('Value: ', value)
print('DOI: ', doi)

# Get input text
txt_path = f'/data/rbg/users/vincentf/data_uncertainty/c340_txt/{doi.replace("/", "_")}.txt'
with open(txt_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Split text (optional)
def split_text(text, chunk_size=1024):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    
# Tokenize data
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Split tokens
def tokenize_text(text, chunk_size=1024):
    tokens = tokenizer.encode(text, return_tensors='pt')
    return [tokens[:, i:i + chunk_size] for i in range(0, tokens.size(1), chunk_size)]

responses = []
chunks = tokenize_text(text) # split_text(text)
for i, chunk in enumerate(chunks):
    decoded_text = tokenizer.decode(chunk[0], skip_special_tokens=True)
    response = pipe([
        f'''    
        Extract text from the following passage that discusses the inhibition of CYP3A4 in relation to its IC50 value. 
        The output should only include sentences that mention specific values or items related to IC50. 
        Specifically, only related to those studies that produced the IC50 value {value} uM.
        
        Please return the results in a bullet-point format. Do not include an introduction or references.
        
        {decoded_text}
        '''
    ])
    responses.append(response[0].text)

responses_text = ' '.join(responses)
response = pipe([
    f'''    
    Summarize the following text. 
    Make sure that it only contains information relevant to the IC50 assay that produced {value} uM for CYP3A4.
    
    Please return the results in a bullet-point format. Do not include an introduction or references.
    
    {responses_text}
    '''
])
print(response[0].text)

Value:  0.11
DOI:  10.1016/j.bmcl.2009.07.118
Here are the results in bullet-point format:

* The IC50 value of 0.11 uM was determined for the inhibition of CYP3A4 by a specific compound.
* The IC50 value of 0.11 uM was reported for the inhibition of CYP3A4.
* The IC50 value of compound 1 was 0.11 μM.
* The IC50 value of compound 2 was 0.11 μM.
* The IC50 value of compound 16 was 0.11 μM.
* The IC50 value of compound 17 was 0.11 μM.
* The IC50 value of compound 18 was 0.11 μM.
* The IC50 value of 0.11 uM was not mentioned in the passage, but the IC50 value of RTV is not mentioned, but the Ki value of RTV is mentioned: "RTV is also a potent inhibitor of CYP3A (Ki = 3.2 nM)"
* The IC50 value of the novel CYP inhibitors is mentioned: "These compounds were evaluated for their abilities to inhibit the CYP3A enzyme with IC50 values ranging from 0.11 uM to 1.1 uM."
* Potent analogs in this series inhibit CYP3A with IC50 <0.1 μM, comparing favorably to inhibition by RTV (IC50 = 0.05 μM).
* Two

# Good Example

In [None]:
temp_df = df.sample(n=1)
value = temp_df['value'].values[0]
doi = temp_df['doi'].values[0]
print('Value: ', value)
print('DOI: ', doi)

txt_path = f'/data/rbg/users/vincentf/data_uncertainty/c340_txt/{doi.replace("/", "_")}.txt'
with open(txt_path, 'r', encoding='utf-8') as file:
    text = file.read()
    
def split_text(text, chunk_size=1024):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

chunks = split_text(text)
for i, text in enumerate(chunks):
    response = pipe([
        f'''    
        Please tell me whether or not the following text contains information regarding the IC50 of CYP 3A4 (Cytochrome P450).
        The information should be related to the value provided: {value} uM.
        Use 'yes' or 'no' as the output.
        
        INPUT:
        {text}
        
        OUTPUT:
        '''
    ])
    if response[0].text.lower() == 'yes':
        response = pipe([
            f'''    
            You are tasked with pulling data from the literature provided. 
            The text will contain key experimental details like assay type, inhibitors, and conditions.
            We only want the text pertraining to the IC50 assay with value: {value} uM for digestive enzyme Cytochrome P450 3A4 (CYP3A4).
                    
            The output should only include details present in the text; do not infer or add anything.
        
            There is no need to add references.

            If you cannot find the information, output 'no'.
        
            INPUT:
            {text}
            
            OUTPUT:
            '''
        ])

        if 'no' not in response[0].text.lower():
            print(response[0].text)