# Imports

In [1]:
# Standard library imports
import os
import math
from pathlib import Path

# Third-party imports
import sqlite3
import pandas as pd
from lightning import pytorch as pl
import torch
from chemprop import data, featurizers, models, nn
from chembl_webresource_client.new_client import new_client

# Local imports

# CUDA
print(f"CUDA available: {torch.cuda.is_available()}")
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

CUDA available: True


# Constants

In [2]:
# Directory
CUR_DIR = os.path.dirname(os.path.realpath('__file__'))

# Cytochrome P450 3A4 IDs
CYP3A4_CHEMBL_ID = 'CHEMBL340' # ['CHEMBL2111472', 'CHEMBL2364675', 'CHEMBL4523986']
CHEMBL_DB_PATH = '/data/rbg/users/vincentf/data_uncertainty/chembl_34/chembl_34/chembl_34_sqlite/chembl_34.db'

# ChEMBL Database

In [48]:
# Connect to chembl database
conn = sqlite3.connect(CHEMBL_DB_PATH, timeout = 10)
cur = conn.cursor()

# Value types to extract
value_types = ['IC50', 'IC5', 'Log IC50', 'pIC50', 'log(1/IC50)', '-Log IC50(M)', 'Ratio IC50', 'IC50(app)']
placeholders = ','.join(['?'] * len(value_types))



# Execute call
cur.execute(query, (CYP3A4_CHEMBL_ID, *value_types))

<sqlite3.Cursor at 0x7f04ad11b540>

In [None]:
# Query
query = f'''
    SELECT 
        targets.pref_name AS target_pref_name,
        targets.chembl_id AS target_chembl_id,
        activities.assay_id AS assay_id,
        assays.chembl_id AS assay_chembl_id, 
        activities.standard_type AS value_type,
        activities.standard_relation AS value_relation,
        activities.standard_value AS value,
        activities.standard_units AS value_units,
        activities.molregno AS molregno,
        assays.doc_id AS doc_chembl_id, 
        docs.doi AS doc_doi
    FROM 
        activities
    JOIN 
        assays ON activities.assay_id = assays.assay_id
    JOIN 
        docs ON assays.doc_id = docs.doc_id
    JOIN 
        target_dictionary AS targets ON assays.tid = targets.tid
    WHERE 
        targets.chembl_id = ?
        AND targets.target_type = 'SINGLE PROTEIN'
        AND activities.standard_type IN ({placeholders})
        AND activities.standard_value != 0
        AND activities.standard_value IS NOT NULL
    ORDER BY 
        activities.assay_id, value DESC
'''

# Connect to chembl databse
conn = sqlite3.connect(CHEMBL_DB_PATH, timeout=10)
conn.row_factory = sqlite3.Row
cur = conn.cursor()

# Execute query
cur.execute(query, (CYP3A4_CHEMBL_ID, *value_types))

# Fetch rows from the database
rows = cur.fetchall()

In [49]:
# Get column names (keys) from one of the rows and print the first row
if rows:
    # Get the column names
    keys = rows[0].keys()
    
    # Print the column names (keys)
    print("Column names:", keys)
    
    # Print the first row as a dictionary
    print("First row:", dict(rows[0]))

('Cytochrome P450 3A4', 'CHEMBL340', 4333, 'CHEMBL883800', 'IC50', '=', 1260, 'nM', 255904, 11347, '10.1021/jm00093a015')


<class 'list'>


In [None]:
# Get all available items in new_client
available_resources = [resource for resource in dir(new_client) if not resource.startswith('_')]
all_data = {}
for resource in ['activity', 'document']:
    print(f'Resource: {resource}...')
    if resource not in all_data:
        all_data[resource] = []
        
    # Get caller for resource
    caller = getattr(new_client, resource)
    
    # Fetch data for the specified chembl_id
    caller_data = []
    for chembl_id in CYP3A4_CHEMBL_IDS:
        try:
            caller_data.extend(
                caller.filter(
                    target_chembl_id=chembl_id
                )
            )
        except Exception as e:
            print(f"Error fetching data from {resource} for ID {chembl_id}: {e}")
    
    # Append the data to the resource's list in all_data
    all_data[resource].append(pd.DataFrame(caller_data))
    
pd.DataFrame(all_data).to_json(f'{CUR_DIR}/data/cyp3a4_data.json', orient='records')

Resource: activity...
Resource: document...
