In [1]:
# conda env: datacat (Python 3.8.20)

In [None]:
import os

import csv
from datacat4ml.const import CURA_CAT_GPCR_DIR

from collections import defaultdict
import pandas as pd
from datacat4ml.Scripts.data_prep.data_curate.utils.apply_thresholds import apply_thresholds

# Read MHDs from the `cls` sub-directory of `CURA_CAT_GPCR_DIR`

In [3]:
# Collect the names of all entries in the `cls` sub-directory of CURA_CAT_GPCR_DIR whose
# name starts with 'CHEMBL' (the MHD CSV files) into one list.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the list is
# sorted to keep every downstream loop (and its log output) reproducible across runs.
# NOTE(review): only the 'CHEMBL' prefix is checked, not the '.csv' extension.
MHDs = sorted(f for f in os.listdir(os.path.join(CURA_CAT_GPCR_DIR, 'cls')) if f.startswith('CHEMBL'))
print(f'len(MHDs): {len(MHDs)}')

# Get the unique target_chembl_id values: the part of each file name before the first
# underscore. Sorted as well, because iteration order of a set is arbitrary.
MHDs_tgt = sorted({x.split('_')[0] for x in MHDs})
print(f'The unique number of targets in MHDs: {len(MHDs_tgt)}')

len(MHDs): 935
The unique number of targets in MHDs: 238


In [4]:
# Keep only the MHD files that hold at least 32 data rows (header row excluded).
MHDs_min32 = []

for mhd_fname in MHDs:
    mhd_path = os.path.join(CURA_CAT_GPCR_DIR, 'cls', mhd_fname)

    # Count records with the csv module rather than raw lines, so that a quoted
    # field spanning several lines is still counted as a single record.
    with open(mhd_path, 'r', newline='') as handle:
        n_records = sum(1 for _ in csv.reader(handle))

    # The first record is the header; skip files with fewer than 32 data rows.
    n_data_rows = n_records - 1
    if n_data_rows < 32:
        continue
    MHDs_min32.append(mhd_fname)

print(f'len(MHDs_min32): {len(MHDs_min32)}')

len(MHDs_min32): 573


# Generate LHDs

In [None]:
# Split every MHD file into one LHD file per assay_chembl_id and record, for each MHD,
# which assay ids were written (LHDs_dict: MHD base name -> list of assay ids).
#
# Size limits for a single assay:
#   - at least MIN_ASSAY_SIZE data points, so that a model can be trained on it;
#   - at most MAX_ASSAY_SIZE data points, to leave out high-throughput screens,
#     which are generally considered noisy.
MIN_ASSAY_SIZE = 32
MAX_ASSAY_SIZE = 5000

LHDs_dict = defaultdict(list)
LHDs_dir = os.path.join(CURA_CAT_GPCR_DIR, 'cls', 'LHDs')
os.makedirs(LHDs_dir, exist_ok=True)

# MHD files whose processing raised; summarised after the loop so that failures are
# not lost among the progress lines printed while thresholds are applied.
failed_assays = []

for assay in MHDs_min32:
    assay_path = os.path.join(CURA_CAT_GPCR_DIR, 'cls', assay)
    # The output file names are derived from the MHD file name without its extension.
    base_name = os.path.splitext(assay)[0]

    try:
        # Read the CSV file. 'Unnamed: 0' is presumably a saved index column; with
        # errors='ignore' a file that lacks it is still processed instead of raising
        # KeyError and being skipped as a whole.
        assay_df = pd.read_csv(assay_path).drop(columns=['Unnamed: 0'], errors='ignore')

        # Number of data points per assay, filtered to the accepted size range.
        id_counts = assay_df['assay_chembl_id'].value_counts()
        valid_ids = id_counts[(id_counts >= MIN_ASSAY_SIZE) & (id_counts <= MAX_ASSAY_SIZE)].index.tolist()

        # `assay_id` rather than `id`: the loop variable stays in the notebook namespace
        # after the cell finishes and would otherwise shadow the builtin id().
        for assay_id in valid_ids:
            df = assay_df[assay_df['assay_chembl_id'] == assay_id]
            # delete the old threshold-derived columns
            df = df.drop(columns=['threshold', 'activity_string', 'activity'])
            # apply thresholds again because the new (per-assay) data may have different thresholds
            df = apply_thresholds(df)

            # Save to CSV
            output_path = os.path.join(LHDs_dir, f'{base_name}_{assay_id}.csv')
            df.to_csv(output_path, index=False)

            # Add to dictionary
            LHDs_dict[base_name].append(assay_id)

    except Exception as e:
        # Best effort: report the failing MHD file and continue with the next one.
        print(f'Error processing {assay}: {e}')
        failed_assays.append(assay)

if failed_assays:
    print(f'{len(failed_assays)} MHD file(s) failed: {failed_assays}')

Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thresholds 
Applying thre

In [7]:
# LHDs_dict maps each MHD base name to the assay ids written for it, so its length is
# the number of MHD files that produced at least one LHD, not the number of LHD files.
print(f'len(LHDs_dict): {len(LHDs_dict)}')
# Total number of LHD CSV files written to the output dir, one per recorded assay id
# (1531 in the recorded run).
print(f'Number of LHD files: {sum(len(ids) for ids in LHDs_dict.values())}')

len(LHDs_dict): 401


In [10]:
# Re-check the generated LHD files and drop every assay with fewer than 32 data points.
lhd_root = os.path.join(CURA_CAT_GPCR_DIR, 'cls', 'LHDs')
LHDs = [entry for entry in os.listdir(lhd_root) if entry.startswith('CHEMBL')]
LHDs_min32 = []

for lhd_fname in LHDs:
    # Count records via the csv module so multi-line quoted fields count once.
    with open(os.path.join(lhd_root, lhd_fname), 'r', newline='') as handle:
        n_records = sum(1 for _ in csv.reader(handle))

    # One record is the header row; the remainder are data points.
    if n_records - 1 >= 32:
        LHDs_min32.append(lhd_fname)

print(f'len(LHDs_min32): {len(LHDs_min32)}')

len(LHDs_min32): 1531
