In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset and shorten the 'IC50(nM)' header to 'IC50'.
# The 'SMILES' entry maps the column onto itself, so that header is unchanged.
df_smiles = pd.read_csv("main_dataset_2.csv").rename(
    columns={'SMILES': 'SMILES', 'IC50(nM)': 'IC50'}
)

In [None]:
# Keep only the rows whose IC50 entry is not the literal string 'Not active'.
# .copy() gives an independent frame so later column assignments do not touch a slice.
df_smiles = df_smiles.loc[df_smiles['IC50'].ne('Not active')].copy()

In [None]:
def convert_to_micro(value):
    """
    Rescales an IC50 entry by a factor of 1/1000 (e.g. nM -> uM).

    Parameters:
    value (str or numeric): IC50 entry. Strings may carry a leading '>' or '<'
        qualifier, which is preserved in the result.

    Returns:
    str, numeric or original type:
        - str input that parses as a number: the rescaled number as a string,
          with the '>' / '<' qualifier re-attached when present.
        - str input that does not parse: the whitespace-stripped string, unchanged.
        - numeric input (Python or NumPy scalar): value / 1000.
        - anything else (e.g. None): returned as-is.
    """
    if isinstance(value, str):
        text = value.strip()
        # A single leading '>' or '<' marks a censored measurement; keep it aside.
        qualifier = text[0] if text.startswith((">", "<")) else ""
        try:
            return qualifier + str(float(text[len(qualifier):]) / 1000)
        except ValueError:
            # Malformed entries (including malformed qualified ones such as '>abc')
            # are passed through instead of aborting the whole column conversion.
            return text
    # NumPy integer scalars are not instances of int, so list them explicitly.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value / 1000
    # Unsupported types are left untouched.
    return value

# Run convert_to_micro over every entry of the IC50 column and store the result
# back under the same column name.
# Gotcha: the column is overwritten, so re-running this cell rescales the values again.
df_smiles = df_smiles.assign(IC50=df_smiles['IC50'].apply(convert_to_micro))

In [None]:
def encode_ic50(value, threshold, blocker_cutoff=10):
    """
    Encodes IC50 values into categories: 'hERG Blocker', 'Non Blocker', or None.

    Exact values are classified as:
        value <= blocker_cutoff -> 'hERG Blocker'
        value >= threshold      -> 'Non Blocker'
        anything in between     -> None (discarded)

    Censored values (strings with a leading '>' or '<') are only labelled when
    the stated bound is conclusive:
        '>x' -> 'Non Blocker' if x >= threshold, otherwise None
        '<x' -> 'hERG Blocker' if x <= blocker_cutoff, otherwise None
    A bound such as '>5' with threshold=30 does not say on which side of the
    threshold the true value lies, so it is discarded rather than guessed.

    Parameters:
    value (str or numeric): IC50 value to categorize, in the same unit as the cutoffs.
    threshold: inactivity threshold; values at or above it are non blockers.
    blocker_cutoff: values at or below it are hERG blockers (default 10).

    Returns:
    str or None: Categorized result ('hERG Blocker', 'Non Blocker', or None).
        None is also returned for unparseable strings, NaN and unsupported types.
    """
    qualifier = ""
    if isinstance(value, str):
        text = value.strip()  # Tolerate leading/trailing whitespace
        if text[:1] in (">", "<"):
            qualifier, text = text[0], text[1:]
        try:
            number = float(text)
        except ValueError:
            return None  # Unexpected string format
    # NumPy integer scalars are not instances of int, so list them explicitly.
    elif isinstance(value, (int, float, np.integer, np.floating)):
        number = value
    else:
        return None  # Unsupported type

    if qualifier == ">":
        # Lower bound: conclusive only when it already reaches the threshold.
        return 'Non Blocker' if number >= threshold else None
    if qualifier == "<":
        # Upper bound: conclusive only when it is within the blocker cutoff.
        return 'hERG Blocker' if number <= blocker_cutoff else None

    # Exact value. NaN fails both comparisons and therefore yields None.
    if number <= blocker_cutoff:
        return 'hERG Blocker'
    if number >= threshold:
        return 'Non Blocker'
    return None  # Value between the two cutoffs is discarded

# Replace every IC50 entry with the class label returned by encode_ic50.
# The IC50 column itself is overwritten; no additional column is created.
# NOTE(review): `threshold` is not assigned in any cell visible in this notebook;
# confirm it is defined before this cell so a fresh-kernel run does not fail.
df_smiles = df_smiles.assign(
    IC50=df_smiles['IC50'].apply(lambda ic50: encode_ic50(ic50, threshold))
)

In [None]:
# Collect every row whose SMILES string occurs more than once
# (keep=False marks all members of a duplicate group, not just the repeats).
duplicates = df_smiles.loc[df_smiles.duplicated(subset='SMILES', keep=False)]
if duplicates.empty:
    print("No duplicates found in the 'SMILES' column.")
else:
    print("Duplicate rows based on 'SMILES' column:")
    print(duplicates)
# Finally discard the rows whose IC50 entry is missing.
# This runs after the report, so the report above still includes those rows.
df_smiles = df_smiles.dropna(subset=["IC50"])