# In [None]:
import os
import csv
import base64
import json
import pandas as pd
from rdkit import Chem
from rdkit.DataStructs import TanimotoSimilarity, ExplicitBitVect
from rdkit.Chem import AllChem
from concurrent.futures import ThreadPoolExecutor, as_completed

# Working directories: input CSVs are looked up in `local_dir`, and the
# per-compound similarity tables are written under `output_dir`.
local_dir = './input_files'
output_dir = './similarity_scores'
for _required_dir in (local_dir, output_dir):
    # exist_ok=True keeps re-runs from failing when the directory is present.
    os.makedirs(_required_dir, exist_ok=True)

def decode_fingerprint(encoded_fingerprint):
    """Decode a base64-wrapped, run-length-encoded fingerprint into a flat list.

    The input is base64 text whose decoded UTF-8 payload is a JSON array of
    ``[value, count]`` pairs; each pair expands to ``count`` copies of
    ``value`` in the returned list.

    Args:
        encoded_fingerprint: Base64 string (or bytes) holding the encoded runs.

    Returns:
        The expanded list of values, or ``None`` when the input cannot be
        decoded (wrong input type such as ``None``/NaN, invalid base64,
        invalid UTF-8, invalid JSON, or a payload that is not a sequence of
        ``[value, count]`` pairs). A message is printed in that case.
    """
    try:
        rle_string = base64.b64decode(encoded_fingerprint).decode('utf-8')
        rle = json.loads(rle_string)
        decoded = []
        for value, count in rle:
            decoded.extend([value] * count)
        return decoded
    except (UnicodeDecodeError, ValueError, TypeError) as e:
        # TypeError covers non-string inputs (e.g. an empty CSV cell read as
        # NaN) and payloads whose items are not (value, count) pairs.
        # binascii.Error and json.JSONDecodeError are ValueError subclasses.
        print(f"Error decoding fingerprint: {e}")
        return None

def list_to_bitvect(fp_list, nBits=2048):
    """Build an ``ExplicitBitVect`` with a bit set at every index holding 1.

    Args:
        fp_list: Sequence of values; index ``i`` is set in the result when
            ``fp_list[i] == 1``. ``None`` is accepted so that a failed decode
            can be passed straight through.
        nBits: Size of the bit vector to allocate.

    Returns:
        The populated ``ExplicitBitVect``, or ``None`` if ``fp_list`` is
        ``None``.
    """
    if fp_list is None:
        # Propagate "no fingerprint" instead of crashing, so callers can
        # filter missing values afterwards.
        return None
    bitvect = ExplicitBitVect(nBits)
    # No range check is done here: assumes len(fp_list) <= nBits.
    for index, bit in enumerate(fp_list):
        if bit == 1:
            bitvect.SetBit(index)
    return bitvect

def compute_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity between two fingerprints.

    Thin wrapper that forwards both arguments to RDKit's
    ``TanimotoSimilarity`` and returns its result unchanged.
    """
    similarity = TanimotoSimilarity(fp1, fp2)
    return similarity

def compute_morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Compute a Morgan bit-vector fingerprint for a SMILES string.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.
        radius: Morgan radius forwarded to RDKit.
        nBits: Length of the resulting bit vector.

    Returns:
        The fingerprint from ``AllChem.GetMorganFingerprintAsBitVect``, or
        ``None`` when ``Chem.MolFromSmiles`` gives back a falsy result
        (presumably an unparseable SMILES).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=nBits)

def process_single_fingerprint(fp_csv, df_pg, chembl_id_csv):
    """Score one source fingerprint against every row of ``df_pg``.

    Args:
        fp_csv: Fingerprint of the source compound.
        df_pg: DataFrame with a ``chembl_id`` column and a
            ``morgan_fingerprint`` column of fingerprints to compare against.
        chembl_id_csv: Identifier of the source compound; copied into every
            output tuple.

    Returns:
        A list of ``(chembl_id_csv, chembl_id_pg, score)`` tuples, one per row
        of ``df_pg`` and in the same row order.
    """
    # Iterate the two columns directly: DataFrame.iterrows() builds a new
    # Series for every row, which is needless overhead in this inner loop.
    return [
        (chembl_id_csv, chembl_id_pg, compute_tanimoto(fp_csv, fp_pg))
        for chembl_id_pg, fp_pg in zip(
            df_pg['chembl_id'], df_pg['morgan_fingerprint']
        )
    ]

def _decode_to_bitvect(encoded):
    """Turn one encoded fingerprint cell into a bit vector, or None if unusable."""
    # Empty CSV cells come back from pandas as NaN (a float), which cannot be
    # base64-decoded; treat anything that is not text as "no fingerprint".
    if not isinstance(encoded, (str, bytes)):
        return None
    decoded = decode_fingerprint(encoded)
    if decoded is None:
        return None
    return list_to_bitvect(decoded)


def _load_reference_fingerprints():
    """Load encoded-fingerprint batches 1..24 into one DataFrame of bit vectors."""
    frames = []
    for batch in range(1, 25):
        fingerprints_file = f'../morgan_fingerprints/encoded_fingerprints/encoded_fingerprints_batch_{batch}.csv'
        frame = pd.read_csv(fingerprints_file)
        frame['morgan_fingerprint'] = frame['morgan_fingerprint'].apply(
            _decode_to_bitvect
        )
        frames.append(frame)

    df_pg = pd.concat(frames, ignore_index=True)
    # Rows whose fingerprint could not be decoded hold None and are dropped.
    return df_pg.dropna(subset=['morgan_fingerprint'])


def _read_source_fingerprints(file_path):
    """Read (chembl_id, fingerprint) pairs from one input CSV, skipping bad rows."""
    fingerprints = []
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are handled by the reader itself.
    with open(file_path, mode='r', encoding='utf-8', errors='replace',
              newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            # Blank lines yield [] and truncated lines lack the SMILES column.
            if len(row) < 2:
                continue
            chembl_id, smiles = row[0], row[1]
            fingerprint = compute_morgan_fingerprint(smiles)
            if fingerprint is not None:
                fingerprints.append((chembl_id, fingerprint))
    return fingerprints


def process_and_compute_similarity(local_dir):
    """Compute and store similarity scores for every input CSV in ``local_dir``.

    Reference fingerprints are loaded from the 24 batch files under
    ``../morgan_fingerprints/encoded_fingerprints/``. For each file in
    ``local_dir`` named ``data_*.csv`` (first column: ChEMBL id, second
    column: SMILES, first row: header), every parsable compound is scored
    against all reference fingerprints and the result is written to
    ``similarity_scores_<chembl_id>.parquet`` in the module-level
    ``output_dir``.

    Args:
        local_dir: Directory containing the ``data_*.csv`` input files.

    Returns:
        None. Failures for an individual compound are printed and do not stop
        the remaining work.
    """
    df_pg = _load_reference_fingerprints()

    # sorted() gives a reproducible processing order across runs.
    for file_name in sorted(os.listdir(local_dir)):
        if not (file_name.startswith('data_') and file_name.endswith('.csv')):
            continue

        file_path = os.path.join(local_dir, file_name)
        fingerprints = _read_source_fingerprints(file_path)

        with ThreadPoolExecutor(max_workers=8) as executor:
            future_to_fp = {
                executor.submit(
                    process_single_fingerprint,
                    fp_csv,
                    df_pg,
                    chembl_id_csv
                ): chembl_id_csv for chembl_id_csv, fp_csv in fingerprints
            }
            # as_completed() snapshots the futures into its own set, so
            # removing entries from the dict while looping is safe.
            for future in as_completed(future_to_fp):
                # Drop our reference so the (large) result list can be freed
                # once it has been written, instead of living until the pool
                # shuts down.
                chembl_id_csv = future_to_fp.pop(future)
                try:
                    similarity_scores = future.result()
                    similarity_df = pd.DataFrame(
                        similarity_scores,
                        columns=[
                            'source_chembl_id',
                            'target_chembl_id',
                            'similarity_score']
                    )
                    parquet_file_path = os.path.join(
                        output_dir,
                        f'similarity_scores_{chembl_id_csv}.parquet'
                    )
                    similarity_df.to_parquet(
                        parquet_file_path, index=False
                    )
                except Exception as e:
                    # Best-effort per compound: report and keep going.
                    print(f"Error processing {chembl_id_csv}: {e}")

# Run the whole pipeline on the CSV files found in `local_dir`; this executes
# immediately whenever the cell/script is run.
process_and_compute_similarity(local_dir)
