In [None]:
import os
import pandas as pd
import psycopg2
from rdkit import Chem
from rdkit.Chem import AllChem


# Connection settings come from the environment so that no credentials are
# stored in the notebook; only the port has a fallback value.
db_params = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT', '5432')
}

# Rows without a SMILES string are excluded at the source.
query = ("SELECT chembl_id, canonical_smiles "
         "FROM compound_structures "
         "WHERE canonical_smiles IS NOT NULL;")

conn = psycopg2.connect(**db_params)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query raises. An explicit close is
    # used because psycopg2's `with conn:` only ends the transaction.
    conn.close()


def compute_morgan_fingerprint(smiles):
    """Compute a hashed Morgan count fingerprint for one SMILES string.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        A dict mapping fingerprint index (int) to count (int) for the
        non-zero elements of a radius-2 Morgan fingerprint folded to 2048
        positions, or None when RDKit cannot parse `smiles`.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # GetMorganFingerprint accepts no size keyword (passing `n_bits` raises an
    # argument error); the hashed variant takes the length as `nBits`.
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)
    # Cast to plain ints so the result holds only built-in Python types.
    return {int(k): int(v) for k, v in fp.GetNonzeroElements().items()}
    
batch_size = 100000
# Ceiling division: a trailing partial batch still gets its own file.
num_batches = (len(df) + batch_size - 1) // batch_size
output_dir = './fingerprints'
os.makedirs(output_dir, exist_ok=True)

for batch_num in range(num_batches):
    start = batch_num * batch_size
    # Work on an explicit copy: adding a column to a slice of `df` triggers
    # pandas' SettingWithCopyWarning and has ambiguous semantics.
    batch_df = df.iloc[start:start + batch_size].copy()
    batch_df['morgan_fingerprint'] = batch_df['canonical_smiles'].apply(
        compute_morgan_fingerprint
    )

    # compute_morgan_fingerprint returns None for unparsable SMILES.
    batch_df = batch_df.dropna(subset=['morgan_fingerprint'])

    csv_file_path = os.path.join(
        output_dir, f"fingerprints_batch_{batch_num+1}.csv"
    )
    # NOTE(review): the fingerprint dict is written as its Python repr
    # (e.g. "{12: 1}"), which the later encoding cell's json.loads cannot
    # parse -- confirm the intended on-disk format.
    batch_df[['chembl_id', 'morgan_fingerprint']].to_csv(
        csv_file_path, index=False
    )
    print(f"Saved batch {batch_num+1} to {csv_file_path}")

In [None]:
import os
import csv
import base64
import json

# https://stackoverflow.com/questions/72051723/rle-algorithm-in-python
def run_length_encode(input_list):
    """Run-length encode a sequence.

    Args:
        input_list: An indexable, sliceable sequence (list, tuple, str, ...).

    Returns:
        A list of (value, run_length) tuples in order of appearance. An empty
        input yields an empty list.
    """
    # Guard first: indexing element 0 of an empty sequence raises IndexError.
    if len(input_list) == 0:
        return []

    encoding = []
    current = input_list[0]
    run_length = 1

    for item in input_list[1:]:
        if item == current:
            run_length += 1
        else:
            encoding.append((current, run_length))
            current = item
            run_length = 1

    # Flush the final run, which the loop never appends.
    encoding.append((current, run_length))
    return encoding

def encode_fingerprint(fingerprint):
    """Return a base64 text encoding of the run-length-encoded fingerprint.

    The fingerprint is run-length encoded, the runs are serialised as JSON,
    and the UTF-8 bytes of that JSON are base64 encoded.

    Args:
        fingerprint: Sequence accepted by `run_length_encode`.

    Returns:
        The base64 payload as a str.
    """
    runs = run_length_encode(fingerprint)
    payload = json.dumps(runs).encode('utf-8')
    return base64.b64encode(payload).decode('utf-8')

input_dir = './fingerprints'
output_dir = './encoded_fingerprints'
os.makedirs(output_dir, exist_ok=True)

# sorted() gives a stable processing order; os.listdir order is arbitrary.
for input_file_name in sorted(os.listdir(input_dir)):
    input_file_path = os.path.join(input_dir, input_file_name)
    # os.listdir also returns sub-directories (e.g. Jupyter's
    # .ipynb_checkpoints) and stray files; only regular CSV files are encoded.
    if not (os.path.isfile(input_file_path)
            and input_file_name.endswith('.csv')):
        continue
    output_file_path = os.path.join(output_dir, f"encoded_{input_file_name}")

    # The csv module expects newline='' on files it reads as well as writes,
    # so that newlines embedded in quoted fields are handled correctly.
    with open(input_file_path, mode='r', newline='') as input_file, \
            open(output_file_path, mode='w', newline='') as output_file:

        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        # Copy the header row through unchanged.
        headers = next(reader)
        writer.writerow(headers)

        for row in reader:
            chembl_id = row[0]
            # NOTE(review): json.loads needs valid JSON here, and
            # run_length_encode indexes the result as a sequence. The first
            # cell writes a Python dict repr instead -- confirm the format.
            fingerprint = json.loads(row[1])
            encoded_fingerprint = encode_fingerprint(fingerprint)
            writer.writerow([chembl_id, encoded_fingerprint])

        print(f"Encoded fingerprints saved to {output_file_path}")


In [None]:
import boto3
import os

bucket_name = 'de-school-2024-aws'
s3_prefix = 'final_task/valentin_krivolutskii/fingerprints/'

# Pass None (not '') when a variable is unset or empty. Empty strings count as
# explicit credentials / an explicit region, which stops boto3 from falling
# back to its default resolution (shared config, instance role, ...).
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID') or None,
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY') or None,
    region_name=os.getenv('AWS_REGION') or None
)


def upload_to_s3(file_path, bucket, prefix):
    """Upload a local file to S3 under `prefix`, keeping its base name.

    Uses the module-level `s3` client.

    Args:
        file_path: Path of the local file to upload.
        bucket: Name of the destination bucket.
        prefix: Key prefix; the file's base name is appended to it.
    """
    # Local import keeps this block self-contained.
    import posixpath

    # S3 keys always use '/', so join with posixpath rather than os.path,
    # whose separator depends on the operating system.
    key = posixpath.join(prefix, os.path.basename(file_path))
    s3.upload_file(file_path, bucket, key)
    print(f"Uploaded {file_path} to s3://{bucket}/{prefix}")

# NOTE: `output_dir` is whatever the most recently run cell assigned. It is
# './encoded_fingerprints' only if the encoding cell ran after the first cell,
# which sets it to './fingerprints'.
# sorted() gives a stable upload order; os.listdir order is arbitrary.
for file_name in sorted(os.listdir(output_dir)):
    file_path = os.path.join(output_dir, file_name)
    # upload_file needs a regular file; skip sub-directories such as
    # Jupyter's .ipynb_checkpoints.
    if not os.path.isfile(file_path):
        continue
    upload_to_s3(file_path, bucket_name, s3_prefix)