In [1]:
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [2]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

In [3]:
# Location of the peptide table; kept in one named constant so it is easy to
# spot and change. The first CSV column is used as the row index.
PEPTIDES_CSV = '/home/dreamtim/Coding/ITMO/itmo-cpp/output_data/all_peptides_with_smiles.csv'

df = pd.read_csv(PEPTIDES_CSV, index_col=0)
# Show the available columns as the cell output.
df.columns

Index(['id', 'sequence', 'extra_name', 'cpp_category', 'is_cpp', 'cpp_type',
       'origin', 'id_uptake', 'peptide', 'uptake_type', 'raw_efficiency',
       'raw_toxicity', 'raw_concentration', 'id_experiment',
       'peptide_experiment', 'raw_time', 'method', 'cell_line', 'cargo',
       'mechanism', 'raw_temperature', 'id_article', 'doi', 'pubmed_id',
       'title', 'sequence_category', 'standard_sequence', 'nh3_tail',
       'po3_pos', 'biotinylated', 'acylated_n_terminal', 'cyclic', 'amidated',
       'stearyl_uptake', 'hexahistidine_tagged', 'modifications',
       'smiles_sequence'],
      dtype='object')

In [4]:
# Keep only the SMILES string and the CPP flag, then discard every row in
# which either of the two is missing.
df = df.loc[:, ['smiles_sequence', 'is_cpp']].dropna()
df

Unnamed: 0,smiles_sequence,is_cpp
0,CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[...,True
3,CCC(C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC...,True
7,CC(C)C[C@@H](NC(=O)[C@@H](C)NC(=O)[C@@H](CCCCN...,True
8,CC(C)C[C@@H](NC(=O)[C@@H](CS)NC(=O)[C@@H](CC(C...,True
9,C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](...,True
...,...,...
18348,CC(C)C[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H...,True
18349,NCCCC[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(...,True
18350,CCCCCCCCCCCCCCCC(=O)N[C@@H](C)C(=O)NCC(=O)N[C@...,True
18351,CCCCCCCCCCCCCCCC(=O)N[C@@H](C)C(=O)NCC(=O)N[C@...,True


In [5]:
# Build the modelling frame: two columns renamed to `smiles` / `label`,
# rows with missing values or an empty SMILES string removed.
column_names = {
    'smiles_sequence': 'smiles',
    'is_cpp': 'label'
}
labelled = df[['smiles_sequence', 'is_cpp']].dropna().rename(columns=column_names)
# `.copy()` detaches the result from `df` so later edits do not touch it.
data = labelled.query('smiles != ""').copy()

In [6]:
# Basic preprocessing
def validate_smiles(smiles):
    """Return True when RDKit can parse ``smiles`` into a molecule.

    Args:
        smiles: Candidate SMILES string.

    Returns:
        bool: True if ``Chem.MolFromSmiles`` produced a molecule object,
        False if it returned None or raised an ordinary exception.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Only ordinary errors count as "invalid". A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit, so interrupting a long
        # `.apply` would silently mark a molecule as invalid and keep going.
        return False
    return mol is not None

In [7]:
# Keep only the rows whose SMILES string passes `validate_smiles`.
smiles_is_valid = data['smiles'].apply(validate_smiles)
data = data[smiles_is_valid]

In [8]:
# One row per distinct SMILES string (the first occurrence is kept).
data = data.drop_duplicates(subset=['smiles'])

# Down-sample every label group to the size of the smallest one so that the
# classes are balanced; the fixed seed makes the draw repeatable.
class_counts = data['label'].value_counts()
smallest_class = min(class_counts)
data = data.groupby('label').sample(n=smallest_class, random_state=42)

In [9]:
def formatting_input(current_query, history):
    """Build a multi-round "Human / Assistant" prompt string.

    Args:
        current_query: The question for the round that is still open.
        history: Sequence of ``(query, answer)`` pairs for the rounds already
            completed, oldest first. May be empty.

    Returns:
        str: One "[Round i]" section per completed pair (numbered from 0),
        followed by a final section for ``current_query`` that ends with
        "Assistant:" and no answer.
    """
    input_text = ''
    # `enumerate` supplies the round number; iterating `history` directly
    # would try to unpack each (query, answer) pair as (idx, (query, answer))
    # and fail for any non-empty history.
    for idx, (query, answer) in enumerate(history):
        input_text += f"[Round {idx}]\nHuman: {query}\nAssistant: {answer}\n"
    input_text += f"[Round {len(history)}]\nHuman: {current_query}\nAssistant:"
    return input_text

In [10]:
# Turn one labelled row into a single instruction-tuning text sample.
def format_instruction(row):
    """Create the training text for one row.

    Args:
        row: Mapping with a truthy/falsy ``'label'`` and a ``'smiles'`` string.

    Returns:
        dict: ``{'text': ...}`` where the text is the single-round prompt from
        ``formatting_input`` followed by a space and the row's SMILES.
    """
    if row['label']:
        task = "Generate a cell-penetrating peptide in SMILES format"
    else:
        task = "Generate a non-cell-penetrating peptide in SMILES format"
    chat_prefix = formatting_input(task, history=[])
    return {'text': chat_prefix + " " + row['smiles']}

# Wrap the frame in a HuggingFace Dataset, then add the `text` field that
# `format_instruction` returns for every row.
hf_rows = Dataset.from_pandas(data)
dataset = hf_rows.map(format_instruction)

Map:   0%|          | 0/2642 [00:00<?, ? examples/s]

In [11]:
# Hold out 20% of the samples for validation; the seed fixes the shuffle.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_data, val_data = train_test['train'], train_test['test']

In [12]:
# Tokenization helper used with `Dataset.map`.
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the global tokenizer.

    Every sequence is padded or truncated to 256 tokens and the tokenizer is
    asked for PyTorch tensors.

    Args:
        examples: Mapping that holds the text(s) under the key ``'text'``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for this call.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    return encoded

In [14]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name_or_id = "OpenDFM/ChemDFM-13B-v1.0"

tokenizer = LlamaTokenizer.from_pretrained(model_name_or_id)

# Half-precision weights; device placement is delegated to the library via
# device_map="auto".
load_options = dict(torch_dtype=torch.float16, device_map="auto")
model = LlamaForCausalLM.from_pretrained(model_name_or_id, **load_options)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [16]:
# Example question, wrapped in the single-round "Human / Assistant" template.
input_text = "Can you please give detailed descriptions of the molecule below?\nCl.O=C1c2c(O)cccc2-c2nn(CCNCCO)c3ccc(NCCNCCO)c1c23"
input_text = f"[Round 0]\nHuman: {input_text}\nAssistant:"

# Encode the prompt as PyTorch tensors and move them to the GPU.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
inputs = encoded_prompt.to("cuda")

# Sampling settings passed to `model.generate`.
sampling_options = dict(
    do_sample=True,
    top_k=20,
    top_p=0.9,
    temperature=0.9,
    max_new_tokens=1024,
    repetition_penalty=1.05,
    eos_token_id=tokenizer.eos_token_id,
)
generation_config = GenerationConfig(**sampling_options)

In [17]:
# Sample a continuation for the encoded prompt.
outputs = model.generate(**inputs, generation_config=generation_config)

# The returned sequences start with the prompt tokens, so drop them by token
# count before decoding. Cutting the decoded string at len(input_text)
# characters is only correct when decoding reproduces the prompt exactly,
# which a tokenizer that normalises whitespace does not guarantee.
prompt_length = inputs["input_ids"].shape[1]
new_tokens = outputs[:, prompt_length:]
generated_text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
print(generated_text.strip())

KeyboardInterrupt: 

In [None]:
# Tokenize both splits in batches: training split first, then validation.
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data)
)