# 🧬 BioDockify AI: ChemBERTa Training Pipeline

This notebook trains AI models for drug discovery using the **ChemBERTa** architecture.
It is part of the **ai.biodockify.com** zero-cost platform.

**Steps:**
1. Install Dependencies
2. Log in to Hugging Face (required for uploading)
3. Download Data from ChEMBL (e.g., Alzheimer's, Cancer)
4. Train Model (Free GPU)
5. Upload to Hugging Face

In [None]:
# Step 1: Install Dependencies
!pip install simpletransformers chembl_webresource_client pandas scikit-learn transformers

In [None]:
# Step 2: Login to Hugging Face (For Uploading)
# Authenticates this notebook session with the Hugging Face Hub so that the
# upload in Step 5 can push files.
# NOTE(review): presumably the token entered here needs write access - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Step 3: Download Training Data (Alzheimer's - BACE1)
import pandas as pd
from chembl_webresource_client.new_client import new_client

def download_chembl_data(target_chembl_id, output_name):
    """Download IC50 activities for a ChEMBL target and label them active/inactive.

    Args:
        target_chembl_id: ChEMBL target identifier, e.g. ``"CHEMBL4822"``.
        output_name: Human-readable dataset name; only used in the progress
            message.

    Returns:
        A DataFrame with the columns ``smiles`` and ``labels`` (1 when the
        IC50 is below 1000 nM, 0 otherwise), holding one row per unique
        SMILES string. Both columns are present even when no usable record
        was found, so callers can rely on the schema.
    """
    print(f"Downloading {output_name} ({target_chembl_id})...")
    activities = new_client.activity
    res = activities.filter(target_chembl_id=target_chembl_id).filter(standard_type="IC50")

    data = []
    for act in res:
        smiles = act.get('canonical_smiles')
        raw_value = act.get('standard_value')
        if not smiles or not raw_value:
            continue
        # The activity cut-off below is expressed in nM, so a measurement
        # reported in any other unit cannot be compared against it.
        if act.get('standard_units') != 'nM':
            continue
        try:
            value_nm = float(raw_value)
        except (TypeError, ValueError):
            # One malformed record should not abort the whole download.
            continue
        data.append({
            'smiles': smiles,
            'labels': 1 if value_nm < 1000 else 0  # Active < 1000nM
        })

    # Naming the columns up front keeps drop_duplicates() from raising a
    # KeyError when no record survived the filtering above.
    df = pd.DataFrame(data, columns=['smiles', 'labels'])
    df = df.drop_duplicates(subset=['smiles'])
    return df

# Fetch the Alzheimer's training set: IC50 activities recorded against BACE1.
bace1_target_id = "CHEMBL4822"
df = download_chembl_data(bace1_target_id, "Alzheimers")
print(f"Downloaded {len(df)} compounds")
# Last expression in the cell, so the notebook renders a preview of the rows.
df.head()

In [None]:
# Step 4: Train ChemBERTa Model
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.model_selection import train_test_split
import torch

# Hold out 20% of the compounds for evaluation. The fixed seed makes the split,
# and therefore the reported metrics, reproducible across notebook runs.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): use_early_stopping appears to take effect only when
# evaluate_during_training=True and an eval_df is passed to train_model();
# as configured, all 3 epochs presumably always run - confirm against the
# simpletransformers documentation before relying on early stopping.
model_args = ClassificationArgs(
    num_train_epochs=3,
    overwrite_output_dir=True,
    use_early_stopping=True,
    save_steps=-1,
    train_batch_size=32
)

# Requesting CUDA on a runtime without a GPU makes model construction fail,
# so only enable it when a device is actually available.
use_gpu = torch.cuda.is_available()
if not use_gpu:
    print("No GPU detected - training on CPU (this will be slow).")

# Fine-tune the pretrained SMILES checkpoint as a binary (active/inactive) classifier.
model = ClassificationModel(
    'roberta',
    'seyonec/PubChem10M_SMILES_BPE_450k',
    num_labels=2,
    args=model_args,
    use_cuda=use_gpu  # Uses the Colab GPU when one is attached
)

model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)
print(result)

In [None]:
# Step 5: Upload to Hugging Face
# REPLACE 'tajo9128' WITH YOUR USERNAME
hf_username = "tajo9128"
repo_name = "biodockify-ai-alzheimers"
# Built once so the upload target and the printed URL can never disagree.
repo_id = f"{hf_username}/{repo_name}"

# Save locally first
# NOTE(review): simpletransformers' save_model() appears to write the weights
# and tokenizer only when the network is passed via `model`; without it the
# folder is created but left empty. Passing it is harmless either way.
model.save_model(repo_name, model=model.model)

# Upload
from huggingface_hub import HfApi
api = HfApi()
# upload_folder() expects the target repo to exist; exist_ok=True makes
# re-running this cell safe once the repo has been created.
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path=repo_name,
    repo_id=repo_id,
    repo_type="model"
)
print(f"Uploaded to https://huggingface.co/{repo_id}")