In [None]:
!source activate cs336_data
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  
import pandas as pd
import numpy as np
import wandb
import shutil  
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset 
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding 
)


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
MODEL_CHECKPOINT = "roberta-large"    # checkpoint id handed to from_pretrained
MAX_LEN = 512                         # max_length used when tokenizing with truncation
CACHE_DIR = "./hf_cache"              # passed as cache_dir when loading tokenizer/model
OUTPUT_DIR = "./temp_sweep_storage"   # parent folder for per-run output directories

# Both working directories must exist before anything tries to write to them.
for _required_dir in (OUTPUT_DIR, CACHE_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Class name -> integer label id, numbered in listing order.
LABEL_MAP = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ("Llama-3.1", "Qwen-2.5", "Mistral-v0.3", "Granite-3.1", "GLM-4")
    )
}
num_labels = len(LABEL_MAP)

# Read the combined training and validation CSVs (paths are relative to the
# current working directory).
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../all_domain_data/train_combined.csv",
        "../all_domain_data/val_combined.csv",
    )
)

def clean_and_prep(df, label_map=None):
    """Return a cleaned copy of *df* with a ``text`` column and an integer ``label``.

    Processing steps, in order:

    1. An existing ``text`` column is renamed to ``source_text`` and a
       ``summary`` column is renamed to ``text``, so the summary becomes the
       text that is kept and labelled.
    2. Rows whose ``text`` is missing, or blank once stripped, are dropped.
    3. ``label`` is derived from the ``model`` column: the part of each
       label-map key before its first "-" is lower-cased and searched for as
       a substring of the lower-cased model name (key ``"Llama-3.1"`` matches
       any model name containing ``"llama"``). Keys are tried in the map's
       iteration order and the first match wins. Rows that match no key are
       dropped.

    The caller's DataFrame is never modified, so the function can safely be
    called again on the same frame (e.g. when a notebook cell is re-run).

    Args:
        df: DataFrame with a ``model`` column and a ``summary`` column
            (``text`` is optional and is preserved as ``source_text``).
        label_map: Optional mapping of class name to integer id. Defaults to
            the module-level ``LABEL_MAP``.

    Returns:
        A new DataFrame containing only the rows that survived cleaning, with
        ``text`` as ``str`` and an added integer ``label`` column.

    Raises:
        KeyError: If, after renaming, the frame has no ``text`` or no
            ``model`` column.
    """
    if label_map is None:
        label_map = LABEL_MAP

    # Build the rename mapping up front and apply it to a copy; renaming the
    # caller's frame in place would make a second call see 'text' where it
    # expects 'summary'.
    renames = {}
    if 'text' in df.columns:
        renames['text'] = 'source_text'
    if 'summary' in df.columns:
        renames['summary'] = 'text'
    df = df.rename(columns=renames)

    missing = [col for col in ('text', 'model') if col not in df.columns]
    if missing:
        raise KeyError(
            f"clean_and_prep: required column(s) {missing} not found after "
            f"renaming; available columns: {list(df.columns)}"
        )

    df = df.dropna(subset=['text']).copy()
    df['text'] = df['text'].astype(str)
    df = df[df['text'].str.strip() != ""].copy()

    # Pre-compute the lower-cased family prefix of every key once instead of
    # once per row.
    prefixes = [(key.split("-")[0].lower(), val) for key, val in label_map.items()]

    def encode(model_name):
        name = str(model_name).lower()
        for prefix, val in prefixes:
            if prefix in name:
                return val
        return -1  # sentinel: no known model family matched

    df['label'] = df['model'].apply(encode)
    df = df[df['label'] != -1].copy()
    return df

# Clean and label both splits (training first, then validation).
df_train_sweep, df_val_sweep = map(clean_and_prep, (df_train, df_val))


# Tokenize
# Load the tokenizer that belongs to MODEL_CHECKPOINT, caching files in CACHE_DIR.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    cache_dir=CACHE_DIR,
)

def preprocess_function(examples):
    """Tokenize ``examples["text"]``, truncating to at most ``MAX_LEN`` tokens.

    No padding is requested here; the tokenizer output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=MAX_LEN, truncation=True)
    return encoded

# Wrap the cleaned DataFrames as Hugging Face Datasets (train, then val).
hf_train_sweep, hf_val_sweep = (
    Dataset.from_pandas(frame) for frame in (df_train_sweep, df_val_sweep)
)

# Tokenize the sweep splits in batches; these are the datasets the Trainer uses.
sweep_train_dataset, sweep_val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (hf_train_sweep, hf_val_sweep)
)

# Sweep config
# W&B sweep definition: Bayesian search that maximizes validation accuracy,
# with Hyperband early termination of weak runs.
sweep_config = {
    'method': 'bayes',
    # The sweep metric name must match a key that is actually logged to W&B.
    # The Hugging Face Trainer's W&B integration reports evaluation metrics
    # under an "eval/" prefix, so the "accuracy" value returned by
    # compute_metrics is logged as "eval/accuracy".
    'metric': {'name': 'eval/accuracy', 'goal': 'maximize'},
    'early_terminate': {'type': 'hyperband', 'min_iter': 1},
    'parameters': {
        # Continuous range (only min/max given).
        'learning_rate': {
            'min': 3.85e-5,
            'max': 1e-4,
        },
        'weight_decay': {
            'values': [0.15, 0.2, 0.25],
        },
        # Used as the per-device train batch size; eval uses twice this value.
        'batch_size': {
            'values': [16, 32, 64],
        },
        'warmup_ratio': {
            'values': [0.1, 0.15, 0.2],
        },
        # Upper bound on epochs; early stopping may end a run sooner.
        'epochs': {
            'values': [3, 5, 7],
        },
    },
}

#train
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (macro average).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    # Only F1 is reported, so precision/recall/support are discarded.
    # zero_division=0 yields the same value as the default ("warn" also
    # substitutes 0) but without emitting a warning every time a class is
    # never predicted, which is common in early epochs.
    _, _, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    return {"accuracy": acc, "f1": f1}

def train_sweep():
    """Run a single W&B sweep trial: fine-tune, evaluate, report, clean up.

    Reads the hyperparameters chosen for this trial from ``wandb.config``
    (``learning_rate``, ``batch_size``, ``weight_decay``, ``warmup_ratio``,
    ``epochs``), loads a fresh ``MODEL_CHECKPOINT`` sequence classifier with
    ``num_labels`` outputs, and trains it on ``sweep_train_dataset`` while
    evaluating on ``sweep_val_dataset`` after every epoch. Training stops
    early via ``EarlyStoppingCallback(early_stopping_patience=1)`` and the
    best checkpoint by accuracy is reloaded before the final evaluation.

    Checkpoints are written to ``OUTPUT_DIR/<run name>``. That directory is
    deleted when the trial ends, whether it finished or raised, so failed
    trials do not leave checkpoints behind. Exceptions from training still
    propagate to the caller.
    """
    wandb.init(project="LLM-Authorship-Attribution-Sweep3")
    config = wandb.config

    #Setup Temp Dir
    run_output_dir = f"{OUTPUT_DIR}/{wandb.run.name}"
    print(f"--> Temp Dir: {run_output_dir}")

    try:
        #Load Model
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_CHECKPOINT,
            num_labels=num_labels,
            cache_dir=CACHE_DIR
        )

        # Define Args
        training_args = TrainingArguments(
            output_dir=run_output_dir,
            report_to="wandb",
            save_total_limit=1,
            load_best_model_at_end=True,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size * 2,
            weight_decay=config.weight_decay,
            # Forward the swept warmup ratio; without this the sweep varies
            # the value but training never uses it.
            warmup_ratio=config.warmup_ratio,
            num_train_epochs=config.epochs,
            bf16=True,
            tf32=True,
            dataloader_num_workers=4,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            metric_for_best_model="accuracy",
            logging_steps=50,
        )

        # Create Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=sweep_train_dataset,
            eval_dataset=sweep_val_dataset,
            tokenizer=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
        )

        trainer.train()
        metrics = trainer.evaluate()

        # The Trainer's W&B integration reports this value as "eval/accuracy".
        # Also log it under the plain key "accuracy" so a sweep whose metric
        # is named "accuracy" can read the final result of the trial.
        wandb.log({"accuracy": metrics["eval_accuracy"]})
        print(f" >> Run Finished. Accuracy: {metrics['eval_accuracy']:.4f}")
    finally:
        # Clean up disk to save space. Runs on failure too; the directory may
        # not exist yet if the trial failed before the first checkpoint.
        if os.path.isdir(run_output_dir):
            try:
                shutil.rmtree(run_output_dir)
                print("--> Cleanup successful.")
            except OSError as e:
                # Best effort: a leftover folder must not fail the trial.
                print(f"Error deleting folder: {e}")

 

In [None]:
# start sweep
# The project name is the same string that train_sweep passes to wandb.init.
_SWEEP_PROJECT = "LLM-Authorship-Attribution-Sweep3"
_SWEEP_RUN_COUNT = 10  # value passed as the agent's `count`

# Register the sweep definition with W&B and keep the id it returns.
sweep_id = wandb.sweep(sweep_config, project=_SWEEP_PROJECT)

# Hand train_sweep to an agent for this sweep.
print("Starting Sweep Agent")
wandb.agent(sweep_id, function=train_sweep, count=_SWEEP_RUN_COUNT)

print("Sweep Complete. Check W&B Dashboard.")