# 3.0 - Final Model Training (HuggingFace)

Fine-tune a transformer model on combined processed datasets.

In [None]:
# Cell 1 - imports & config load
import yaml
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the training configuration from YAML when the file exists; otherwise
# fall back to a small built-in default so the notebook still runs.
CONFIG = Path('configs/training_config.yaml')
if CONFIG.exists():
    # Context manager closes the file handle deterministically; YAML is
    # read as UTF-8 regardless of the platform's default encoding.
    with CONFIG.open(encoding='utf-8') as config_file:
        cfg = yaml.safe_load(config_file)
else:
    cfg = {
        'model': {'hf_model': 'bert-base-multilingual-cased', 'max_length': 256},
        'data': {'test_size': 0.1},
        'training_args': {
            'num_train_epochs': 1,
            'per_device_train_batch_size': 8,
            'per_device_eval_batch_size': 8,
            'evaluation_strategy': 'epoch',
        },
    }
print(cfg)

In [None]:
# Cell 2 - dataset assembly helper
from glob import glob

def load_processed_data(folder='data/processed'):
    """Combine every usable CSV in *folder* into one two-column DataFrame.

    A file is usable when it has a ``label`` column and either a
    ``clean_text`` column or at least one column whose name contains
    ``text`` or ``content`` (case-insensitive); the first such column is
    then used as ``clean_text``. Unusable and completely empty files are
    skipped.

    Args:
        folder: Directory that is searched (non-recursively) for ``*.csv``.

    Returns:
        A DataFrame with the columns ``clean_text`` and ``label`` and a
        fresh 0..n-1 index. It has zero rows when no usable file is found.
        Missing text values are kept as NaN so callers can drop them.
    """
    frames = []
    # glob() returns files in filesystem order; sort so the row order of the
    # combined frame is reproducible across machines and runs.
    for csv_path in sorted(glob(folder + '/*.csv')):
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV has no columns at all, so it is as unusable as
            # a file without a text/label column: skip it.
            continue
        if 'label' not in df.columns:
            continue
        if 'clean_text' not in df.columns:
            text_cols = [
                c for c in df.columns
                if 'text' in str(c).lower() or 'content' in str(c).lower()
            ]
            if not text_cols:
                continue
            source = df[text_cols[0]]
            # A plain astype(str) would turn missing values into the literal
            # string 'nan', which a later dropna() cannot remove. Stringify
            # only the present values and leave the rest as NaN.
            df['clean_text'] = source.astype(str).where(source.notna())
        frames.append(df[['clean_text', 'label']])
    if not frames:
        return pd.DataFrame(columns=['clean_text', 'label'])
    return pd.concat(frames, ignore_index=True)

# Build the combined dataset from the default folder and report how many
# rows were collected.
full = load_processed_data()
print(f'combined rows {len(full)}')

In [None]:
# Cell 3 - dataset to HF dataset
from datasets import Dataset
# Drop incomplete rows *before* the emptiness check: a frame whose rows are
# all incomplete would otherwise pass the check and then fail inside
# train_test_split. Dropping on an already-empty frame is a no-op.
full = full.dropna()
if full.empty:
    print("No processed data found. Skipping HF training cells.")
else:
    # dropna() leaves gaps in the pandas index; preserve_index=False keeps
    # that index from being stored as an extra `__index_level_0__` column.
    hf = Dataset.from_pandas(full, preserve_index=False)
    # Split into 'train' / 'test' using the configured hold-out fraction
    # (10% when the config does not specify one).
    hf = hf.train_test_split(test_size=cfg['data'].get('test_size', 0.1))
    print("HF dataset prepared.")

In [None]:
# Cell 4 - tokenizer & model prep
if len(full):
    # The checkpoint name and the number of distinct labels are reused by
    # the training cell when it builds the classification model.
    model_name = cfg['model']['hf_model']
    num_labels = len(
        sorted(full['label'].unique())
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(batch):
        """Tokenize a batch of ``clean_text`` values.

        Sequences are truncated and padded to the ``max_length`` set in the
        model section of the config.
        """
        texts = batch['clean_text']
        return tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=cfg['model']['max_length'],
        )

    # Tokenize in batches, then drop the raw text column from the result.
    hf = hf.map(tokenize, batched=True).remove_columns(['clean_text'])
    print("Tokenization complete.")

In [None]:
# Cell 5 - training (lightweight)
if len(full) > 0:
    # Imported once for the whole cell instead of on every evaluation call.
    import inspect

    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # Work on a copy so the loaded config itself is never mutated.
    training_kwargs = dict(cfg['training_args'])
    # The built-in fallback config has no output_dir, which older
    # transformers releases require. A value from the config always wins.
    training_kwargs.setdefault('output_dir', 'models/checkpoints')
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`. Translate the key only when the installed version no
    # longer accepts the old name.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    if (
        'evaluation_strategy' in training_kwargs
        and 'evaluation_strategy' not in accepted_args
        and 'eval_strategy' in accepted_args
    ):
        legacy_value = training_kwargs.pop('evaluation_strategy')
        training_kwargs.setdefault('eval_strategy', legacy_value)
    training_args = TrainingArguments(**training_kwargs)

    def compute_metrics(eval_pred):
        """Return accuracy and weighted F1 for one evaluation pass.

        ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted
        class is the argmax of the predictions along axis 1.
        """
        preds, labels = eval_pred
        preds = np.argmax(preds, axis=1)
        return {
            'accuracy': accuracy_score(labels, preds),
            'f1': f1_score(labels, preds, average='weighted')
        }

    # Newer transformers releases take the tokenizer as `processing_class`
    # and deprecate the `tokenizer` keyword; use whichever the installed
    # Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf['train'],
        eval_dataset=hf['test'],
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )
    trainer.train()
    trainer.save_model('models/final/transformer_model')
    print('Saved transformer model to models/final/transformer_model')