In [1]:
import pandas as pd
df = pd.read_csv('Final_Populism.csv')
df = df[['Summary', 'Original_Text', 'Article_Title','Is_Populist']]


In [2]:

import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model
import evaluate

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

MODEL_NAME = "facebook/bart-base"   
MAX_SRC = 1024
MAX_SUM = 128
DROP_TITLE_PROB = 0.5  
USE_LORA = True


df['Is_Populist'] = df['Is_Populist'].astype(int).clip(0,1)

def make_prompt(title, text, may_drop=False):
    use_title = bool(title) and (not may_drop or random.random() > DROP_TITLE_PROB)
    prefix = (title.strip() + ". ") if use_title else ""
    return (prefix + (text or "")).strip()

def row_to_examples(row, may_drop=False, upsample_pos=True, pos_factor=2):
    exs = []

    x_sum = "summarize: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    exs.append({"task":"summarize", "input": x_sum, "target": str(row["Summary"]).strip()})

    x_cls = "classify_populism: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    y = int(row["Is_Populist"])
    cls_ex = {"task":"classify", "input": x_cls, "target": str(y)}
    exs.append(cls_ex)

    if upsample_pos and y == 1 and pos_factor > 1:
        for _ in range(pos_factor - 1):
            exs.append(cls_ex.copy())

    return exs

from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Is_Populist'])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Is_Populist'])

# TRAIN: title-dropout + positive upsampling
train_examples = [ex
    for _, r in train_df.iterrows()
    for ex in row_to_examples(r, may_drop=True, upsample_pos=True, pos_factor=2)  # try 2; tune to 3–4 if recall still 0
]

# VAL/TEST: NO TITLE and NO upsampling (mirror deployment)
def row_to_examples_no_title(row):
    return [
        {"task":"summarize", "input": "summarize: " + (row['Original_Text'] or ""), "target": str(row["Summary"]).strip()},
        {"task":"classify",  "input": "classify_populism: " + (row['Original_Text'] or ""), "target": str(int(row["Is_Populist"]))}
    ]

val_examples  = [ex for _, r in val_df.iterrows()  for ex in row_to_examples_no_title(r)]
test_examples = [ex for _, r in test_df.iterrows() for ex in row_to_examples_no_title(r)]


train_ds = Dataset.from_list(train_examples)
val_ds   = Dataset.from_list(val_examples)
test_ds  = Dataset.from_list(test_examples)

val_sum  = val_ds.filter(lambda ex: ex["task"]=="summarize")
val_cls  = val_ds.filter(lambda ex: ex["task"]=="classify")
test_sum = test_ds.filter(lambda ex: ex["task"]=="summarize")
test_cls = test_ds.filter(lambda ex: ex["task"]=="classify")


tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tok.padding_side = "right"
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token  


base = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,       
    attn_implementation="eager"
).to("cuda" if torch.cuda.is_available() else "cpu")

# model = get_peft_model(base, LoraConfig(task_type="SEQ_2_SEQ_LM", r=16, lora_alpha=32, lora_dropout=0.05)) if USE_LORA else base

lora_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","out_proj","fc1","fc2"],
    modules_to_save=["lm_head"], 
)

model = get_peft_model(base, lora_cfg)

# collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model, return_tensors="pt")
        
def tokenize_fn(batch):
    ins = tok(
        batch["input"],
        truncation=True,
        max_length=MAX_SRC,
        padding=False,                 # <— was "max_length"
    )
    labs = tok(
        text_target=batch["target"],
        truncation=True,
        max_length=MAX_SUM,
        padding=False,                 # <— was "max_length"
    )
    ins["labels"] = labs["input_ids"]
    return ins

collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    model=model,
    return_tensors="pt",
    pad_to_multiple_of=8               # <— keeps fp16-friendly padding
)



train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
val_sum_tok  = val_sum.map(tokenize_fn, batched=True, remove_columns=val_sum.column_names)
val_cls_tok  = val_cls.map(tokenize_fn, batched=True, remove_columns=val_cls.column_names)
test_sum_tok = test_sum.map(tokenize_fn, batched=True, remove_columns=test_sum.column_names)
test_cls_tok = test_cls.map(tokenize_fn, batched=True, remove_columns=test_cls.column_names)






model.gradient_checkpointing_enable()
model.config.use_cache = False
model.train()

rouge = evaluate.load("rouge")
acc   = evaluate.load("accuracy"); prec = evaluate.load("precision")
rec   = evaluate.load("recall");   f1   = evaluate.load("f1")

def _decode(pred_ids, label_ids):
    pred_ids  = np.where(pred_ids  != -100, pred_ids,  tok.pad_token_id)
    label_ids = np.where(label_ids != -100, label_ids, tok.pad_token_id)
    preds  = tok.batch_decode(pred_ids,  skip_special_tokens=True)
    labels = tok.batch_decode(label_ids, skip_special_tokens=True)
    return preds, labels

def summarize_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    r = rouge.compute(predictions=preds, references=labels, use_stemmer=True)
    return {f"rouge_{k}": v for k,v in r.items()}

def classify_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    preds_bin  = [1 if (p.strip() and p.strip()[0]=='1') else 0 for p in preds]
    labels_bin = [1 if (l.strip() and l.strip()[0]=='1') else 0 for l in labels]
    return {
        "accuracy":  acc.compute(predictions=preds_bin, references=labels_bin)["accuracy"],
        "precision": prec.compute(predictions=preds_bin, references=labels_bin)["precision"],
        "recall":    rec.compute(predictions=preds_bin, references=labels_bin)["recall"],
        "f1":        f1.compute(predictions=preds_bin, references=labels_bin)["f1"],
    }

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    output_dir="bart_multitask_lora",
    learning_rate=2e-4,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=32,
    gradient_accumulation_steps=4,
    weight_decay=0.01,

    fp16=True, bf16=False,            
    max_grad_norm=1.0,
    label_smoothing_factor=0.1,

    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rouge_rougeLsum",
    greater_is_better=True,
    save_total_limit=2,

    predict_with_generate=True,
    generation_num_beams=3,
    # generation_max_new_tokens=MAX_SUM,
    lr_scheduler_type="cosine",      
    warmup_ratio=0.06,
    generation_max_length=MAX_SUM,

    logging_steps=50,
    dataloader_pin_memory=True,
    gradient_checkpointing=True,
    # optim="adamw_bnb_8bit",
    optim="adamw_torch_fused",
    report_to="none",
    torch_compile=False,
    dataloader_num_workers=2,
    group_by_length=True
)


trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_sum_tok,
    # processing_class=tok,    
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=summarize_metrics
)

trainer.train()
print("Best metric:", trainer.state.best_metric)
print("Best checkpoint:", trainer.state.best_model_checkpoint)

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/19023 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge Rouge1,Rouge Rouge2,Rouge Rougel,Rouge Rougelsum
1,4.6809,4.26763,0.219664,0.071383,0.159296,0.159171
2,4.3645,4.132442,0.298937,0.093362,0.21248,0.2125
3,4.2584,4.094854,0.306981,0.099511,0.216608,0.216348
4,4.2191,4.088968,0.306655,0.098356,0.216579,0.216506




Best metric: 0.21650597924602138
Best checkpoint: bart_multitask_lora/checkpoint-1192


In [3]:
from transformers.utils import logging
logging.set_verbosity_error()  
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import GenerationConfig
sum_gc = GenerationConfig(
    do_sample=False, num_beams=5,
    max_new_tokens=MAX_SUM,
    length_penalty=1.1,         
    no_repeat_ngram_size=3,     
    min_new_tokens=16            
)


cls_gc = GenerationConfig(do_sample=False, num_beams=1, max_new_tokens=2)

trainer.compute_metrics = summarize_metrics

print("Validation – Summarization (no-title):",
      trainer.evaluate(val_sum_tok,  metric_key_prefix="sum_val",  generation_config=sum_gc))
print("Test – Summarization (no-title):",
      trainer.evaluate(test_sum_tok, metric_key_prefix="sum_test", generation_config=sum_gc))


def eval_cls_argmax_first_token(ds, model, tok, collator, batch_size=32):
    device = next(model.parameters()).device
    id0 = tok("0", add_special_tokens=False)["input_ids"][0]
    id1 = tok("1", add_special_tokens=False)["input_ids"][0]
    dec_start = model.config.decoder_start_token_id

    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collator)
    preds, refs = [], []
    model.eval()
    with torch.no_grad():
        for batch in dl:
            input_ids = batch["input_ids"].to(device)
            attn      = batch["attention_mask"].to(device)
            dec_inp   = torch.full((input_ids.size(0), 1), dec_start, dtype=torch.long, device=device)
            logits    = model(input_ids=input_ids, attention_mask=attn,
                              decoder_input_ids=dec_inp, use_cache=False).logits[:, -1, :]
            pred = (logits[:, id1] > logits[:, id0]).long().cpu().tolist()
            preds.extend(pred)

            lab0 = batch["labels"][:, 0].clone()
            lab0[lab0 == -100] = tok.pad_token_id
            ref = (lab0 == id1).long().cpu().tolist()
            refs.extend(ref)

    return {
        "accuracy":  accuracy_score(refs, preds),
        "precision": precision_score(refs, preds, zero_division=0),
        "recall":    recall_score(refs, preds, zero_division=0),
        "f1":        f1_score(refs, preds, zero_division=0),
        "pred_pos_rate": float(sum(preds))/len(preds)
    }

print("Validation – Classification (no-title):", eval_cls_argmax_first_token(val_cls_tok,  model, tok, collator))
print("Test – Classification (no-title):",       eval_cls_argmax_first_token(test_cls_tok, model, tok, collator))

@torch.no_grad()
def predict_summary_and_label(text: str):
    device = next(model.parameters()).device
    x = text.strip()

    # summarization
    in_sum = tok("summarize: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    out_sum = model.generate(**in_sum, num_beams=5, max_new_tokens=MAX_SUM, do_sample=False)
    summary = tok.decode(out_sum[0], skip_special_tokens=True).strip()

    # classification: first-token argmax
    in_cls = tok("classify_populism: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    dec_start = model.config.decoder_start_token_id
    dec_inp = torch.tensor([[dec_start]], device=device)
    logits = model(**in_cls, decoder_input_ids=dec_inp, use_cache=False).logits[:, -1, :]
    id0 = tok("0", add_special_tokens=False)["input_ids"][0]
    id1 = tok("1", add_special_tokens=False)["input_ids"][0]
    is_pop = int((logits[0, id1] > logits[0, id0]).item())

    return summary, is_pop

trainer.save_model("final_adapter")  
tok.save_pretrained("final_adapter")

if USE_LORA:
    merged = model.merge_and_unload()
else:
    merged = model
merged.save_pretrained("final_merged", safe_serialization=True)
tok.save_pretrained("final_merged")




Validation – Summarization (no-title): {'sum_val_loss': 4.089061260223389, 'sum_val_rouge_rouge1': 0.30888842019002904, 'sum_val_rouge_rouge2': 0.09873294813504559, 'sum_val_rouge_rougeL': 0.21685772552524682, 'sum_val_rouge_rougeLsum': 0.21691444485423914, 'sum_val_runtime': 6265.6342, 'sum_val_samples_per_second': 0.158, 'sum_val_steps_per_second': 0.02, 'epoch': 4.0}
Test – Summarization (no-title): {'sum_test_loss': 4.069072723388672, 'sum_test_rouge_rouge1': 0.3086700392636904, 'sum_test_rouge_rouge2': 0.0965638598713156, 'sum_test_rouge_rougeL': 0.21795977488581375, 'sum_test_rouge_rougeLsum': 0.21797753855264893, 'sum_test_runtime': 6249.2813, 'sum_test_samples_per_second': 0.158, 'sum_test_steps_per_second': 0.02, 'epoch': 4.0}
Validation – Classification (no-title): {'accuracy': 0.12828282828282828, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'pred_pos_rate': 0.8717171717171717}
Test – Classification (no-title): {'accuracy': 0.1191919191919192, 'precision': 0.0, 'recall': 0.0,



('final_merged/tokenizer_config.json',
 'final_merged/special_tokens_map.json',
 'final_merged/vocab.json',
 'final_merged/merges.txt',
 'final_merged/added_tokens.json',
 'final_merged/tokenizer.json')