In [1]:
import pandas as pd
df = pd.read_csv('Final_Populism.csv')
df = df[['Summary', 'Original_Text', 'Article_Title','Is_Populist']]


In [2]:

import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model
import evaluate

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

MODEL_NAME = "facebook/bart-base"   
MAX_SRC = 1024
MAX_SUM = 128
DROP_TITLE_PROB = 0.5  
USE_LORA = True


df['Is_Populist'] = df['Is_Populist'].astype(int).clip(0,1)

def make_prompt(title, text, may_drop=False):
    use_title = bool(title) and (not may_drop or random.random() > DROP_TITLE_PROB)
    prefix = (title.strip() + ". ") if use_title else ""
    return (prefix + (text or "")).strip()

def row_to_examples(row, may_drop=False):
    x_sum = "summarize: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    x_cls = "classify_populism: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    return [
        {"task":"summarize", "input": x_sum, "target": row["Summary"]},
        {"task":"classify",  "input": x_cls, "target": str(int(row["Is_Populist"]))}
    ]

def row_to_examples_no_title(row):
    x_sum = "summarize: " + (row['Original_Text'] or "")
    x_cls = "classify_populism: " + (row['Original_Text'] or "")
    return [
        {"task":"summarize", "input": x_sum, "target": str(row["Summary"]).strip()},
        {"task":"classify",  "input": x_cls, "target": str(int(row["Is_Populist"]))}
    ]
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Is_Populist'])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Is_Populist'])

train_examples = [ex for _, r in train_df.iterrows() for ex in row_to_examples(r, may_drop=True)]


val_examples   = [ex for _, r in val_df.iterrows()  for ex in row_to_examples_no_title(r)]
test_examples  = [ex for _, r in test_df.iterrows() for ex in row_to_examples_no_title(r)]

train_ds = Dataset.from_list(train_examples)
val_ds   = Dataset.from_list(val_examples)
test_ds  = Dataset.from_list(test_examples)

val_sum  = val_ds.filter(lambda ex: ex["task"]=="summarize")
val_cls  = val_ds.filter(lambda ex: ex["task"]=="classify")
test_sum = test_ds.filter(lambda ex: ex["task"]=="summarize")
test_cls = test_ds.filter(lambda ex: ex["task"]=="classify")


tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tok.padding_side = "right"
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token  

def tokenize_fn(batch):
    ins = tok(
        batch["input"], truncation=True, max_length=MAX_SRC, padding="max_length",
    )
    labs = tok(
        text_target=batch["target"], truncation=True, max_length=MAX_SUM, padding="max_length",
    )
    ins["labels"] = labs["input_ids"]    # collator will convert pad -> -100
    return ins


train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
val_sum_tok  = val_sum.map(tokenize_fn, batched=True, remove_columns=val_sum.column_names)
val_cls_tok  = val_cls.map(tokenize_fn, batched=True, remove_columns=val_cls.column_names)
test_sum_tok = test_sum.map(tokenize_fn, batched=True, remove_columns=test_sum.column_names)
test_cls_tok = test_cls.map(tokenize_fn, batched=True, remove_columns=test_cls.column_names)


base = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,       
    attn_implementation="eager"
).to("cuda" if torch.cuda.is_available() else "cpu")

# model = get_peft_model(base, LoraConfig(task_type="SEQ_2_SEQ_LM", r=16, lora_alpha=32, lora_dropout=0.05)) if USE_LORA else base

lora_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","out_proj","fc1","fc2"],
    modules_to_save=["lm_head"], 
)
model = get_peft_model(base, lora_cfg)

collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model, return_tensors="pt")
        

model.gradient_checkpointing_enable()
model.config.use_cache = False
model.train()

rouge = evaluate.load("rouge")
acc   = evaluate.load("accuracy"); prec = evaluate.load("precision")
rec   = evaluate.load("recall");   f1   = evaluate.load("f1")

def _decode(pred_ids, label_ids):
    pred_ids  = np.where(pred_ids  != -100, pred_ids,  tok.pad_token_id)
    label_ids = np.where(label_ids != -100, label_ids, tok.pad_token_id)
    preds  = tok.batch_decode(pred_ids,  skip_special_tokens=True)
    labels = tok.batch_decode(label_ids, skip_special_tokens=True)
    return preds, labels

def summarize_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    r = rouge.compute(predictions=preds, references=labels, use_stemmer=True)
    return {f"rouge_{k}": v for k,v in r.items()}

def classify_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    preds_bin  = [1 if (p.strip() and p.strip()[0]=='1') else 0 for p in preds]
    labels_bin = [1 if (l.strip() and l.strip()[0]=='1') else 0 for l in labels]
    return {
        "accuracy":  acc.compute(predictions=preds_bin, references=labels_bin)["accuracy"],
        "precision": prec.compute(predictions=preds_bin, references=labels_bin)["precision"],
        "recall":    rec.compute(predictions=preds_bin, references=labels_bin)["recall"],
        "f1":        f1.compute(predictions=preds_bin, references=labels_bin)["f1"],
    }

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    output_dir="bart_multitask_lora",
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=32,
    gradient_accumulation_steps=4,

    fp16=True, bf16=False,            
    max_grad_norm=1.0,

    # evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="eval_rougeLsum",
    greater_is_better=True,

    predict_with_generate=True,
    generation_num_beams=5,
    # generation_max_new_tokens=MAX_SUM,

    logging_steps=50,
    dataloader_pin_memory=True,
    gradient_checkpointing=True,
    # optim="adamw_bnb_8bit",
    optim="adamw_torch_fused",
    report_to="none",
    torch_compile=False,
    dataloader_num_workers=2,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_sum_tok,
    processing_class=tok,      # <- use this instead of tokenizer=tok
    data_collator=collator,
    compute_metrics=summarize_metrics
)

trainer.train()


Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/15834 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]



Step,Training Loss
50,10.1161
100,6.163
150,3.0868
200,1.6482
250,1.3156
300,1.1796
350,1.1642
400,1.1442
450,1.129
500,1.1288




TrainOutput(global_step=744, training_loss=2.2419506349871234, metrics={'train_runtime': 7286.2738, 'train_samples_per_second': 6.519, 'train_steps_per_second': 0.102, 'total_flos': 4.117700503771546e+16, 'train_loss': 2.2419506349871234, 'epoch': 3.0})

In [3]:
from transformers.utils import logging
logging.set_verbosity_error()  

from transformers import GenerationConfig




sum_gc = GenerationConfig(
    do_sample=False, num_beams=5,
    max_new_tokens=MAX_SUM,
    length_penalty=1.1,         
    no_repeat_ngram_size=3,     
    min_new_tokens=16            
)


cls_gc = GenerationConfig(do_sample=False, num_beams=1, max_new_tokens=2)

trainer.compute_metrics = summarize_metrics

model.generation_config = sum_gc
print("Validation – Summarization (no-title):",
      trainer.evaluate(eval_dataset=val_sum_tok, metric_key_prefix="sum_val"))

print("Test – Summarization (no-title):",
      trainer.evaluate(eval_dataset=test_sum_tok, metric_key_prefix="sum_test"))

trainer.compute_metrics = classify_metrics
pred_val = trainer.predict(val_cls_tok,  metric_key_prefix="cls_val",  generation_config=cls_gc)
print("Validation – Classification (no-title):",
      classify_metrics((pred_val.predictions, pred_val.label_ids)))

pred_test = trainer.predict(test_cls_tok, metric_key_prefix="cls_test", generation_config=cls_gc)
print("Test – Classification (no-title):",
      classify_metrics((pred_test.predictions, pred_test.label_ids)))

import torch
@torch.no_grad()
def predict_summary_and_label(text: str):
    device = next(model.parameters()).device
    x = text.strip()

    in_sum = tok("summarize: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    out_sum = model.generate(**in_sum)
    summary = tok.decode(out_sum[0], skip_special_tokens=True).strip()

    in_cls = tok("classify_populism: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    out_cls = model.generate(**in_cls, max_new_tokens=2, do_sample=False)
    lab = tok.decode(out_cls[0], skip_special_tokens=True).strip()
    is_pop = 1 if lab.lstrip().startswith("1") else 0

    return summary, is_pop


trainer.save_model("final_adapter")  
tok.save_pretrained("final_adapter")

if USE_LORA:
    merged = model.merge_and_unload()
else:
    merged = model
merged.save_pretrained("final_merged", safe_serialization=True)
tok.save_pretrained("final_merged")




Validation – Summarization (no-title): {'sum_val_loss': 1.8238072395324707, 'sum_val_rouge_rouge1': 0.20916817359746792, 'sum_val_rouge_rouge2': 0.07701058107896677, 'sum_val_rouge_rougeL': 0.17252977570196587, 'sum_val_rouge_rougeLsum': 0.17254445260503692, 'sum_val_runtime': 2217.5431, 'sum_val_samples_per_second': 0.446, 'sum_val_steps_per_second': 0.028, 'epoch': 3.0}
Test – Summarization (no-title): {'sum_test_loss': 1.8614931106567383, 'sum_test_rouge_rouge1': 0.2075064122017521, 'sum_test_rouge_rouge2': 0.07582501786662132, 'sum_test_rouge_rougeL': 0.1708362966991224, 'sum_test_rouge_rougeLsum': 0.170730714017154, 'sum_test_runtime': 2158.1586, 'sum_test_samples_per_second': 0.459, 'sum_test_steps_per_second': 0.029, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Validation – Classification (no-title): {'accuracy': 0.5969696969696969, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test – Classification (no-title): {'accuracy': 0.5969696969696969, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}




('final_merged/tokenizer_config.json',
 'final_merged/special_tokens_map.json',
 'final_merged/vocab.json',
 'final_merged/merges.txt',
 'final_merged/added_tokens.json',
 'final_merged/tokenizer.json')

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

MODEL_DIR = "final_merged"
MAX_SRC = 500
MAX_SUM = 128
device = "cuda" if torch.cuda.is_available() else "cpu"

tok = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=(torch.float16 if torch.cuda.is_available() else torch.float32),
    low_cpu_mem_usage=True,
).to(device).eval()

try:
    model.generation_config = GenerationConfig.from_pretrained(MODEL_DIR)
except Exception:
    model.generation_config = GenerationConfig(do_sample=False, num_beams=1, max_new_tokens=MAX_SUM)

def _to_str(x):
    if isinstance(x, str):
        return x
    if isinstance(x, (list, tuple)):
        return " ".join(map(str, x))
    try:
        return str(x)
    except Exception:
        return " ".join(map(str, x))

# --- Quick test ---
text = "Labour's Séan Sherlock has called for Minister of State Pat Breen to clarify his meetings with David McCourt, a lead bidder on the National Broadband Plan. Allegations of undisclosed meetings, including private visits, have raised concerns about transparency and potential conflicts of interest. Labour demands Fine Gael provide full disclosure on these interactions.","Labour Dáil Communications spokesperson Séan Sherlock has called on Minister of State Pat Breen to make a statement to the Dáil outlining the purpose and content of his meetings with David McCourt following further reports of meetings in the Irish Mail on Sunday, and Sunday Business Post.Labour raised the possibility of Minister of State Breen making a Dáil statement at last Thursday's Business Committee meeting, and will pursue it further next week.Deputy Sherlock said: There is a drip-feed of revelations about the meetings between both the former Minister for Communications with David McCourt, but also the many meetings that have taken place with Minister of State Pat Breen. The Mail on Sunday yesterday revealed that Minister Breen met with Mr McCourt for breakfast before the bidder went on to meet the Minister for Communications in 2016. The Sunday Business Post also revealed there were three previously unreported meetings, with visits on several occasions to the bidder's home in a 'private capacity'. It is now necessary for the Minister of State to make a clear statement to the Dáil, outlining the purposes of all these meetings, and the topics of discussions. It is simply not believable that the National Broadband Plan would not have come up at all considering it is a major state contract that Mr McCourt is leading the bid on. It is time for Fine Gael to come clean on all the meetings that have taken place with the lead bidder on the tender. The Labour Party raised at Thursday's Business Committee the possibility of the Minister making a statement to the Dáil on the matter. Following Sunday's revelations, the Minister must now clarify the many meetings and the nature of the discussions at them.,Breen should address Dáil - FG must come clean on McCourt meetings"
print("text type:", type(text))  # sanity check
summary, is_pop = predict_summary_and_label(text)
print("SUMMARY:\n", summary)
print("IS_POPULIST:", is_pop)


text type: <class 'tuple'>
SUMMARY:
 Labour's Séan Sherlock calls for Minister of State Pat Breen to clarify his meetings
IS_POPULIST: 0
