In [1]:
import pandas as pd
df = pd.read_csv('Final_Populism.csv')

In [2]:
df

Unnamed: 0,Summary_Embeddings,Summary,Original_Text,Article_Title,Is_Populist
0,"[-0.029149754, -0.021904264, 0.0021603003, -0....",Donnchadh Ó Laoghaire of Sinn Féin criticized ...,"Sinn Féin spokesperson on Social Protection, D...",No sense in government’s all-in approach to EU...,0
1,"[0.019964693, 0.010486043, 0.027138552, 0.0468...","Donnchadh Ó Laoghaire, Sinn Féin's spokesperso...",Lack of space in our schools must be addressed...,Lack of space in our schools must be addressed...,0
2,"[-0.011785262, -0.005118389, -0.028711798, 0.0...","Donnchadh Ó Laoghaire, a Sinn Féin TD, urged t...",We must safeguard the future of League of Irel...,We must safeguard the future of League of Irel...,0
3,"[-0.046097253, -0.027319714, 0.007871007, -0.0...","Donáth Anna, a Momentum EP-képviselője, az Eur...","Szerző: Momentum Mozgalom | máj 31, 2023 | Mé...",Donáth Anna: Európa a megoldás a magyar oktatá...,0
4,"[-0.055969328, 0.08520521, 0.015029931, 0.0698...",Douglas Hoyos criticizes the Austrian governme...,"Douglas Hoyos: ""Eine Taskforce einzuführen ist...",Douglas Hoyos: Eine Taskforce einzuführen ist ...,0
...,...,...,...,...,...
9892,"[-0.057788845, 0.0046745464, -0.011104433, 0.0...",The text highlights growing protests against E...,Nu de Europese top van regeringsleiders op 14 ...,Veel protestacties rond Euro-Top,1
9893,"[-0.027253903, -0.0062221605, -0.051059105, 0....","The text highlights support for the PVV party,...",PVV +2 zetels!Nederlanders willen hun eigen la...,PVV +2 ZETELS!!,1
9894,"[-0.04918098, 0.00316158, -0.027780691, 0.0190...",The text highlights the Dutch government's mis...,Tienduizenden mensen zijn het slachtoffer gewo...,Aftreden kabinet maakt weg vrij voor een eerli...,1
9895,"[-0.054796062, 0.019674687, -0.033823974, 0.03...",The text highlights the ELAM party's oppositio...,Στις 8 Ιουλίου του 2018 ΔΗΣΥ και ΔΗΚΟ ψήφισαν ...,Άρχισαν οι ηλεκτρονικές εκποιήσεις – Το ΕΛΑΜ σ...,1


In [2]:
df = df[['Summary', 'Original_Text', 'Article_Title','Is_Populist']]

In [3]:

import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model
import evaluate

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

MODEL_NAME = "facebook/bart-base"   
MAX_SRC = 500
MAX_SUM = 128
DROP_TITLE_PROB = 0.5  
USE_LORA = True


df['Is_Populist'] = df['Is_Populist'].astype(int).clip(0,1)

def make_prompt(title, text, may_drop=False):
    use_title = bool(title) and (not may_drop or random.random() > DROP_TITLE_PROB)
    prefix = (title.strip() + ". ") if use_title else ""
    return (prefix + (text or "")).strip()

def row_to_examples(row, may_drop=False):
    x_sum = "summarize: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    x_cls = "classify_populism: " + make_prompt(row['Article_Title'], row['Original_Text'], may_drop=may_drop)
    return [
        {"task":"summarize", "input": x_sum, "target": row["Summary"]},
        {"task":"classify",  "input": x_cls, "target": str(int(row["Is_Populist"]))}
    ]

from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Is_Populist'])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Is_Populist'])

train_examples = [ex for _, r in train_df.iterrows() for ex in row_to_examples(r, may_drop=True)]
val_examples   = [ex for _, r in val_df.iterrows()   for ex in row_to_examples(r, may_drop=False)]
test_examples  = [ex for _, r in test_df.iterrows()  for ex in row_to_examples(r, may_drop=False)]

train_ds = Dataset.from_list(train_examples)
val_ds   = Dataset.from_list(val_examples)
test_ds  = Dataset.from_list(test_examples)

val_sum  = val_ds.filter(lambda ex: ex["task"]=="summarize")
val_cls  = val_ds.filter(lambda ex: ex["task"]=="classify")
test_sum = test_ds.filter(lambda ex: ex["task"]=="summarize")
test_cls = test_ds.filter(lambda ex: ex["task"]=="classify")

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tok.padding_side = "right"  


tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tok.padding_side = "right"
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token  # safety (Bart usually already has pad=1)

def tokenize_fn(batch):
    ins = tok(
        batch["input"], truncation=True, max_length=MAX_SRC, padding="max_length",
    )
    labs = tok(
        text_target=batch["target"], truncation=True, max_length=MAX_SUM, padding="max_length",
    )
    ins["labels"] = labs["input_ids"]    # collator will convert pad -> -100
    return ins


train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
val_sum_tok  = val_sum.map(tokenize_fn, batched=True, remove_columns=val_sum.column_names)
val_cls_tok  = val_cls.map(tokenize_fn, batched=True, remove_columns=val_cls.column_names)
test_sum_tok = test_sum.map(tokenize_fn, batched=True, remove_columns=test_sum.column_names)
test_cls_tok = test_cls.map(tokenize_fn, batched=True, remove_columns=test_cls.column_names)


base = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,          # keep fp32 on T1000 for stability
    attn_implementation="eager"
).to("cuda" if torch.cuda.is_available() else "cpu")

model = get_peft_model(base, LoraConfig(task_type="SEQ_2_SEQ_LM", r=16, lora_alpha=32, lora_dropout=0.05)) if USE_LORA else base
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model)
    
    
rouge = evaluate.load("rouge")
acc   = evaluate.load("accuracy"); prec = evaluate.load("precision")
rec   = evaluate.load("recall");   f1   = evaluate.load("f1")

def _decode(pred_ids, label_ids):
    pred_ids  = np.where(pred_ids  != -100, pred_ids,  tok.pad_token_id)
    label_ids = np.where(label_ids != -100, label_ids, tok.pad_token_id)
    preds  = tok.batch_decode(pred_ids,  skip_special_tokens=True)
    labels = tok.batch_decode(label_ids, skip_special_tokens=True)
    return preds, labels

def summarize_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    r = rouge.compute(predictions=preds, references=labels, use_stemmer=True)
    return {f"rouge_{k}": v for k,v in r.items()}

def classify_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    preds, labels = _decode(pred_ids, label_ids)
    preds_bin  = [1 if (p.strip() and p.strip()[0]=='1') else 0 for p in preds]
    labels_bin = [1 if (l.strip() and l.strip()[0]=='1') else 0 for l in labels]
    return {
        "accuracy":  acc.compute(predictions=preds_bin, references=labels_bin)["accuracy"],
        "precision": prec.compute(predictions=preds_bin, references=labels_bin)["precision"],
        "recall":    rec.compute(predictions=preds_bin, references=labels_bin)["recall"],
        "f1":        f1.compute(predictions=preds_bin, references=labels_bin)["f1"],
    }

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    output_dir="bart_multitask_lora",
    learning_rate=2e-4,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=32,
    gradient_accumulation_steps=4,

    fp16=False, bf16=False,             # stay in fp32 for stability first
    max_grad_norm=1.0,

    # evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="eval_rougeLsum",
    greater_is_better=True,

    predict_with_generate=True,
    generation_num_beams=1,
    # generation_max_new_tokens=MAX_SUM,

    logging_steps=50,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,
    optim="adamw_torch_fused",
    report_to="none",
    torch_compile=False,
    dataloader_num_workers=2,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_sum_tok,
    processing_class=tok,      # <- use this instead of tokenizer=tok
    data_collator=collator,
    compute_metrics=summarize_metrics
)

trainer.train()


Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/15834 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

trainable params: 884,736 || all params: 140,305,152 || trainable%: 0.6306


Step,Training Loss
50,10.137
100,6.4134
150,5.1637
200,4.9775
250,4.9172
300,4.8911
350,4.8371
400,4.8155
450,4.7928


[train] bad_inputs: 0, bad_labels: 0
[val_sum] bad_inputs: 0, bad_labels: 0
[val_cls] bad_inputs: 0, bad_labels: 0
[test_sum] bad_inputs: 0, bad_labels: 0
[test_cls] bad_inputs: 0, bad_labels: 0


In [4]:
from transformers.utils import logging
logging.set_verbosity_error()  

from transformers import GenerationConfig

model.generation_config = GenerationConfig(
    do_sample=False,
    num_beams=1,
    max_new_tokens=MAX_SUM,
)
if hasattr(model.generation_config, "early_stopping"):
    model.generation_config.early_stopping = None

trainer.compute_metrics = summarize_metrics
print("Validation – Summarization:",
      trainer.evaluate(eval_dataset=val_sum_tok, metric_key_prefix="sum_val"))

trainer.compute_metrics = classify_metrics
print("Validation – Classification:",
      trainer.evaluate(eval_dataset=val_cls_tok, metric_key_prefix="cls_val"))

trainer.compute_metrics = summarize_metrics
print("Test – Summarization:",
      trainer.evaluate(eval_dataset=test_sum_tok, metric_key_prefix="sum_test"))

trainer.compute_metrics = classify_metrics
print("Test – Classification:",
      trainer.evaluate(eval_dataset=test_cls_tok, metric_key_prefix="cls_test"))

import torch
@torch.no_grad()
def predict_summary_and_label(text: str):
    device = next(model.parameters()).device
    x = text.strip()

    in_sum = tok("summarize: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    out_sum = model.generate(**in_sum)
    summary = tok.decode(out_sum[0], skip_special_tokens=True).strip()

    in_cls = tok("classify_populism: " + x, return_tensors="pt",
                 truncation=True, max_length=MAX_SRC).to(device)
    out_cls = model.generate(**in_cls, max_new_tokens=2, do_sample=False)
    lab = tok.decode(out_cls[0], skip_special_tokens=True).strip()
    is_pop = 1 if lab.lstrip().startswith("1") else 0

    return summary, is_pop


trainer.save_model("final_adapter")  
tok.save_pretrained("final_adapter")

if USE_LORA:
    merged = model.merge_and_unload()
else:
    merged = model
merged.save_pretrained("final_merged", safe_serialization=True)
tok.save_pretrained("final_merged")


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


one-batch loss: 4.670574188232422
grad present: True


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Validation – Summarization: {'sum_val_loss': 3.531428098678589, 'sum_val_rouge_rouge1': 0.2053850133484431, 'sum_val_rouge_rouge2': 0.07275916064360691, 'sum_val_rouge_rougeL': 0.16965470680176922, 'sum_val_rouge_rougeLsum': 0.16956278737760228, 'sum_val_runtime': 308.1543, 'sum_val_samples_per_second': 3.213, 'sum_val_steps_per_second': 3.213, 'epoch': 1.0}


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Validation – Classification: {'cls_val_loss': 4.799562931060791, 'cls_val_accuracy': 0.592929292929293, 'cls_val_precision': 0.375, 'cls_val_recall': 0.015037593984962405, 'cls_val_f1': 0.02891566265060241, 'cls_val_runtime': 217.3602, 'cls_val_samples_per_second': 4.555, 'cls_val_steps_per_second': 4.555, 'epoch': 1.0}


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Test – Summarization: {'sum_test_loss': 3.4697511196136475, 'sum_test_rouge_rouge1': 0.19782892962792112, 'sum_test_rouge_rouge2': 0.07077417391054694, 'sum_test_rouge_rougeL': 0.16338271928988854, 'sum_test_rouge_rougeLsum': 0.16331683595861513, 'sum_test_runtime': 321.6317, 'sum_test_samples_per_second': 3.078, 'sum_test_steps_per_second': 3.078, 'epoch': 1.0}


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Test – Classification: {'cls_test_loss': 4.799356937408447, 'cls_test_accuracy': 0.5909090909090909, 'cls_test_precision': 0.3125, 'cls_test_recall': 0.012531328320802004, 'cls_test_f1': 0.024096385542168676, 'cls_test_runtime': 221.5064, 'cls_test_samples_per_second': 4.469, 'cls_test_steps_per_second': 4.469, 'epoch': 1.0}




('final_merged/tokenizer_config.json',
 'final_merged/special_tokens_map.json',
 'final_merged/vocab.json',
 'final_merged/merges.txt',
 'final_merged/added_tokens.json',
 'final_merged/tokenizer.json')

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

MODEL_DIR = "final_merged"
MAX_SRC = 500
MAX_SUM = 128
device = "cuda" if torch.cuda.is_available() else "cpu"

tok = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=(torch.float16 if torch.cuda.is_available() else torch.float32),
    low_cpu_mem_usage=True,
).to(device).eval()

try:
    model.generation_config = GenerationConfig.from_pretrained(MODEL_DIR)
except Exception:
    model.generation_config = GenerationConfig(do_sample=False, num_beams=1, max_new_tokens=MAX_SUM)

def _to_str(x):
    if isinstance(x, str):
        return x
    if isinstance(x, (list, tuple)):
        return " ".join(map(str, x))
    try:
        return str(x)
    except Exception:
        return " ".join(map(str, x))

# --- Quick test ---
text = "Labour's Séan Sherlock has called for Minister of State Pat Breen to clarify his meetings with David McCourt, a lead bidder on the National Broadband Plan. Allegations of undisclosed meetings, including private visits, have raised concerns about transparency and potential conflicts of interest. Labour demands Fine Gael provide full disclosure on these interactions.","Labour Dáil Communications spokesperson Séan Sherlock has called on Minister of State Pat Breen to make a statement to the Dáil outlining the purpose and content of his meetings with David McCourt following further reports of meetings in the Irish Mail on Sunday, and Sunday Business Post.Labour raised the possibility of Minister of State Breen making a Dáil statement at last Thursday's Business Committee meeting, and will pursue it further next week.Deputy Sherlock said: There is a drip-feed of revelations about the meetings between both the former Minister for Communications with David McCourt, but also the many meetings that have taken place with Minister of State Pat Breen. The Mail on Sunday yesterday revealed that Minister Breen met with Mr McCourt for breakfast before the bidder went on to meet the Minister for Communications in 2016. The Sunday Business Post also revealed there were three previously unreported meetings, with visits on several occasions to the bidder's home in a 'private capacity'. It is now necessary for the Minister of State to make a clear statement to the Dáil, outlining the purposes of all these meetings, and the topics of discussions. It is simply not believable that the National Broadband Plan would not have come up at all considering it is a major state contract that Mr McCourt is leading the bid on. It is time for Fine Gael to come clean on all the meetings that have taken place with the lead bidder on the tender. The Labour Party raised at Thursday's Business Committee the possibility of the Minister making a statement to the Dáil on the matter. Following Sunday's revelations, the Minister must now clarify the many meetings and the nature of the discussions at them.,Breen should address Dáil - FG must come clean on McCourt meetings"
print("text type:", type(text))  # sanity check
summary, is_pop = predict_summary_and_label(text)
print("SUMMARY:\n", summary)
print("IS_POPULIST:", is_pop)


text type: <class 'tuple'>
SUMMARY:
 Labour's Séan Sherlock calls for Minister of State Pat Breen to clarify his meetings
IS_POPULIST: 0
