## CLIMATEBERT: A Pretrained Language Model for Climate-Related Text
#### by Nicolas Webersinke, Mathias Kraus, Julia Anna Bingler, and Markus Leippold
#### Link to paper: [arxiv.org/abs/2110.12010](https://arxiv.org/abs/2110.12010)
#### Code Part 2: Language model training

Import libraries and empty GPU cache (if applicable)

In [1]:
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForPreTraining
from transformers import DataCollatorForLanguageModeling

from datasets import load_dataset

import torch
# torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


## Climate-FEVER data prep (auto-inserted)
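This cell uses the Hugging Face dataset `tdiggelm/climate_fever` (the repository only provides a `test` split).
It shuffles the examples, splits them 85%/15% into `train`/`validation`, and writes them to:

- `corpus/train_corpus.txt`
- `corpus/val_corpus.txt`

It also writes `fine_tuning_texts.txt` for any later cells that need it.

Note: each example is written as three lines (`Claim: …`, `Evidence: …`, `Label: …`), so the line-based `text` loader used below yields three rows per example (3912 = 3 × 1304 training rows).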


In [2]:
# === Prepare train/val files from Hugging Face `tdiggelm/climate_fever` ===
from datasets import load_dataset
from pathlib import Path
import random

# 1) Fetch Climate-FEVER from the Hugging Face Hub; "test" is the only split requested here.
ds = load_dataset(path="tdiggelm/climate_fever", split="test")

# 2) Convert each example to a paragraph: Claim + Evidences + Label
def to_paragraph(ex):
    """Render one example as a three-line "Claim / Evidence / Label" string.

    Args:
        ex: Mapping read via ``.get`` for the keys ``"claim"``,
            ``"evidences"`` and ``"claim_label"``; missing keys fall back to
            an empty string / empty list.

    Returns:
        ``"Claim: <claim>\\nEvidence: <evidence texts joined by spaces>\\nLabel: <label>"``.
    """
    # NOTE(review): the result contains two newlines. The files built from it
    # are later read with the line-based "text" loader, which appears to turn
    # every example into three rows (including a bare "Label: ..." row) --
    # confirm this is intended.
    raw_evidences = ex.get("evidences", [])
    if not isinstance(raw_evidences, list):
        # A single evidence item is treated as a one-element list.
        raw_evidences = [raw_evidences]

    def _evidence_text(item):
        # Dict items are probed under several candidate keys; anything else is stringified.
        if not isinstance(item, dict):
            return str(item)
        return item.get("evidence") or item.get("evidence_text") or item.get("sentence") or ""

    # Empty texts are dropped so they do not add stray spaces to the joined evidence.
    evidence_texts = [text for text in map(_evidence_text, raw_evidences) if text]

    return "Claim: {}\nEvidence: {}\nLabel: {}".format(
        str(ex.get("claim", "")),
        " ".join(evidence_texts),
        str(ex.get("claim_label", "")),
    )

# Render every dataset row as text.
lines = list(map(to_paragraph, ds))

# 3) Deterministic shuffle (fixed seed), then an 85/15 train/validation split.
random.seed(42)
random.shuffle(lines)
cut = int(0.85 * len(lines))
train_lines = lines[:cut]
val_lines = lines[cut:]

# 4) Write the two corpus files read by the loading cell further down.
out_dir = Path("corpus")
out_dir.mkdir(parents=True, exist_ok=True)
for file_name, rows in (("train_corpus.txt", train_lines), ("val_corpus.txt", val_lines)):
    (out_dir / file_name).write_text("\n".join(rows) + "\n", encoding="utf-8")

# 5) Also dump the full (shuffled) set in case a later cell wants it.
Path("fine_tuning_texts.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")

print(f"Prepared Climate-FEVER: total={len(lines)}, train={len(train_lines)}, val={len(val_lines)}")


Prepared Climate-FEVER: total=1535, train=1304, val=231


Load dataset via Hugging Face datasets

In [3]:
# Text files holding the training corpus (selected or not) and the validation corpus.
corpus_files = {
    "train": 'corpus/train_corpus.txt',
    "validation": 'corpus/val_corpus.txt',
}
# NOTE: the name `datasets` is also the name of the Hugging Face package;
# only `load_dataset` is imported from it in this notebook.
datasets = load_dataset("text", data_files=corpus_files)

Generating train split: 3912 examples [00:00, 135720.93 examples/s]
Generating validation split: 693 examples [00:00, 60406.76 examples/s]


Print size of dataset

In [4]:
# Row count of each split, train first.
for split_name in ("train", "validation"):
    print(len(datasets[split_name]))

3912
693


Load the language model and the tokenizer from the augmentation

In [5]:
# Path of the checkpoint to continue training from; the tokenizer and the
# model are both loaded from this same location.
card = "model/distilroberta-base-augmented"
tokenizer = AutoTokenizer.from_pretrained(card, use_fast=True)
model = AutoModelForPreTraining.from_pretrained(card)

Make sure the model is resized correctly

In [6]:
# Resize the model's token embeddings to the tokenizer's length; the returned
# embedding module is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 768, padding_idx=1)

Define tokenize function

In [7]:
def tokenize_function(samples):
    """Tokenize the "text" column of a batch with the notebook-level `tokenizer`.

    Truncation is enabled. Note that a later cell defines another
    `tokenize_function` that takes the tokenizer as a second argument; once
    that cell runs, it replaces this definition.
    """
    texts = samples["text"]
    return tokenizer(texts, truncation=True)

Perform tokenization

In [8]:
from transformers import AutoTokenizer

# Tokenize with the tokenizer that was loaded together with `model` from the
# same checkpoint (`card`, cell 5). Loading a different tokenizer here would
# pair `model` with a vocabulary other than the one its embeddings were resized
# for in cell 6. To train from another checkpoint (e.g. "roberta-base" or
# "distilroberta-base"), change `card` in cell 5 so that model and tokenizer
# are reloaded together.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    # Token ids beyond the embedding matrix would fail during training.
    raise ValueError(
        "Tokenizer vocabulary is larger than the model's embedding matrix; "
        "re-run the model-loading and resize cells before tokenizing."
    )

def tokenize_function(batch, tokenizer):
    """Tokenize the "text" column of a batch with the given tokenizer.

    Args:
        batch: Mapping with a "text" entry (change the key here if the text
            column has a different name).
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True, max_length=256)``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=256)
    return encoded

map_options = dict(
    batched=True,
    num_proc=4,                          # modest parallelism; keep it small on Windows
    remove_columns=["text"],             # keep only the tokenizer outputs
    fn_kwargs={"tokenizer": tokenizer},  # hand the tokenizer to the worker processes explicitly
)
tokenized_datasets = datasets.map(tokenize_function, **map_options)


Map (num_proc=4): 100%|██████████| 3912/3912 [00:08<00:00, 440.61 examples/s]
Map (num_proc=4): 100%|██████████| 693/693 [00:08<00:00, 80.94 examples/s]


Init data collator for masked language modeling

In [9]:
# Collator for masked language modeling (mlm=True) with a masking probability
# of 0.15; it uses whichever `tokenizer` is bound when this cell runs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

Define training args

In [10]:
import inspect

from transformers.training_args import TrainingArguments

# The evaluation-schedule argument is called `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones. Pick the name
# that the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
EVAL_STRATEGY_ARG = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# All options go through the constructor so that TrainingArguments can validate
# and normalise them. Assigning `evaluation_strategy` / `load_best_model_at_end`
# as attributes afterwards skips that step, and on releases where the field is
# named `eval_strategy` the assignment is ignored, so no evaluation would run
# and no best checkpoint would ever be selected.
training_args = TrainingArguments(
    output_dir="model/xyz",
    overwrite_output_dir=False,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=False,                  # keep False without a compatible GPU
    dataloader_num_workers=0,    # 0-2 recommended on Windows; 0 is the safest start
    report_to="none",
    logging_steps=50,
    # Evaluation and checkpointing must share the same schedule (and not be
    # "no") for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # lower is better, hence greater_is_better=False
    greater_is_better=False,
    **{EVAL_STRATEGY_ARG: "epoch"},
)

# The strategies may be stored as enum members; print their plain values.
_eval_strategy = getattr(training_args, EVAL_STRATEGY_ARG)
_save_strategy = training_args.save_strategy
print("OK: TrainingArguments created with",
      getattr(_eval_strategy, "value", _eval_strategy),
      getattr(_save_strategy, "value", _save_strategy))




OK: TrainingArguments created with epoch epoch


Init trainer

In [11]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword (presumably the warning this cell
# emitted); older releases only know `tokenizer`. Use whichever keyword the
# installed Trainer accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    **{_tokenizer_arg: tokenizer},
)

  trainer = Trainer(


Start training and evaluate/save (optional)

In [12]:
# Run the training loop configured by `training_args`.
trainer.train()



Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the validation split passed to the Trainer; the result is shown as the cell output.
trainer.evaluate()

In [None]:
# Save the model to the same directory that is used as `output_dir` in the training arguments.
trainer.save_model("model/xyz")

In [None]:
import transformers, inspect, sys
import importlib, sys

# Import from the defining modules instead of the top-level `transformers` aliases.
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import IntervalStrategy   # module where IntervalStrategy is defined

# Diagnostics: which transformers build is installed, where TrainingArguments
# comes from, and whether its constructor still takes `evaluation_strategy`.
ta_module_name = TrainingArguments.__module__
init_params = inspect.signature(TrainingArguments.__init__).parameters

print("transformers =", transformers.__version__)
print("TrainingArguments from module:", ta_module_name)
print("TrainingArguments file:", sys.modules[ta_module_name].__file__)
print("has evaluation_strategy:", "evaluation_strategy" in init_params)
print("TrainingArguments object:", TrainingArguments)
print("module:", ta_module_name)
mod = sys.modules.get(ta_module_name)
print("module file:", getattr(mod, "__file__", None))


