In [None]:
import os
# Switch off Weights & Biases logging for this session through its
# environment flag; this must be set before the trainer is constructed.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
!pip install -U transformers
import transformers
print("Transformers version:", transformers.__version__)

Transformers version: 4.52.4


In [None]:
!pip install -q -U transformers datasets sentencepiece accelerate


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
import torch
from google.colab import files


In [None]:
# Ask for a CSV through the Colab upload widget and load the first file.
uploaded = files.upload()
if not uploaded:
    # Guard against an empty result (e.g. the dialog was dismissed); without
    # it, next(iter(...)) below fails with an unhelpful bare StopIteration.
    raise RuntimeError("No file was uploaded; re-run this cell and select the training CSV.")
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Validate CSV structure: both text columns are required by preprocessing.
missing_columns = sorted({"input", "output"} - set(df.columns))
if missing_columns:
    raise ValueError(
        f"CSV must contain 'input' and 'output' columns (missing: {missing_columns})"
    )
print(df.head())


Saving risk_data_formatted.csv to risk_data_formatted (5).csv
                                               input  \
0  Data from 35 million traffic stops show that t...   
1  Around 1 in 127 people globally is on the auti...   
2  Data from more than 90,000 nurses studied over...   
3  Data from more than 90,000 nurses studied over...   
4  An estimated 1.7 billion people, 22% of the wo...   

                                              output  
0  Risk communication: 1\nAbsolute risk (base cas...  
1  Risk communication: 1\nAbsolute risk (base cas...  
2  Risk communication: 1\nAbsolute risk (base cas...  
3  Risk communication: 1\nAbsolute risk (base cas...  
4  Risk communication: 1\nAbsolute risk (base cas...  


In [None]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# After the shuffle the frames carry a non-contiguous pandas index, which
# from_pandas would otherwise store as an extra `__index_level_0__` column.
# Only the `input`/`output` columns are wanted in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)


In [None]:
# Checkpoint to fine-tune. Alternatives: "google/flan-t5-small" or "t5-base".
model_name = "t5-small"

# Load the pretrained weights and the matching tokenizer from that checkpoint.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Token budgets; longer sequences are truncated to these lengths.
max_input_length = 512
max_target_length = 512

def preprocess(example):
    """Tokenize one row into encoder inputs and decoder labels.

    Args:
        example: A single dataset row with "input" and "output" fields.
            Both are coerced with ``str`` before tokenization.

    Returns:
        The tokenizer's encoding of the input text (``input_ids`` and
        ``attention_mask``) with an added ``labels`` entry holding the token
        ids of the target text.

    Sequences are truncated but deliberately NOT padded here. Padding labels
    to ``max_length`` fills them with the tokenizer's pad token id, which the
    loss treats as real targets, so most of the loss would come from
    predicting padding. Leaving padding to ``DataCollatorForSeq2Seq`` pads
    each batch to its longest member and fills label padding with -100, which
    the loss ignores.
    """
    model_input = tokenizer(
        text=str(example["input"]),
        truncation=True,
        max_length=max_input_length,
    )
    labels = tokenizer(
        text_target=str(example["output"]),
        truncation=True,
        max_length=max_target_length,
    )
    model_input["labels"] = labels["input_ids"]
    return model_input

# Tokenize both splits, calling preprocess once per example (non-batched).
train_dataset, val_dataset = (
    split.map(preprocess, batched=False)
    for split in (train_dataset, val_dataset)
)


Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration: 10 epochs, evaluation on the validation split
# after every epoch, loss logged every 10 steps, nothing pushed to the Hub.
training_args = TrainingArguments(
    output_dir="./riskcomm_t5",
    do_eval=True,
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    # Disable external experiment trackers explicitly. The WANDB_DISABLED
    # environment variable is deprecated, and this is the replacement the
    # library's own deprecation warning recommends.
    report_to="none",
    push_to_hub=False,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Batch collator for seq2seq training. The settings spelled out here are the
# library defaults: pad each batch to its longest sequence and fill label
# padding with -100 so those positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)


In [None]:
# Wire the model, training configuration, both splits and the collator into
# a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for Trainer and scheduled for removal;
    # `processing_class=` is its replacement and is saved with the model in
    # the same way.
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer built above. As the cell's last
# expression, the returned TrainOutput (global step, mean training loss and
# runtime metrics) is displayed below the per-epoch loss table.
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.8679,0.378216
2,0.3092,0.192464
3,0.2427,0.159746
4,0.1864,0.143881
5,0.145,0.129401
6,0.1381,0.126129
7,0.1448,0.1208
8,0.1294,0.113872
9,0.1446,0.114374
10,0.1294,0.112683


TrainOutput(global_step=380, training_loss=0.3797125192064988, metrics={'train_runtime': 159.3703, 'train_samples_per_second': 9.475, 'train_steps_per_second': 2.384, 'total_flos': 204366120222720.0, 'train_loss': 0.3797125192064988, 'epoch': 10.0})

In [None]:
def predict(text, max_length=512):
    """Generate the fine-tuned model's output for a single input text.

    Args:
        text: The passage to analyse.
        max_length: Upper bound on the generated sequence length, forwarded
            to ``model.generate``. Defaults to 512, the value previously
            hard-coded here.

    Returns:
        The decoded generation as a string, with special tokens removed.
    """
    # Inference should run without dropout; the model may still be in
    # training mode depending on what ran before this call.
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # Tensors must live on the same device as the model's weights.
    device = model.device
    output_ids = model.generate(
        input_ids=encoded["input_ids"].to(device),
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=encoded["attention_mask"].to(device),
        max_length=max_length,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage
# Adjacent string literals are concatenated at compile time. Each piece ends
# with an explicit space so the sentences are not glued together (the earlier
# backslash continuations produced "10%.While" and "33 in1,000").
example_text = (
    "Living in a city with high air pollution increases your risk of lung disease by 10%. "
    "While that may seem minor, it translates to an increase from 30 in 1,000 to 33 in "
    "1,000 people developing chronic respiratory issues over a decade."
)
print(predict(example_text))

Risk communication: 1 Absolute risk (base case): 33.1 Absolute risk (new case): null Absolute number (base case): null Absolute number (new case): null Absolute risk difference: null Relative risk difference: null Absolute number difference: null Verbal risk descriptor (base case): null Verbal risk descriptor (new situation): null Verbal risk descriptor (change from base to new): null Reference class size (base case: absolute number): null Reference class size (new case: absolute number): null Reference class description (base case): people living in a city with high air pollution Reference class description (new case): people living in a city with high air pollution Source (base case): null Source (new situation): null Topic and unit: Risk of lung disease by air pollution in %
Risk communication: 1 Absolute risk (base case): 33.1 Absolute risk (new case): null Absolute number (base case): null Absolute number (new case): null Absolute risk difference: null Relative risk difference: nu

In [None]:
# Persist the fine-tuned model and the tokenizer into the same directory.
# The tokenizer call is the cell's last expression, so the tuple of file
# paths it wrote is what gets displayed as the cell output.
trainer.save_model("./riskcomm_t5_model")
tokenizer.save_pretrained("./riskcomm_t5_model")


('./riskcomm_t5_model/tokenizer_config.json',
 './riskcomm_t5_model/special_tokens_map.json',
 './riskcomm_t5_model/spiece.model',
 './riskcomm_t5_model/added_tokens.json')