In [None]:
# Mount Google Drive into the Colab runtime so that files stored on Drive are
# reachable below /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# install packages
! pip install transformers datasets
! pip install sentencepiece
! pip install transformers[torch]
! pip install xformers

In [None]:
# Log in to the Hugging Face Hub. notebook_login() renders an interactive
# widget in the cell output (the VBox shown below).
# NOTE(review): presumably required for the push_to_hub calls later in this
# notebook — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the CSV file that holds the data used for fine-tuning and select its
# 'train' split right away, so `right_dataset` is bound exactly once.
from datasets import load_dataset

right_dataset = load_dataset(
    'csv',
    data_files='add_path_to_dataset/allsides_data_right_heading.csv',
)['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Split the data into a training and a test split (80 % / 20 %). The fixed
# seed makes the shuffle, and therefore the split, identical on every re-run.
right_dataset = right_dataset.train_test_split(test_size=0.2, seed=42)

## Preprocess

In [None]:
# Load the tokenizer of the pretrained checkpoint that is going to be
# fine-tuned ("google/pegasus-large").
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [None]:
# Tokenization step that is mapped over the dataset.
def preprocess_function(examples):
    """Tokenize the "heading" column of ``examples`` with the global ``tokenizer``.

    Only ``examples["heading"]`` is read; change that key to fine-tune on a
    different column. The tokenizer's output is returned unchanged.
    """
    headings = examples["heading"]
    encoded = tokenizer(headings)
    return encoded

In [None]:
# Tokenize both splits of the dataset. The original columns are removed so
# that only what preprocess_function returns is kept in the result.
tokenized_right_dataset = right_dataset.map(
    preprocess_function,
    batched=True,  # preprocess_function is called with batches of rows
    num_proc=4,  # number of processes used for the mapping
    remove_columns=right_dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/6464 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1617 [00:00<?, ? examples/s]

In [None]:
from itertools import chain

# Number of tokens per chunk after concatenation.
block_size = 32


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: Mapping from feature name to a list of per-example lists
            (one inner list per text). Must contain the key "input_ids".

    Returns:
        A dict with the same keys as ``examples`` plus "labels". Each value is
        a list of chunks of ``block_size`` items taken from the concatenation
        of all inner lists. "labels" is a shallow copy of the "input_ids"
        chunk list.

    Notes:
        If the concatenation is at least ``block_size`` long, the trailing
        remainder that does not fill a whole chunk is dropped. If it is
        shorter than ``block_size``, it is kept as one single short chunk.
    """
    # Flatten each feature in linear time. sum(lists, []) builds a new list
    # for every inner list and is therefore quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All features are assumed to have the same total length, so the first
    # one is used as the reference.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a multiple of block_size (drops the remainder).
        total_length = (total_length // block_size) * block_size
    # Split every feature into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply the concatenation/chunking (group_texts) to both splits of the
# tokenized dataset.
lm_dataset = tokenized_right_dataset.map(group_texts, batched=True, num_proc=4)
# To inspect one grouped example, evaluate lm_dataset['train'][0] in its own cell.

Map (num_proc=4):   0%|          | 0/6464 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1617 [00:00<?, ? examples/s]

In [None]:
# Import the Trainer classes and the data collator used for fine-tuning.
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Only fall back to EOS as the padding token when the tokenizer has no pad
# token of its own. With mlm=False the collator excludes every position that
# holds the pad token id from the loss, so unconditionally aliasing pad to EOS
# would also remove all EOS tokens from the training labels.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Data collator for causal language modeling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Train

In [None]:
# Load the pretrained checkpoint which should be fine-tuned, using the
# causal-LM class and with cross-attention disabled.
from transformers import PegasusForCausalLM

model = PegasusForCausalLM.from_pretrained("google/pegasus-large", add_cross_attention=False)
# Prints the config's is_decoder flag followed by a reminder that it has to be
# True; the recorded output below shows True.
print(model.config.is_decoder, f"{model.__class__} has to be configured as a decoder.")

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForCausalLM were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

True <class 'transformers.models.pegasus.modeling_pegasus.PegasusForCausalLM'> has to be configured as a decoder.


In [None]:
# Fine-tune the pretrained model on the grouped dataset.
training_args = TrainingArguments(
    output_dir="pegasus_right_heading_torch",  # the recorded uploads go to a Hub repo of the same name
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,  # enable uploading to the Hugging Face Hub
    num_train_epochs=20
)

# The Trainer trains on the "train" split and evaluates on the "test" split;
# batches are assembled by the causal-LM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Evaluate on the test split and report the perplexity, i.e. the exponential
# of the evaluation loss.
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 12.43


In [None]:
# Push the fine-tuned model to the Hugging Face Hub repository. The call
# returns the URL of the created commit (shown at the end of the output).
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/1.37G [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/4.06k [00:00<?, ?B/s]

Upload file runs/Jul27_15-10-10_61035d95a6f3/events.out.tfevents.1690470691.61035d95a6f3.3998.1:   0%|        …

Upload file runs/Jul27_15-10-10_61035d95a6f3/events.out.tfevents.1690470615.61035d95a6f3.3998.0:   0%|        …

To https://huggingface.co/tobijen/pegasus_right_heading_torch
   a7621a6..5c7fb66  main -> main

   a7621a6..5c7fb66  main -> main

To https://huggingface.co/tobijen/pegasus_right_heading_torch
   5c7fb66..0ae1a0f  main -> main

   5c7fb66..0ae1a0f  main -> main



'https://huggingface.co/tobijen/pegasus_right_heading_torch/commit/5c7fb661bb9f3f73beb09935c7fdecba5c014186'

In [None]:
# Save the model to a local directory (replace the placeholder path).
# NOTE(review): the recorded output below shows this call also uploading to
# the Hub repository — presumably because push_to_hub=True is set in the
# training arguments; confirm.
trainer.save_model("path_to_where_the_model_should_be_saved_locally/pegasus_right_heading_torch")

Upload file runs/Jul28_14-35-43_208d398e2820/events.out.tfevents.1690555356.208d398e2820.1892.0: 100%|########…

Upload file runs/Jul28_14-35-43_208d398e2820/events.out.tfevents.1690555500.208d398e2820.1892.1: 100%|########…

To https://huggingface.co/tobijen/pegasus_right_heading_torch
   1bfd5f0..db3544a  main -> main

   1bfd5f0..db3544a  main -> main

To https://huggingface.co/tobijen/pegasus_right_heading_torch
   db3544a..5f7ebc2  main -> main

   db3544a..5f7ebc2  main -> main



## Inference

In [None]:
# Prompts to generate text for, keyed as "<topic>_<n>". The keys are reused
# as the keys of the dictionary that collects the generated texts.
prompts = {
    # social issues
    "social_1": "Gay marriage is",
    "social_2": "Abortion is",
    # education
    "education_1": "Public education is",
    "education_2": "Charter schools are",
    # economy
    "economic_1": "The increase of taxes is",
    "economic_2": "Government regulations on businesses are",
}

In [None]:
from transformers import pipeline

# Build the text-generation pipeline once. Constructing it loads the model,
# so creating it inside the loop would reload the model for every prompt.
generator = pipeline("text-generation", model="tobijen/pegasus_right_heading_torch", tokenizer=tokenizer)

# Generate a continuation for every prompt and store the full text
# (prompt + continuation, because return_full_text=True) under the prompt's key.
generated_text_dict = {}
for key, prompt in prompts.items():
  print(key, " => ", prompt)
  generated_text = generator(prompt, return_full_text=True, max_new_tokens=50)
  generated_text_dict[key] = generated_text[0]["generated_text"]
  print(generated_text)

social_1  =>  Gay marriage is


Downloading (…)lve/main/config.json:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

[{'generated_text': 'Gay marriage is crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis'}]
social_2  =>  Abortion is
[{'generated_text': 'Abortion is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is'}]
education_1  =>  Public education is
[{'generated_text': 'Public education is to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to'}]
education_2  =>  Charter schools are
[{'generated_text': 'Charter schools are Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charte

In [None]:
# Display the collected generations (prompt key -> generated text).
generated_text_dict

{'social_1': 'Gay marriage is crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis crisis',
 'social_2': 'Abortion is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is',
 'education_1': 'Public education is to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to',
 'education_2': 'Charter schools are Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Charter Cha

In [None]:
# Add the generated text to a JSON file, which is used for the evaluation.
import json
import os

def write_dict_to_json_file(file_path, data):
    """Merge ``data`` into the JSON object stored at ``file_path``.

    Top-level keys already present in the file are overwritten by the values
    in ``data``; all other existing keys are kept. A missing or empty file is
    treated as an empty object, and a missing parent directory is created.

    Args:
        file_path: Path of the JSON file to create or update.
        data: Dictionary whose items are merged into the file's top-level
            object.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
    """
    # Read first and handle a missing file afterwards; this avoids the race
    # between an existence check and the subsequent open().
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
    except FileNotFoundError:
        raw_text = ''

    # An empty (or whitespace-only) file is not valid JSON; start from an
    # empty dictionary instead of failing.
    existing_data = json.loads(raw_text) if raw_text.strip() else {}

    # Update the existing dictionary with the new data.
    existing_data.update(data)

    # Make sure the target directory exists before writing.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the updated dictionary back to the JSON file. ensure_ascii=False
    # keeps non-ASCII characters readable in the file.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(existing_data, file, ensure_ascii=False)

# Nest the generations under a model-specific key, so that the dict.update in
# write_dict_to_json_file only overwrites this key in the JSON file.
data_to_append = {
    "pegasus_right_heading": generated_text_dict
}

# Merge the entry into the JSON file (replace the placeholder path).
write_dict_to_json_file('path_to_json_file/generated_texts.json', data_to_append)