In [None]:
# mount Google Drive into the Colab filesystem at /content/drive/
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# install packages
!pip install transformers datasets evaluate
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [None]:
# log in to the Hugging Face Hub; renders an interactive login widget in the notebook
# NOTE(review): presumably required for the later push_to_hub calls - confirm
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# load the CSV file which holds the data used for finetuning
from datasets import load_dataset

csv_splits = load_dataset(
    'csv',
    data_files='add_path_to_dataset/allsides_data_right_heading.csv',
)
# keep only the 'train' entry of the loaded result
right_dataset = csv_splits['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# split data into training and test split (80 % / 20 %)
# A fixed seed makes the shuffle deterministic, so the same rows end up in the
# train and test split on every run and the reported losses stay comparable.
right_dataset = right_dataset.train_test_split(test_size=0.2, seed=42)

# Preprocess

In [None]:
# load the tokenizer that belongs to the pretrained model which should be used
from transformers import AutoTokenizer

checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# function to apply tokenizing
def preprocess_function(examples):
    """Tokenize the "heading" column of a batch of examples.

    Args:
        examples: a batch from the dataset; only its "heading" entry is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those headings.
    """
    # "heading" is the column which should be used for finetuning
    headings = examples["heading"]
    return tokenizer(headings)

In [None]:
# tokenize both splits in batches, using 4 worker processes
# the original columns are dropped so that only the tokenizer output remains
original_columns = right_dataset["train"].column_names
tokenized_right_dataset = right_dataset.map(
    preprocess_function, batched=True, num_proc=4, remove_columns=original_columns
)

Map (num_proc=4):   0%|          | 0/6464 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1617 [00:00<?, ? examples/s]

In [None]:
# chunk size for concatenation
block_size = 128

# function to concatenate data
def group_texts(examples):
    """Concatenate a batch of token sequences and cut it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of lists (one inner list
            per example). Must contain an "input_ids" column.

    Returns:
        A dict with the same keys as ``examples`` plus "labels". Every column
        holds chunks of ``block_size`` items taken from the concatenation of
        that column; "labels" is a shallow copy of the "input_ids" chunk list.
        When the concatenation is at least ``block_size`` long, the trailing
        remainder that does not fill a whole block is dropped; when it is
        shorter, it is returned as one single short chunk.
    """
    # local import keeps this cell self-contained
    from itertools import chain

    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # re-copies the growing accumulator for every example (quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # the length is measured on the first column
    total_length = len(next(iter(concatenated_examples.values())))
    if total_length >= block_size:
        # round down to a multiple of block_size, i.e. drop the remainder
        total_length = (total_length // block_size) * block_size
    # split by chunks of block_size
    chunk_starts = range(0, total_length, block_size)
    result = {
        k: [t[i : i + block_size] for i in chunk_starts]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# regroup the tokenized splits into blocks, batch-wise on 4 worker processes
lm_dataset = tokenized_right_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/6464 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1617 [00:00<?, ? examples/s]

In [None]:
# import the Trainer and data collator for finetuning purposes
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# register '[PAD]' as the pad token on the tokenizer
pad_token_spec = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(pad_token_spec)

# build the data collator around that tokenizer, with mlm switched off
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Train

In [None]:
# load the pretrained model which should be finetuned
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bert-base-uncased",
    is_decoder=True,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# finetune the pretrained model on the dataset

training_args = TrainingArguments(
    output_dir="bert_right_heading_torch",
    evaluation_strategy="epoch",
    # Log the training loss once per epoch as well, so that every row of the
    # per-epoch results table has a current training loss next to the
    # validation loss (the default step-based logging leaves early epochs
    # without a value).
    logging_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,  # uploads to the Hub repo named after output_dir
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    # Handing the tokenizer to the Trainer stores it next to the model
    # weights, so the saved / pushed model can be loaded without having to
    # supply a tokenizer separately.
    tokenizer=tokenizer,
)

trainer.train()

Cloning https://huggingface.co/tobijen/bert_right_heading_torch into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/418M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Download file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296868.b74458e43bdb.547.3: 100%|#######…

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296868.b74458e43bdb.547.3: 100%|##########…

Download file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296633.b74458e43bdb.547.2: 100%|#######…

Clean file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296633.b74458e43bdb.547.2:  13%|#2        …

Clean file pytorch_model.bin:   0%|          | 1.00k/418M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,5.57929
2,No log,5.102203
3,No log,4.904901
4,No log,4.806831
5,5.327600,4.752375
6,5.327600,4.714924
7,5.327600,4.696406
8,5.327600,4.678969
9,5.327600,4.672903
10,4.301000,4.673214


TrainOutput(global_step=1050, training_loss=4.784350324358259, metrics={'train_runtime': 156.5356, 'train_samples_per_second': 53.534, 'train_steps_per_second': 6.708, 'total_flos': 551414082816000.0, 'train_loss': 4.784350324358259, 'epoch': 10.0})

In [None]:
# evaluate the model and report its perplexity (exp of the evaluation loss)
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 107.04


In [None]:
# push the finetuned model to the Hugging Face Hub repository
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/418M [00:00<?, ?B/s]

Upload file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296633.b74458e43bdb.547.2:   0%|         …

Upload file runs/Jul25_14-50-28_b74458e43bdb/events.out.tfevents.1690296868.b74458e43bdb.547.3:   0%|         …

To https://huggingface.co/tobijen/bert_right_heading_torch
   cea9f47..45c235f  main -> main

   cea9f47..45c235f  main -> main

To https://huggingface.co/tobijen/bert_right_heading_torch
   45c235f..b6a393f  main -> main

   45c235f..b6a393f  main -> main



'https://huggingface.co/tobijen/bert_right_heading_torch/commit/45c235f836e86805924f36b2e2e289bf7dfd23df'

In [None]:
# save the model locally (replace the placeholder path with a real directory)
trainer.save_model("path_to_where_the_model_should_be_saved_locally/bert_right_heading_torch")

# Inference

In [None]:
# Prompts to generate text for: two each for the social, education and
# economic topics, keyed by "<topic>_<number>"
prompts = dict(
    social_1="Gay marriage is",
    social_2="Abortion is",
    education_1="Public education is",
    education_2="Charter schools are",
    economic_1="The increase of taxes is",
    economic_2="Government regulations on businesses are",
)

In [None]:
from transformers import pipeline

# Build the text-generation pipeline once. It does not depend on the prompt,
# and creating it inside the loop would reload the model for every prompt.
generator = pipeline("text-generation", model="tobijen/bert_right_heading_torch", tokenizer=tokenizer)

# generate text for every prompt and collect the results by prompt key
generated_text_dict = {}
for key, prompt in prompts.items():
    print(key, " => ", prompt)
    # return_full_text=True keeps the prompt at the start of the generated text
    generated_text = generator(prompt, return_full_text=True, max_new_tokens=100)
    # only the first returned candidate is stored
    generated_text_dict[key] = generated_text[0]["generated_text"]
    print(generated_text)

social_1  =>  Gay marriage is
[{'generated_text': "Gay marriage is a threat to the state trump :'we will not be a party'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he '"}]
social_2  =>  Abortion is
[{'generated_text': "Abortion is'''''s'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' s'''s'''s '"}]
education_1  =>  Public education is
[{'generated_text': "Public education is a'''s'' s'' s'' s'''s'' s'''' s'''s'' s'' s'''s'' s'''s'''''s'''s'''''' s'' s'''''''s'' s'''s'' s'' s'' s'' s'' s'' s"}]
education_2  =>  Charter schools are
[{'generated_text': "Charter schools are'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' s'''s'''s'''s'''s'''s'''s'''s'' s'' s'''s '"}]
economic_1  =>  The increase of taxes is
[{'generated_text': "The increase of taxes is a big step trump says h

In [None]:
# display the generated text collected for each prompt key
generated_text_dict

{'social_1': "Gay marriage is a threat to the state trump :'we will not be a party'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he's not a racist'trump says he '",
 'social_2': "Abortion is'''''s'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' s'''s'''s '",
 'education_1': "Public education is a'''s'' s'' s'' s'''s'' s'''' s'''s'' s'' s'''s'' s'''s'''''s'''s'''''' s'' s'''''''s'' s'''s'' s'' s'' s'' s'' s'' s",
 'education_2': "Charter schools are'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' s'''s'''s'''s'''s'''s'''s'''s'' s'' s'''s '",
 'economic_1': "The increase of taxes is a big step trump says he will not seek to resign as president trump says he will not seek presidential nomination trump says he's not a bad person'trump says he's not a bad person'trump says he's not a good person'trump says 

In [None]:
# add the generated text to a json file, which is used for the evaluation


import json
import os

def write_dict_to_json_file(file_path, data):
    """Merge ``data`` into the JSON object stored at ``file_path``.

    The file is created if it does not exist. Top-level keys that are already
    in the file are kept unless ``data`` contains the same key, in which case
    the value from ``data`` replaces the stored one.

    Args:
        file_path: path of the JSON file to create or update.
        data: dictionary whose entries are merged into the stored object.

    Raises:
        json.JSONDecodeError: if the file exists, is not empty and does not
            contain valid JSON.
        TypeError: if the merged data cannot be serialised to JSON; the file
            on disk is left untouched in that case.
    """
    existing_data = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
        # An empty file is treated like a missing one instead of failing
        # with a JSONDecodeError.
        if raw_text.strip():
            existing_data = json.loads(raw_text)

    # Update the existing dictionary with the new data
    existing_data.update(data)

    # Serialise before opening the file for writing: open(..., 'w') truncates
    # the file immediately, so an error raised during serialisation would
    # otherwise destroy the previously stored results.
    serialized = json.dumps(existing_data, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized)

# store the generated texts under the key "bert_right_heading"
data_to_append = dict(bert_right_heading=generated_text_dict)

# write data to json file
write_dict_to_json_file('path_to_json_file/generated_texts.json', data_to_append)