In [None]:
# Mount Google Drive into the Colab filesystem.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
# install packages

!pip install transformers datasets evaluate
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably needed for the push_to_hub calls further down — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the CSV dataset which should be used for fine-tuning.
from datasets import load_dataset

# The loader returns a mapping of splits; the rows are taken from its 'train' entry.
csv_splits = load_dataset('csv', data_files='add_path_to_dataset/allsides_data_left_heading.csv')
left_dataset = csv_splits['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Split the data into a training and a test split (80% / 20%).
# The split shuffles the rows, so a fixed seed is passed to keep the partition
# (and therefore the evaluation numbers computed later) identical across runs.
left_dataset = left_dataset.train_test_split(test_size=0.2, seed=42)

# Preprocess

In [None]:
# Load the tokenizer of the pretrained model which should be fine-tuned.
from transformers import AutoTokenizer

tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenization step applied to the dataset.
def preprocess_function(examples):
    """Tokenize the ``heading`` column of ``examples``.

    Args:
        examples: Mapping that provides the texts under the key ``"heading"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    headings = examples["heading"]  # column which should be used for fine-tuning
    return tokenizer(headings)

In [None]:
# Tokenize the dataset in batches, using 4 worker processes.
# The original columns are removed so only the tokenizer output remains.
original_columns = left_dataset["train"].column_names
tokenized_left_dataset = left_dataset.map(
    preprocess_function,
    remove_columns=original_columns,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/9156 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2290 [00:00<?, ? examples/s]

In [1]:
# chunk size (number of items per chunk) for concatenation
block_size = 128

# function to concatenate data
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists,
            e.g. ``{"input_ids": [[...], [...]], "attention_mask": [[...], [...]]}``.
            It must contain an ``"input_ids"`` column.

    Returns:
        dict: For every input column, a list of chunks of ``block_size``
        items each, plus a ``"labels"`` column that is a copy of the
        ``"input_ids"`` chunks. A trailing remainder shorter than
        ``block_size`` is dropped. If the whole batch holds fewer than
        ``block_size`` items, it is returned as one shorter chunk.
    """
    from itertools import chain

    # Flatten each column in linear time; ``sum(lists, [])`` copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns are assumed to have the same flattened length, so the first
    # one is used as the reference.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a multiple of block_size (drops the remainder).
        total_length = (total_length // block_size) * block_size
    # split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply the concatenation/chunking to every split, in batches, with 4 worker processes.
lm_dataset = tokenized_left_dataset.map(
    group_texts,
    num_proc=4,
    batched=True,
)
# To inspect a single chunk, uncomment:
#lm_dataset['train'][0]

In [None]:
# Import the Trainer utilities and the data collator used for fine-tuning.
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Register the pad token on the tokenizer before it is handed to the collator.
pad_token_spec = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(pad_token_spec)

# mlm=False: the collator is not configured for masked-language-modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Train

In [None]:
# Load the pretrained model which should be fine-tuned, configured as a decoder.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bert-base-uncased",
    is_decoder=True,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Fine-tune the pretrained model on the chunked dataset.

# Hyperparameters and output/Hub settings for the run.
training_args = TrainingArguments(
    output_dir="bert_left_heading_torch",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=True,
)

# Wire model, data splits and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
)

trainer.train()

In [None]:
# Evaluate the fine-tuned model and report its perplexity.
import math

# Perplexity is computed here as exp(eval_loss), printed with two decimals.
# NOTE(review): assumes eval_loss is a mean per-token cross-entropy — confirm.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 84.02


In [None]:
# Push the fine-tuned model to the Hugging Face Hub repository.
# NOTE(review): presumably requires the notebook_login() cell to have succeeded — confirm.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/418M [00:00<?, ?B/s]

Upload file runs/Jul26_16-52-41_9507d0f13a76/events.out.tfevents.1690390487.9507d0f13a76.506.0:   0%|         …

Upload file runs/Jul26_16-52-41_9507d0f13a76/events.out.tfevents.1690390732.9507d0f13a76.506.1:   0%|         …

To https://huggingface.co/tobijen/bert_left_heading_torch
   5407769..d72180d  main -> main

   5407769..d72180d  main -> main

To https://huggingface.co/tobijen/bert_left_heading_torch
   d72180d..dc35e55  main -> main

   d72180d..dc35e55  main -> main



'https://huggingface.co/tobijen/bert_left_heading_torch/commit/d72180dd94c88b49222bb7f3deac3ef42679e2db'

In [None]:
# Save the model locally as well.
# The path below is a placeholder; replace it with a real target directory.
trainer.save_model("path_to_where_the_model_should_be_saved_locally/bert_left_heading_torch")

# Inference

In [None]:
# Prompts to generate text for, keyed by "<topic>_<index>".
prompts = {
    # social topics
    "social_1": "Gay marriage is",
    "social_2": "Abortion is",
    # education topics
    "education_1": "Public education is",
    "education_2": "Charter schools are",
    # economic topics
    "economic_1": "The increase of taxes is",
    "economic_2": "Government regulations on businesses are",
}

In [None]:
from transformers import pipeline

# Build the text-generation pipeline once, before the loop: it does not depend
# on the prompt, and constructing it per prompt repeats the model loading.
generator = pipeline("text-generation", model="tobijen/bert_left_heading_torch", tokenizer=tokenizer)

# Generate text for every prompt and collect the result under the prompt's key.
generated_text_dict = {}
for key, prompt in prompts.items():
  print(key, " => ", prompt)
  # return_full_text=True: the prompt is kept as part of the returned text.
  generated_text = generator(prompt, return_full_text=True, max_new_tokens=100)
  # Only the first returned candidate is stored.
  generated_text_dict[key] = generated_text[0]["generated_text"]
  print(generated_text)

social_1  =>  Gay marriage is


Downloading (…)lve/main/config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'generated_text': "Gay marriage is a problem trump's'no - nonsense'' speech :'we're not a'' trump's'biggest secret'to watch for the next election trump's'no - nonsense'' speech :'i don ’ t know'trump says he's not ready to take over trump's presidency trump ’ s'' no, no, no no, not not. trump ’ s new york times :"}]
social_2  =>  Abortion is
[{'generated_text': "Abortion is the right - most - likely person to be elected president trump's'no - nonsense'' speech :'i'm not a'man'trump's'no - nonsense'' speech : trump's'best defense'trump's'no - nonsense'' speech : trump's'very wrong'speech trump's'no - nonsense'' speech trump's'no - nonsense'' speech is a"}]
education_1  =>  Public education is
[{'generated_text': "Public education is a major hurdle trump's'no - nonsense'' speech trump's'no - nonsense'' speech trump's'no - nonsense'' speech :'i'm not a racist'trump's'no - nonsense'' speech : trump's'' no - nonsense'' speech trump's'no - nonsense'' speech is a'' no - nonsense'speech trum

In [None]:
# Display the collected generations (prompt key -> generated text).
generated_text_dict

{'social_1': "Gay marriage is a problem trump's'no - nonsense'' speech :'we're not a'' trump's'biggest secret'to watch for the next election trump's'no - nonsense'' speech :'i don ’ t know'trump says he's not ready to take over trump's presidency trump ’ s'' no, no, no no, not not. trump ’ s new york times :",
 'social_2': "Abortion is the right - most - likely person to be elected president trump's'no - nonsense'' speech :'i'm not a'man'trump's'no - nonsense'' speech : trump's'best defense'trump's'no - nonsense'' speech : trump's'very wrong'speech trump's'no - nonsense'' speech trump's'no - nonsense'' speech is a",
 'education_1': "Public education is a major hurdle trump's'no - nonsense'' speech trump's'no - nonsense'' speech trump's'no - nonsense'' speech :'i'm not a racist'trump's'no - nonsense'' speech : trump's'' no - nonsense'' speech trump's'no - nonsense'' speech is a'' no - nonsense'speech trump's'no",
 'education_2': "Charter schools are not ready to be a'''' trump's first o

In [None]:
# add the generated text to a JSON file, which is used for the evaluation
import json
import os

def write_dict_to_json_file(file_path, data):
    """Merge ``data`` into the JSON object stored at ``file_path``.

    The existing top-level object (if any) is loaded, updated with ``data``
    (keys already present are overwritten), and written back as UTF-8 JSON
    with non-ASCII characters kept unescaped.

    Args:
        file_path: Path of the JSON file. It is created if missing, and any
            missing parent directories are created as well.
        data: Dictionary whose top-level items are merged into the file.

    Raises:
        json.JSONDecodeError: If the file exists and contains invalid JSON.
    """
    existing_data = {}

    # Load the current content if there is any. A file that exists but is
    # empty (or whitespace only) is treated like a missing file instead of
    # failing in the JSON parser.
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
        if raw_text.strip():
            existing_data = json.loads(raw_text)

    # Update the existing dictionary with the new data
    existing_data.update(data)

    # Make sure the target directory exists before writing.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the updated dictionary back to the JSON file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(existing_data, file, ensure_ascii=False)

# Nest this model's generations under a key of their own, so the merge below
# adds or replaces only that entry in the results file.
data_to_append = {"bert_left_heading": generated_text_dict}

# Merge the entry into the JSON results file.
write_dict_to_json_file(
    'path_to_json_file/generated_texts.json',
    data_to_append,
)