In [None]:
# Mount Google Drive into the Colab filesystem so that files stored on Drive
# are reachable under /content/drive/
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# install packages
!pip install transformers datasets evaluate
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the training cell
# further down is configured with push_to_hub=True and later calls
# trainer.push_to_hub(), so an authenticated session is presumably required.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the CSV that holds the data used for fine-tuning. Asking for
# split='train' returns that split directly instead of a dict of splits.
from datasets import load_dataset

left_heading_dataset = load_dataset(
    'csv',
    data_files='add_path_to_dataset/allsides_data_left_heading.csv',
    split='train',
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Split the data into a training (80%) and a test (20%) subset.
# A fixed seed makes the shuffle -- and therefore every downstream loss and
# perplexity figure -- reproducible across kernel restarts.
left_heading_dataset = left_heading_dataset.train_test_split(test_size=0.2, seed=42)

# Preprocess

In [None]:
# Load the tokenizer that matches the pretrained checkpoint to be fine-tuned
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [None]:
# Tokenization function that is mapped over the dataset
def preprocess_function(examples):
    """Tokenize the "heading" column of ``examples``.

    ``examples`` is indexed by column name; whatever is stored under
    "heading" is passed to the module-level ``tokenizer`` and the tokenizer's
    output is returned unchanged.
    """
    headings = examples["heading"]  # column that is used for fine-tuning
    return tokenizer(headings)

In [None]:
# Tokenize the dataset in batches. Every original column is removed from the
# result, so only the fields returned by preprocess_function remain.
original_columns = left_heading_dataset["train"].column_names
tokenized_left_heading_dataset = left_heading_dataset.map(
    preprocess_function, batched=True, num_proc=4, remove_columns=original_columns
)

Map (num_proc=4):   0%|          | 0/9156 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2290 [00:00<?, ? examples/s]

In [None]:
# chunk size for concatenation
block_size = 128

# function to concatenate data
def group_texts(examples):
    """Concatenate a batch of token lists and cut it into ``block_size`` chunks.

    Args:
        examples: Mapping from field name (e.g. "input_ids") to a list of
            per-example lists. All fields are expected to have the same total
            length; the length of the first field is used for chunking.

    Returns:
        A dict with the same keys as ``examples`` where each value is a list
        of chunks of ``block_size`` items, plus a "labels" entry that is a
        copy of the "input_ids" chunk list.

        When the concatenated length is at least ``block_size``, the trailing
        remainder that does not fill a whole chunk is dropped. When it is
        shorter than ``block_size``, everything is kept as one short chunk.
    """
    # Local import so the function stays self-contained.
    from itertools import chain

    # Flatten each field in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a multiple of block_size (drops the remainder).
        total_length = (total_length // block_size) * block_size
    # split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply the concatenate-and-chunk step (group_texts) to the tokenized dataset
lm_dataset = tokenized_left_heading_dataset.map(group_texts, batched=True, num_proc=4)
# To inspect a single grouped example, evaluate: lm_dataset['train'][0]

In [None]:
# Import the data collator used to batch examples for language-model training
from transformers import DataCollatorForLanguageModeling

# Give the tokenizer a pad token by reusing its end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # mlm=False: masked-language-modeling is switched off

# Train


In [None]:
# Load the pretrained model that is going to be fine-tuned
# (TrainingArguments and Trainer are imported here for the training cell)
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Fine-tune the pretrained model on the grouped dataset

training_args = TrainingArguments(
    output_dir="distillgpt2_left_headings_torch",
    evaluation_strategy="epoch",
    # Report the training loss once per epoch, in step with evaluation.
    # With the step-based logging default the results table shows "No log"
    # for the first epochs of this short run.
    logging_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=8
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

Cloning https://huggingface.co/tobijen/distillgpt2_left_headings_torch into local empty directory.


Download file pytorch_model.bin:   0%|          | 17.0k/312M [00:00<?, ?B/s]

Download file runs/Jul25_11-19-48_4488ff49d703/events.out.tfevents.1690284074.4488ff49d703.230.0: 100%|#######…

Download file runs/Jul23_11-47-38_9af773529953/events.out.tfevents.1690112863.9af773529953.4438.0: 100%|######…

Download file runs/Jul25_11-27-39_4488ff49d703/events.out.tfevents.1690284462.4488ff49d703.230.2: 100%|#######…

Download file runs/Jul25_11-35-38_4488ff49d703/events.out.tfevents.1690284941.4488ff49d703.230.4: 100%|#######…

Download file runs/Jul24_16-43-34_cc22a5ddb863/events.out.tfevents.1690217099.cc22a5ddb863.155.0: 100%|#######…

Clean file runs/Jul25_11-19-48_4488ff49d703/events.out.tfevents.1690284074.4488ff49d703.230.0:  16%|#5        …

Clean file runs/Jul23_11-47-38_9af773529953/events.out.tfevents.1690112863.9af773529953.4438.0:  16%|#5       …

Download file runs/Jul23_20-53-15_2991a2587f1f/events.out.tfevents.1690145708.2991a2587f1f.173.0: 100%|#######…

Clean file runs/Jul25_11-27-39_4488ff49d703/events.out.tfevents.1690284462.4488ff49d703.230.2:  12%|#2        …

Clean file runs/Jul25_11-35-38_4488ff49d703/events.out.tfevents.1690284941.4488ff49d703.230.4:  13%|#3        …

Clean file runs/Jul24_16-43-34_cc22a5ddb863/events.out.tfevents.1690217099.cc22a5ddb863.155.0:  15%|#4        …

Clean file runs/Jul23_20-53-15_2991a2587f1f/events.out.tfevents.1690145708.2991a2587f1f.173.0:  19%|#9        …

Download file runs/Jul24_14-10-57_b1f0e5f073ec/events.out.tfevents.1690207989.b1f0e5f073ec.390.0: 100%|#######…

Clean file runs/Jul24_14-10-57_b1f0e5f073ec/events.out.tfevents.1690207989.b1f0e5f073ec.390.0:  16%|#6        …

Download file runs/Jul24_16-43-34_cc22a5ddb863/events.out.tfevents.1690226215.cc22a5ddb863.155.1: 100%|#######…

Download file runs/Jul25_10-16-24_5ce49ead0f25/events.out.tfevents.1690280291.5ce49ead0f25.696.0: 100%|#######…

Clean file runs/Jul24_16-43-34_cc22a5ddb863/events.out.tfevents.1690226215.cc22a5ddb863.155.1: 100%|##########…

Clean file runs/Jul25_10-16-24_5ce49ead0f25/events.out.tfevents.1690280291.5ce49ead0f25.696.0:  21%|##        …

Download file runs/Jul25_11-27-39_4488ff49d703/events.out.tfevents.1690284901.4488ff49d703.230.3: 100%|#######…

Clean file runs/Jul25_11-27-39_4488ff49d703/events.out.tfevents.1690284901.4488ff49d703.230.3: 100%|##########…

Download file runs/Jul23_11-47-38_9af773529953/events.out.tfevents.1690123305.9af773529953.4438.1: 100%|######…

Clean file runs/Jul23_11-47-38_9af773529953/events.out.tfevents.1690123305.9af773529953.4438.1: 100%|#########…

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Download file runs/Jul25_11-19-48_4488ff49d703/events.out.tfevents.1690284130.4488ff49d703.230.1: 100%|#######…

Clean file runs/Jul25_11-19-48_4488ff49d703/events.out.tfevents.1690284130.4488ff49d703.230.1:  71%|#######   …

Download file runs/Jul25_11-35-38_4488ff49d703/events.out.tfevents.1690285096.4488ff49d703.230.5: 100%|#######…

Clean file runs/Jul25_11-35-38_4488ff49d703/events.out.tfevents.1690285096.4488ff49d703.230.5: 100%|##########…

Clean file pytorch_model.bin:   0%|          | 1.00k/312M [00:00<?, ?B/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.744651
2,No log,4.638951
3,No log,4.590534
4,4.721600,4.571173
5,4.721600,4.557534
6,4.721600,4.553393
7,4.721600,4.548618
8,4.377500,4.547396


TrainOutput(global_step=1016, training_loss=4.546042727673147, metrics={'train_runtime': 76.8695, 'train_samples_per_second': 105.113, 'train_steps_per_second': 13.217, 'total_flos': 263909718097920.0, 'train_loss': 4.546042727673147, 'epoch': 8.0})

In [None]:
# Evaluate the model and report its perplexity, computed as exp(eval_loss)
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

94.38627835494977

In [None]:
# Push the fine-tuned model to the Hugging Face Hub repository
trainer.push_to_hub()

In [None]:
# Save the model to a local directory (replace the placeholder path first)
trainer.save_model("path_to_where_the_model_should_be_saved_locally/distillgpt2_left_headings_torch")

# Inference

In [None]:
# Prompts to generate text for, keyed by "<topic>_<index>"
prompts = {
    # social topics
    "social_1": "Gay marriage is",
    "social_2": "Abortion is",
    # education topics
    "education_1": "Public education is",
    "education_2": "Charter schools are",
    # economic topics
    "economic_1": "The increase of taxes is",
    "economic_2": "Government regulations on businesses are",
}

In [None]:
# generate text with the pipeline module from the transformers library
from transformers import pipeline

# Build the pipeline once, outside the loop: it does not depend on the prompt,
# and constructing it per iteration reloads the model for every prompt.
generator = pipeline("text-generation", model="tobijen/distillgpt2_left_headings_torch", tokenizer=tokenizer)

# Maps each prompt key to the text generated for that prompt
generated_text_dict = {}
for key, prompt in prompts.items():
  print(key, " => ", prompt)
  generated_text = generator(prompt)
  # The pipeline returns a list of dicts; keep the text of the first result
  generated_text_dict[key] = generated_text[0]["generated_text"]
  print(generated_text)


social_1  =>  Gay marriage is


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Gay marriage is a social test,” he says.President Trump is poised to launch another crackdown on anti-Trump ralliesTrump says China’s ‘cures a massive threat’ on U.S. businessesFDA to give'}]
social_2  =>  Abortion is


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Abortion is not something that would be tolerated. But it will be a terrible problem for everyoneOPINION: Romney to announce plans for a new administration over Obamacare: 'What this means'Trump's White House: ‘The'most evil"}]
education_1  =>  Public education is


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Public education is helping the poor,” the American public warns against itAll of these events will impact our lives on the gridThe coronavirus pandemic will end with widespread hospitalizations across the countryTrump Calls on Congress to Help Fund Doctors:'}]
education_2  =>  Charter schools are


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Charter schools are no longer underfundedA new survey finds that some businesses face growing debt ceiling pressure after a massive debt default in OctoberObama to block new money for emergency workers in nursing homes, schoolsWhite House agrees to stop sending $100 million in'}]
economic_1  =>  The increase of taxes is


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The increase of taxes is a big economic boost: U.S. Jobless Rate Hits 3.5% in July. That’s a bad start, but it helps drive jobless numbers to the verge of an end.As many Americans'}]
economic_2  =>  Government regulations on businesses are


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Government regulations on businesses are taking aim at Russian banksBiden announces New G-8 summit in Switzerland ahead of Trump's visitThe National Rifle Association is poised to launch its tax cut effortThis is just the beginning: Republicans get another chance to win the"}]


In [None]:
# Display the collected generations, keyed by prompt id
generated_text_dict

{'social_1': 'Gay marriage is a social test,” he says.President Trump is poised to launch another crackdown on anti-Trump ralliesTrump says China’s ‘cures a massive threat’ on U.S. businessesFDA to give',
 'social_2': "Abortion is not something that would be tolerated. But it will be a terrible problem for everyoneOPINION: Romney to announce plans for a new administration over Obamacare: 'What this means'Trump's White House: ‘The'most evil",
 'education_1': 'Public education is helping the poor,” the American public warns against itAll of these events will impact our lives on the gridThe coronavirus pandemic will end with widespread hospitalizations across the countryTrump Calls on Congress to Help Fund Doctors:',
 'education_2': 'Charter schools are no longer underfundedA new survey finds that some businesses face growing debt ceiling pressure after a massive debt default in OctoberObama to block new money for emergency workers in nursing homes, schoolsWhite House agrees to stop sendi

In [None]:
# add the generated text to a JSON file, which is used for the evaluation
import json
import os

def write_dict_to_json_file(file_path, data):
    """Merge ``data`` into the JSON object stored at ``file_path``.

    If the file exists, its content is loaded and updated with ``data``
    (keys in ``data`` overwrite existing keys of the same name). If the file
    does not exist, or exists but is empty, ``data`` becomes the whole content.

    The merged object is serialized *before* the file is opened for writing,
    so a serialization failure leaves the existing file untouched instead of
    truncating it and losing previously stored results.

    Args:
        file_path: Path of the JSON file to create or update.
        data: Dictionary whose items are merged into the stored object.

    Raises:
        json.JSONDecodeError: If the existing file contains invalid JSON.
        TypeError: If the merged data contains a value JSON cannot encode.
    """
    existing_data = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # An empty (or whitespace-only) file is treated like a missing one
        # rather than failing with a JSONDecodeError.
        if content.strip():
            existing_data = json.loads(content)

    # Update the existing dictionary with the new data
    existing_data.update(data)

    # Serialize first: opening with 'w' truncates the file immediately, so an
    # encoding error raised afterwards would destroy the existing content.
    serialized = json.dumps(existing_data, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized)

# Nest the generated texts under a model-specific key, so that they are merged
# into the JSON file as a single entry for this model
data_to_append = {
    "distillgpt2_left_headlines": generated_text_dict
}

# Write the data to the JSON file (replace the placeholder path first)
write_dict_to_json_file('path_to_json_file/generated_texts.json', data_to_append)