In [1]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [4]:
from torch.utils.data import Dataset

## Dataset

I've chosen [BBC news dataset](https://www.kaggle.com/datasets/pariza/bbc-news-summary). I selected the sports, tech, and business categories, which together comprise 1422 news articles.

Description of the dataset: This dataset was created using a dataset used for data categorization that onsists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005 used in the paper of D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006; whose all rights, including copyright, in the content of the original articles are owned by the BBC.

Creating dataframe based on txt files

In [5]:
def load_texts_from_dirs(dirs):
    texts = []
    for datadir in dirs:
        for txtfile in Path(datadir).glob("*.txt"):
            texts.append(txtfile.read_text(errors='ignore'))
    return texts

In [6]:
data_dirs = [
    Path("/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/sport/"),
    Path("/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/tech/"),
    Path("/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/business/"),
]


In [7]:
all_texts = load_texts_from_dirs(data_dirs)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_texts, eval_texts = train_test_split(all_texts, test_size=0.1, random_state=42)

In [10]:
model_name = "distilgpt2"

Description of the pr-trained model:
DistilGPT2 (short for Distilled-GPT2) is an English-language model pre-trained with the supervision of the smallest version of Generative Pre-trained Transformer 2 (GPT-2).

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [13]:
model = AutoModelForCausalLM.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
class TextLMBackingDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = enc.input_ids.squeeze()
        attention_mask = enc.attention_mask.squeeze()
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone()
        }

In [15]:
train_dataset = TextLMBackingDataset(train_texts, tokenizer)
eval_dataset = TextLMBackingDataset(eval_texts, tokenizer)

In [16]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [17]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)


In [19]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtiyunes[0m ([33mtiyunes-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,3.1491,3.172013


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=960, training_loss=3.1871498425801597, metrics={'train_runtime': 254.7495, 'train_samples_per_second': 15.062, 'train_steps_per_second': 3.768, 'total_flos': 250648908005376.0, 'train_loss': 3.1871498425801597, 'epoch': 3.0})

In [20]:
eval_results = trainer.evaluate()

In [21]:
trainer.save_model(training_args.output_dir)

In [22]:
tokenizer.save_pretrained(training_args.output_dir)

('/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/tokenizer_config.json',
 '/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/special_tokens_map.json',
 '/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/vocab.json',
 '/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/merges.txt',
 '/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/added_tokens.json',
 '/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/tokenizer.json')

In [44]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable_params}")

trainable parameters: 81912576


In [24]:
print("eval perplexity:", torch.exp(torch.tensor(eval_results["eval_loss"])))

eval perplexity: tensor(23.8555)


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [28]:
correct_phrases = [
    "the central bank raised interest rates",
    "the mayor unveiled the new infrastructure plan",
    "researchers reported a breakthrough in cancer treatment",
    "oil prices surged amid supply concerns",
    "the international summit concluded with a joint statement"
]

In [29]:
incorrect_phrases = [
    "bank central the raised rates interest",
    "mayor the plan unveiled infrastructure new the",
    "researchers a in cancer reported breakthrough treatment",
    "surged oil prices amid concerns supply",
    "joint concluded summit international the with statement a"
]

In [36]:
def perp_eval(model, tokenizer, text, device):
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return math.exp(loss.item())

In [31]:
import math

In [37]:
ppl_correct = []
for sent in correct_phrases:
    pp_ft = perp_eval(model, tokenizer, sent, device)
    ppl_correct.append(pp_ft)
    print(f"\n{sent}")
    print(f"Finetuned GPT-2: {pp_ft}")


the central bank raised interest rates
Finetuned GPT-2: 37.98008991281016

the mayor unveiled the new infrastructure plan
Finetuned GPT-2: 223.35457868615296

researchers reported a breakthrough in cancer treatment
Finetuned GPT-2: 105.31965562751718

oil prices surged amid supply concerns
Finetuned GPT-2: 675.705333547302

the international summit concluded with a joint statement
Finetuned GPT-2: 162.2422882043793


In [38]:
ppl_incorrect = []
for sent in incorrect_phrases:
    pp_ft = perp_eval(model, tokenizer, sent, device)
    ppl_incorrect.append(pp_ft)
    print(f"\n{sent}")
    print(f"Finetuned GPT-2: {pp_ft}")


bank central the raised rates interest
Finetuned GPT-2: 9788.96118527996

mayor the plan unveiled infrastructure new the
Finetuned GPT-2: 4273.027442692105

researchers a in cancer reported breakthrough treatment
Finetuned GPT-2: 3341.715887043425

surged oil prices amid concerns supply
Finetuned GPT-2: 693.9261557276558

joint concluded summit international the with statement a
Finetuned GPT-2: 14627.870656921175


In [39]:
print(f"average perplexity on correct phrases for fine-tuned model: {np.mean(ppl_correct)}")
print(f"average perplexity on incorrect phrases for fine-tuned model: {np.mean(ppl_incorrect)}")

average perplexity on correct phrases for fine-tuned model: 240.92038919563228
average perplexity on incorrect phrases for fine-tuned model: 6545.100265532865


In [40]:
prompts = [
    "I think",
    "She goes to",
    "In the future",
    "Blue dog",
    "Once upon a time"
]

In [43]:
for prompt in prompts:
    inp = tokenizer(prompt, return_tensors="pt").to(device)
    out = model.generate(
        **inp,
        max_new_tokens=30,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    print(f"{text}")
    print("-" * 50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I think the future of our music will continue to be a struggle for music lovers and fans alike.

The UK's oldest independent record label has begun to
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


She goes to prison and gets $1 million after filing suit

David Davis has been sentenced to life in prison for fraud, a US appeals court has found.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In the future has seen the emergence of the film industry as a multi-million dollar force of events and companies are turning their backs on technology.

Ahead
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Blue dog in London

Two of Britain's biggest companies are to announce its retirement on Wednesday - a move which has alarmed some campaigners.

It is
--------------------------------------------------
Once upon a time when a film could be called an English text, how many of the same would be translated into English today? Of course, the most basic definition is
--------------------------------------------------
