# Tune a Turkish GPT-2 with PPO to generate economy-related news



In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
%pip install transformers trl wandb datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.4.4-py3-none-any.whl (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.4/68.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.4-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.

In [3]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [4]:
# PPO hyper-parameters: the checkpoint to fine-tune, the optimiser step size,
# and the experiment tracker that receives the training statistics.
config = PPOConfig(
    model_name="redrussianarmy/gpt2-turkish-cased",
    learning_rate=1e-5,
    log_with="wandb",
)

# Keyword arguments forwarded to the classifier pipeline when scoring text:
# every label's score is returned and no post-processing function is applied,
# so the scores are unnormalised (the rewards reported later fall outside [0, 1]).
sent_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=16,
)

In [5]:
import wandb

# Start a Weights & Biases run; on first use this prompts for an API key
# (see the login prompt in the cell output).
wandb.init()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
def build_dataset(
    config,
    dataset_name="yankihue/turkish-news-categories",
    input_min_text_length=2,
    input_max_text_length=8,
    categories=("siyaset", "kultur"),
):
    """
    Build the prompt dataset used for PPO training.

    Loads the `train` split of `dataset_name` as (`headline`, `category`) rows, keeps
    headlines longer than 10 characters, truncates them to 250 characters, keeps only
    rows whose category is in `categories`, and turns the leading tokens of every
    headline into a short query for the language model.

    Args:
        config:
            Object exposing `model_name`; the tokenizer of that checkpoint is used.
        dataset_name (`str`):
            The name of the dataset to be loaded with `load_dataset`.
        input_min_text_length (`int`):
            Lower bound handed to `LengthSampler` for the query length in tokens.
        input_max_text_length (`int`):
            Upper bound handed to `LengthSampler` for the query length in tokens.
        categories (iterable of `str`):
            Category values whose headlines are kept as prompts.

    Returns:
        dataset:
            The filtered dataset in torch format, with two added columns:
            `input_ids` (truncated token ids) and `query` (their decoded text).
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the raw headlines; the two columns are named explicitly.
    ds = load_dataset(dataset_name, column_names=['headline', 'category'], split="train")
    ds = ds.filter(lambda x: len(x["headline"]) > 10, batched=False)
    ds = ds.map(lambda x: {"headline": x["headline"][:250]}, batched=False)

    # Materialise once so membership tests work for any iterable, including generators.
    allowed_categories = tuple(categories)
    ds = ds.filter(lambda x: x["category"] in allowed_categories, batched=False)

    # Each call to `input_size()` draws the number of tokens to keep for one query.
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["headline"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [22]:
# Build the prompt dataset with the default settings of `build_dataset`.
dataset = build_dataset(config)

def collator(data):
    """Turn a list of per-sample dicts into one dict mapping each key to a list of values."""
    # The keys are taken from the first sample; every sample must provide all of them.
    return {field: [sample[field] for sample in data] for field in data[0]}



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
# Number of prompts that remain after filtering.
dataset.num_rows


2973

In [1]:
# Load two copies of the same checkpoint with a value head: `model` is the one
# passed to PPOTrainer as the policy and `ref_model` is passed as its reference.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Align the tokenizer's EOS id with the model config and reuse EOS as the padding token.
tokenizer.eos_token_id = model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

NameError: ignored

In [9]:
# Create the PPO trainer from the policy, the reference model, the tokenizer and
# the prompt dataset; `collator` assembles the per-sample dicts into batches.
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

In [10]:
# Pick the device for the reward classifier: in a single-process run use GPU 0
# when CUDA is available (else the CPU); otherwise use the accelerator's device.
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
else:
    device = ppo_trainer.accelerator.device

# Text classifier whose per-label scores are used as the reward signal.
classification_pipe = pipeline(
    task="sentiment-analysis",
    model="savasy/bert-turkish-text-classification",
    device=device,
    return_all_scores=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [11]:
# Sanity check on a politics headline. Every label is listed in the result;
# the 'economy' entry sits at index 1, which the reward code relies on.
text = "selahattin demirtaş açıklama yaptı: başbakan gidici"
output = classification_pipe(text)
output

[[{'label': 'world', 'score': 0.0002010414027608931},
  {'label': 'economy', 'score': 0.00039737450424581766},
  {'label': 'culture', 'score': 0.0003916260611731559},
  {'label': 'health', 'score': 0.00016651807527523488},
  {'label': 'politics', 'score': 0.9981176853179932},
  {'label': 'sport', 'score': 0.00043175750761292875},
  {'label': 'technology', 'score': 0.00029402965446934104}]]

In [12]:
# Second sanity check, this time on a technology headline.
text = "Apple, yeni işlemcilerini tanıttı"
output = classification_pipe(text)
output

[[{'label': 'world', 'score': 0.0006622957298532128},
  {'label': 'economy', 'score': 0.0009467191994190216},
  {'label': 'culture', 'score': 0.0002528876066207886},
  {'label': 'health', 'score': 0.00041032774606719613},
  {'label': 'politics', 'score': 0.0008105904562398791},
  {'label': 'sport', 'score': 0.00032685601036064327},
  {'label': 'technology', 'score': 0.9965903759002686}]]

In [13]:
# Generation settings used for the before/after comparison: sampling is enabled,
# `top_k` is 0 and `top_p` is 1.0, and the EOS token id doubles as the padding id.
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}

In [14]:
# The number of new tokens generated for each query is drawn from this sampler.
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings shared by every generation call; `max_new_tokens` is passed
# per call instead of being written into this dict inside the loop.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# Wrap the dataloader itself (not `enumerate(...)`) so tqdm can read its length
# and display a total and an ETA.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        response = ppo_trainer.generate(query, max_new_tokens=gen_len, **generation_kwargs)
        # Drop the prompt by its length. Slicing the last `gen_len` tokens would pull
        # prompt tokens into the response whenever generation stops early on EOS.
        response_tensors.append(response.squeeze()[len(query):])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Score query + response with the topic classifier
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = classification_pipe(texts, **sent_kwargs)
    # Index 1 is the 'economy' label in the classifier output; its score is the reward.
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
11it [16:11, 88.31s/it]


In [1]:
#### get a batch from the dataset
bs = 16
game_data = dict()
# `with_format` returns a re-formatted view and leaves `dataset` in its torch
# format, so this cell can be re-run and the training cell still works afterwards.
df_batch = dataset.with_format("pandas")[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    prompt = torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device)
    prompt_len = prompt.shape[-1]
    # The reference model generates first, then the tuned model, from the same prompt
    # and with the same length budget.
    for policy, responses in ((ref_model, response_tensors_ref), (model, response_tensors)):
        output = policy.generate(prompt, max_new_tokens=gen_len, **gen_kwargs).squeeze()
        # Strip the prompt by its length. Taking the last `gen_len` tokens would keep
        # prompt tokens whenever generation ends early on EOS.
        responses.append(output[prompt_len:])

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### classifier score of query/response pairs before/after
for stage in ("before", "after"):
    texts = [q + r for q, r in zip(game_data["query"], game_data[f"response ({stage})"])]
    # Index 1 is the 'economy' label in the classifier output.
    game_data[f"rewards ({stage})"] = [
        output[1]["score"] for output in classification_pipe(texts, **sent_kwargs)
    ]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results


NameError: ignored

In [16]:
# Serialise the comparison table to a CSV string; no path is given, so the
# text is shown as the cell output rather than written to a file.
df_results.to_csv()

',query,response (before),response (after),rewards (before),rewards (after)\n0,Eski İçişleri Bakanı Şahin\'e coşkulu," bir mektup yazarak ""Yaralananlar davaya gelmeyeceğini söylüyorlar"," ve demokratik fiyatlar yapıldı. Vali çalışmaları, Endonezya\'nın",-0.7360561490058899,0.5865375399589539\n1,Huzur Sokağı duygulandırdı,".\nSivas\'ta işyer", TÜV Osmaniye Sigortacılık ve Toplu Konut,-0.061551667749881744,6.187183380126953\n2,Sümer Ezgü:," “Kıbrıs Türklüğünün onarımı kadar,", Sanayi maliyetlerini minimum onlar kadar değer.,-0.7078996896743774,6.503681659698486\n3,Samsun\'da, Trabzonspor deplasmanında ilk yarıyı, düzenlenen ihracatların yüzde 9,-0.9319600462913513,3.6198112964630127\n4,Başbakan Erdoğan," isterse şunları söyledi: “Hem İngiltere hem de ABD, veranda",", ""Kamu Gayrimenkul Geliştirme, Finansal ve Finansal İşletme ve Dışında",-0.5470756888389587,7.035712242126465\n5,FLAŞ! Kabinede değişiklik, yok Kabineinleşmeyen sinir sistemi zayıf Şair ‘KIZLIKLAR," arz ediyor, şirketlerin 

In [17]:
# Compare the classifier reward of the reference and the tuned model.
reward_columns = df_results[["rewards (before)", "rewards (after)"]]

print("mean:")
display(reward_columns.mean())
print()
print("median:")
display(reward_columns.median())

mean:


rewards (before)   -0.553348
rewards (after)     4.374639
dtype: float64


median:


rewards (before)   -0.721978
rewards (after)     5.523491
dtype: float64

In [20]:
from huggingface_hub import notebook_login, create_repo

# Repository name shared by the model and tokenizer uploads.
hub_repo_name = "gpt2-tr-uncontrolled-classification-news-economics-final"

# Authenticate interactively, then save both artefacts locally and push them to the Hub.
notebook_login()
model.save_pretrained(hub_repo_name, push_to_hub=True)
tokenizer.save_pretrained(hub_repo_name, push_to_hub=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]



('gpt2-tr-uncontrolled-classification-news-economics-final/tokenizer_config.json',
 'gpt2-tr-uncontrolled-classification-news-economics-final/special_tokens_map.json',
 'gpt2-tr-uncontrolled-classification-news-economics-final/vocab.json',
 'gpt2-tr-uncontrolled-classification-news-economics-final/merges.txt',
 'gpt2-tr-uncontrolled-classification-news-economics-final/added_tokens.json',
 'gpt2-tr-uncontrolled-classification-news-economics-final/tokenizer.json')