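# Tune a Turkish GPT-2 with PPO to generate positive tweets

This notebook fine-tunes `redrussianarmy/gpt2-turkish-cased` with TRL's `PPOTrainer`, using the positive-class logit of the `savasy/bert-base-turkish-sentiment-cased` classifier as the reward, and then compares generations from the reference model and the tuned model.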


In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%pip install transformers trl wandb datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [4]:
# PPO configuration. `config.model_name` is reused further down to load the tokenizer,
# the policy model and the reference model, so all three come from the same checkpoint.
# `log_with="wandb"` presumably routes training statistics to Weights & Biases — confirm
# against the installed trl version.
config = PPOConfig(
    model_name="redrussianarmy/gpt2-turkish-cased",
    learning_rate=1e-5,
    log_with="wandb",
)

# Keyword arguments forwarded to every `sentiment_pipe(...)` call in this notebook.
# With these settings the classifier returns one unnormalised score per label.
sent_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=64,
)

In [5]:
import wandb

# Start a Weights & Biases run with default settings (no project or run name is passed).
wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33myankihue[0m ([33mdata-aug-rlhf[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
def build_dataset(
    config,
    dataset_name="yankihue/tweets-turkish",
    input_min_text_length=2,
    input_max_text_length=8,
):
    """
    Build the query dataset used for PPO training.

    Loads the `train` split of `dataset_name`, keeps tweets longer than 30
    characters, and truncates each tweet to a randomly sampled number of
    tokens so that it can serve as a generation prompt.

    Args:
        config:
            Object exposing `model_name`; the tokenizer is loaded from it.
        dataset_name (`str`):
            The name of the dataset to be loaded. It is read with the two
            column names `Paylaşım` (tweet text) and `Tip` (label), so a
            replacement dataset must have that two-column layout.
        input_min_text_length (`int`):
            Lower bound handed to `LengthSampler` for the prompt length in tokens.
        input_max_text_length (`int`):
            Upper bound handed to `LengthSampler` for the prompt length in tokens.

    Returns:
        ds (`datasets.Dataset`):
            The dataset in torch format with columns `tweet`, `sentiment`,
            `input_ids` (truncated token ids) and `query` (their decoded text).
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Previously the dataset id was hard-coded here and `dataset_name` was ignored.
    ds = load_dataset(dataset_name, column_names=["Paylaşım", "Tip"], split="train")
    ds = ds.rename_columns({"Paylaşım": "tweet", "Tip": "sentiment"})

    # Drop short tweets; the type check also skips rows whose text cell is empty (None),
    # which would otherwise make `len(...)` raise.
    ds = ds.filter(
        lambda x: isinstance(x["tweet"], str) and len(x["tweet"]) > 30,
        batched=False,
    )

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        # A fresh prompt length is drawn for every sample.
        sample["input_ids"] = tokenizer.encode(sample["tweet"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [7]:
# Build the truncated-tweet query dataset using the function's default length settings.
dataset = build_dataset(config)

def collator(data):
    """
    Regroup a list of per-sample dicts into a single dict of per-key lists.

    The key set is taken from the first sample, and every sample is expected
    to provide those keys. Values are collected as-is (no stacking or padding).
    """
    first_sample = data[0]
    return {key: [sample[key] for sample in data] for key in first_sample}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Cap the number of training prompts so a full PPO pass stays affordable.
MAX_TRAIN_SAMPLES = 5000
# `min(...)` keeps this cell from raising when fewer rows than the cap survived the length filter.
dataset = dataset.select(range(min(MAX_TRAIN_SAMPLES, dataset.num_rows)))
dataset.num_rows


5000

In [9]:
# Load two independent copies of the same checkpoint: `model` is the policy that PPO
# updates, `ref_model` is the reference copy passed to the trainer alongside it.
model, ref_model = (
    AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name) for _ in range(2)
)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Align the tokenizer's EOS id with the model config, then reuse EOS as the padding token.
tokenizer.eos_token_id = model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Wire the policy, the reference model, the tokenizer and the prompt dataset into the PPO trainer.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [11]:
# Pick the device for the reward classifier.
accelerator = ppo_trainer.accelerator
if accelerator.num_processes == 1:
    # Single-process run: give `pipeline` a plain GPU index or "cpu" (to avoid a `pipeline` bug).
    device = 0 if torch.cuda.is_available() else "cpu"
else:
    device = accelerator.device

# Turkish sentiment classifier whose scores are used as the PPO reward.
sentiment_pipe = pipeline(
    task="sentiment-analysis",
    model="savasy/bert-base-turkish-sentiment-cased",
    device=device,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [12]:
# Sanity-check the classifier on a clearly negative sentence
# (roughly: "a disgraceful incident, I really don't know what to say").
text = "rezalet bir olay gerçekten ne diyeceğimi bilemiyorum"
sentiment_pipe(text, **sent_kwargs)



[[{'label': 'negative', 'score': 3.1516218185424805},
  {'label': 'positive', 'score': -3.5145604610443115}]]

In [13]:
# Sanity-check the classifier on a clearly positive sentence
# (roughly: "you're wonderful, I'm really glad you exist, bro").
text = "harikasın ya iyi ki varsın abi gerçekten"
sentiment_pipe(text, **sent_kwargs)

[[{'label': 'negative', 'score': -2.506866693496704},
  {'label': 'positive', 'score': 2.9172959327697754}]]

In [14]:
# Sampling settings for the before/after comparison generations: sampling is on,
# with EOS used as the padding id.
gen_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

In [15]:
# Bounds handed to the sampler that draws a response length (in new tokens) per query.
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings for the policy during training; `max_new_tokens` is supplied per query below.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# Wrap the dataloader itself (rather than `enumerate(...)`) so tqdm can read its length
# and display a total and an ETA. One loop iteration is one PPO batch.
for batch in tqdm(ppo_trainer.dataloader):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        response = ppo_trainer.generate(query, max_new_tokens=gen_len, **generation_kwargs)
        # Keep only the newly generated tokens. Slicing by the prompt length (instead of
        # taking the last `gen_len` tokens) stays correct when generation stops early on
        # EOS; otherwise trailing prompt tokens would leak into the response.
        response_tensors.append(response.squeeze()[query.shape[-1]:])
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    # Index 1 is the "positive" entry in this classifier's output order; its raw score is the reward.
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
19it [41:56, 132.44s/it]


In [16]:
#### get a batch from the dataset
bs = 16
game_data = dict()
# `with_format` returns a re-formatted view and leaves `dataset` itself untouched.
# The in-place `set_format("pandas")` would also change the object the PPO dataloader
# was built on, so re-running the training cell afterwards would see pandas data.
df_batch = dataset.with_format("pandas")[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()


def _sample_response(lm, query_ids, gen_len):
    """Sample up to `gen_len` new tokens from `lm` for one prompt; return only the new tokens."""
    prompt = torch.tensor(query_ids).unsqueeze(dim=0).to(device)
    generated = lm.generate(prompt, max_new_tokens=gen_len, **gen_kwargs).squeeze()
    # Slice by prompt length rather than `[-gen_len:]`, so an early EOS stop does not
    # pull prompt tokens into the response.
    return generated[prompt.shape[-1]:]


def _positive_scores(queries, responses):
    """Score each query+response text; return the classifier's "positive" score per text."""
    texts = [q + r for q, r in zip(queries, responses)]
    return [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]


response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    # Both models get the same prompt and the same length budget.
    gen_len = output_length_sampler()
    response_tensors_ref.append(_sample_response(ref_model, query_tensors[i], gen_len))
    response_tensors.append(_sample_response(model, query_tensors[i], gen_len))

#### decode responses
game_data["response (before)"] = [tokenizer.decode(r) for r in response_tensors_ref]
game_data["response (after)"] = [tokenizer.decode(r) for r in response_tensors]

#### sentiment analysis of query/response pairs before/after
game_data["rewards (before)"] = _positive_scores(game_data["query"], game_data["response (before)"])
game_data["rewards (after)"] = _positive_scores(game_data["query"], game_data["response (after)"])

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results




Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,Kızların,mevcut yerine harçlık yardımı koyabilir.,"unutulmaz yorumu, muhteşem bir gösteri.\n",-2.566285,2.827657
1,twitter bile 280 karakter,"dinleyip izledi. Fakat, Mille",ile her daim birbirinin far oluyor.,-2.656402,-2.002952
2,sinirlenince bile güzel,leşmek istemiyordu. Bir ara oyun oynuyordum. A...,tanımak için sizin için güzelliğiyle bir albü...,-0.561078,2.929519
3,yürüse bile,"vakit geçirmek isterseniz,",herkes için birebir!,-2.456841,2.343866
4,Bukadar O,Tekrar Erkek Arkadaşları Hedef Göster,kadar mükemmelsiniz. ;) S,-3.690734,4.088912
5,Artık yeter dedirtti!,Medine Belediyesi 'Yemek alımı çok fazla mesl...,"\nİki örnekle, Türkiye Hükümetimize bir üst pe...",0.202897,-0.768727
6,lan geriz,ekalı bozmadı hipnodu olduran hoy prosa.,.) Bu unutulmaz günleri anlatıyor.\n“Hayat boy...,-3.175017,3.067518
7,hepinizin bildiği,"bir masal gibi. geçtiğimiz günlerde, enzim si...",ve her kitabı büyük bir çalışmayla birlikte m...,2.530687,1.526677
8,Sen hayatın boyunca 24 saat Amerik,anın yatıp nereye gün battığından,"ayı, çok seviyorum. Çok",-0.58478,2.743369
9,seni sana anlatsam,.Açsa babam çıkmasın.Gayet hijyen aşkı had safh,......:)) �ler):)) İşte olur. Çok seri!,-0.573035,4.201299


In [17]:
# With no path argument, `to_csv()` returns the CSV text as a string; as the cell's last
# expression it is displayed below rather than written to disk.
# NOTE(review): if the intent was to save the results, pass a file path — confirm.
df_results.to_csv()

',query,response (before),response (after),rewards (before),rewards (after)\n0,Kızların, mevcut yerine harçlık yardımı koyabilir.," unutulmaz yorumu, muhteşem bir gösteri.\n",-2.5662853717803955,2.8276572227478027\n1,twitter bile 280 karakter," dinleyip izledi. Fakat, Mille", ile her daim birbirinin far oluyor.,-2.6564016342163086,-2.0029516220092773\n2,sinirlenince bile güzel,leşmek istemiyordu. Bir ara oyun oynuyordum. Ansızın bacaklarım başıma, tanımak için sizin için güzelliğiyle bir albümde. nasıl müthiş bir müzikal! Çok,-0.561077892780304,2.929518699645996\n3,yürüse bile," vakit geçirmek isterseniz,", herkes için birebir!,-2.456841230392456,2.3438663482666016\n4,Bukadar O, Tekrar Erkek Arkadaşları Hedef Göster, kadar mükemmelsiniz. ;) S,-3.6907341480255127,4.088912010192871\n5,\x91Artık yeter\x92 dedirtti!, Medine Belediyesi \'Yemek alımı çok fazla mesleki yeterlilik ve mesleki yeterlilik Fark,"\nİki örnekle, Türkiye Hükümetimize bir üst perde. Uzun",0.20289736986160278,-0.768727

In [18]:
# Summarise the sentiment rewards of the sampled batch before vs. after PPO tuning.
reward_columns = ["rewards (before)", "rewards (after)"]
reward_frame = df_results[reward_columns]

print("mean:")
display(reward_frame.mean())
print()
print("median:")
display(reward_frame.median())

mean:


rewards (before)   -1.126199
rewards (after)     2.309025
dtype: float64


median:


rewards (before)   -0.655964
rewards (after)     2.878588
dtype: float64

In [19]:
from huggingface_hub import notebook_login, create_repo
# Log in to the Hugging Face Hub from within the notebook; presumably required for the
# `push_to_hub=True` uploads in the final cell — confirm.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Save the tuned model and its tokenizer under one directory name and push both to the Hub.
hub_repo_name = "final-gpt2-tr-positive-sentiment-tweets-final"
model.save_pretrained(hub_repo_name, push_to_hub=True)
tokenizer.save_pretrained(hub_repo_name, push_to_hub=True)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]



('final-gpt2-tr-positive-sentiment-tweets-final/tokenizer_config.json',
 'final-gpt2-tr-positive-sentiment-tweets-final/special_tokens_map.json',
 'final-gpt2-tr-positive-sentiment-tweets-final/vocab.json',
 'final-gpt2-tr-positive-sentiment-tweets-final/merges.txt',
 'final-gpt2-tr-positive-sentiment-tweets-final/added_tokens.json',
 'final-gpt2-tr-positive-sentiment-tweets-final/tokenizer.json')