In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple

In [2]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [3]:
train_dataset = pd.read_csv('/content/drive/MyDrive/rotten-tomatoes/train.csv').drop('label', axis=1)
val_dataset = pd.read_csv('/content/drive/MyDrive/rotten-tomatoes/validation.csv').drop('label', axis=1)
test_dataset = pd.read_csv('/content/drive/MyDrive/rotten-tomatoes/test.csv').drop('label', axis=1)

In [4]:
#Drop too long texts 
train_dataset = train_dataset[train_dataset['text'].apply(lambda x: len(x.split(' ')) < 350)]
val_dataset = val_dataset[val_dataset['text'].apply(lambda x: len(x.split(' ')) < 350)]
test_dataset = test_dataset[test_dataset['text'].apply(lambda x: len(x.split(' ')) < 350)]

#Drop too short texts 
train_dataset = train_dataset[train_dataset['text'].apply(lambda x: len(x.split(' ')) > 8)]
val_dataset = val_dataset[val_dataset['text'].apply(lambda x: len(x.split(' ')) > 8)]
test_dataset = test_dataset[test_dataset['text'].apply(lambda x: len(x.split(' ')) > 8)]

train_dataset = pd.concat([train_dataset, val_dataset], ignore_index=True)
train_dataset = train_dataset.reset_index(drop=True)

In [5]:
test_dataset['true_end_text'] = test_dataset['text'].str.split().str[5:].apply(' '.join)
test_dataset['text'] = test_dataset['text'].str.split().str[:5].apply(' '.join)

In [6]:
class Reviews(Dataset):  
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.text = []

        for row in train_dataset['text']:
          self.text.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))               
        if truncate:
            self.text = self.text[:20000]
        self.text_count = len(self.text)
        
    def __len__(self):
        return self.text_count

    def __getitem__(self, item):
        return self.text[item]
    
train_dataset = Reviews(train_dataset['text'], truncate=True, gpt2_type="gpt2")   

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [9]:
model = train(train_dataset, model, tokenizer)



Training epoch 0
0


8730it [08:34, 16.95it/s]


Training epoch 1
tensor(0.3419, device='cuda:0', grad_fn=<NllLossBackward0>)


8730it [08:33, 17.01it/s]


Training epoch 2
tensor(0.7196, device='cuda:0', grad_fn=<NllLossBackward0>)


8730it [08:35, 16.95it/s]


Training epoch 3
tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward0>)


8730it [08:33, 17.01it/s]


Training epoch 4
tensor(0.6229, device='cuda:0', grad_fn=<NllLossBackward0>)


8730it [08:32, 17.02it/s]


In [10]:
torch.save(model, '/content/drive/MyDrive/rotten-tomatoes/gpt2-model.pt')

# model.load_state_dict(torch.load('/content/drive/MyDrive/rotten-tomatoes/gpt2-model.pt'))
# model.eval()

In [11]:
test_dataset = test_dataset.reset_index(drop=True)

In [27]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, 
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:
                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

def text_generation(test_dataset):
  generated_text = []
  for i in range(len(test_dataset)):
    x = generate(model.to('cpu'), tokenizer, test_dataset['text'][i], entry_count=1)
    generated_text.append(x)
  return generated_text

generated_text = text_generation(test_dataset[:50])

100%|██████████| 1/1 [00:04<00:00,  4.82s/it]
100%|██████████| 1/1 [00:07<00:00,  7.25s/it]
100%|██████████| 1/1 [00:06<00:00,  6.01s/it]
100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
100%|██████████| 1/1 [00:07<00:00,  7.89s/it]
100%|██████████| 1/1 [00:06<00:00,  6.25s/it]
100%|██████████| 1/1 [00:05<00:00,  5.71s/it]
100%|██████████| 1/1 [00:07<00:00,  7.09s/it]
100%|██████████| 1/1 [00:04<00:00,  4.93s/it]
100%|██████████| 1/1 [00:04<00:00,  4.83s/it]
100%|██████████| 1/1 [00:08<00:00,  8.00s/it]
100%|██████████| 1/1 [00:03<00:00,  3.96s/it]
100%|██████████| 1/1 [00:07<00:00,  7.02s/it]
100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
100%|██████████| 1/1 [00:04<00:00,  4.51s/it]
100%|██████████| 1/1 [00:04<00:00,  4.45s/it]
100%|██████████| 1/1 [00:05<00:00,  5.92s/it]
100%|██████████| 1/1 [00:04<00:00,  4.49s/it]
100%|██████████| 1/1 [00:04<00:00,  4.35s/it]
100%|██████████| 1/1 [00:06<00:00,  6.31s/it]
100%|██████████| 1/1 [00:03<00:00,

In [13]:
import pickle
pickle.dump(generated_text, open("/content/drive/MyDrive/rotten-tomatoes/generated_text.p", "wb" ))

# generated_text = pickle.load(open( "/content/drive/MyDrive/rotten-tomatoes/generated_text.p", "rb" ))

In [32]:
test_dataset2 = test_dataset[:50]

In [33]:
my_generations=[]

for i in range(len(generated_text)):
  a = test_dataset2['text'][i].split() #Get the matching string we want (30 words)
  b = ' '.join(a)
  c = ' '.join(generated_text[i]) #Get all that comes after the matching string
  my_generations.append(c.split(b)[-1])

test_dataset2['generated_text'] = my_generations


#Finish the sentences when there is a point, remove after that
final=[]

for i in range(len(test_dataset2)):
  to_remove = test_dataset2['generated_text'][i].split('.')[-1]
  final.append(test_dataset2['generated_text'][i].replace(to_remove,''))

test_dataset2['generated_text'] = final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset2['generated_text'] = my_generations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset2['generated_text'] = final


In [34]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_dataset2)):
  if len(test_dataset2['true_end_text'][i]) == 0 or len(test_dataset2['generated_text'][i]) == 0:
    # scores.append(0)
    pass
  else:
    reference = test_dataset2['true_end_text'][i]
    candidate = test_dataset2['generated_text'][i]
    scores.append(sentence_bleu(reference, candidate))

print('BLEU: ', statistics.mean(scores))

BLEU:  1.235711239442837e-231


In [35]:
# Sprawdzilem pare wersji, generalnie BLEU zawsze miałem bliskie zeru, przy tej ograniczylem zbior testowy z racji tego ze BLEU nie jest do końca dobra metryka
# Generalnie uważam że jest za mało danych żeby sensownie nauczyć model generacji tekstu. Zbiór danych który wybrałem jest dość zróżnicowany mimo swojej domeny, nie wyraża tylko opinii pozytywnej/negatywnej.

test_dataset2

Unnamed: 0,text,true_end_text,generated_text
0,lovingly photographed in the manner,"of a golden book sprung to life , stuart littl...","that Robert Conquest was proud to show off, w..."
1,"it's like a "" big","chill "" reunion of the baader-meinhof gang , o...",
2,the story gives ample opportunity,"for large-scale action and suspense , which di...",
3,fresnadillo has something serious to,say about the ways in which extravagant chance...,say about that.
4,throws in enough clever and,unexpected twists to make the formula feel fre...,timely twists to shake up the mainstream narr...
5,weighty and ponderous but every,bit as filling as the treat of the title .,dollar worth of cash flowing in and out of th...
6,a real audience-pleaser that will,strike a chord with anyone who's ever waited i...,
7,generates an enormous feeling of,empathy for its characters .,"connection, such that you can never go back a..."
8,exposing the ways we fool,ourselves is one hour photo's real strength .,ourselves into believing that.\n\n1. Grins: A...
9,it's up to you to,decide whether to admire these people's dedica...,come up with a different idea of what 'damagi...
