In [1]:
%%time
%%capture

# Install required packages
!pip install pyarrow==6.0.0
# !pip install transformers
# !pip install -U sentence-transformers
# !pip install datasets

# !pip install fairseq

CPU times: user 277 ms, sys: 90.8 ms, total: 368 ms
Wall time: 49.4 s


In [1]:
!pip install pyarrow==6.0.0

import pyarrow
pyarrow.__version__

Collecting pyarrow==6.0.0
  Downloading pyarrow-6.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.5 MB)
[K     |████████████████████████████████| 25.5 MB 639 kB/s eta 0:00:01
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 1.0.1
    Uninstalling pyarrow-1.0.1:
      Successfully uninstalled pyarrow-1.0.1
Successfully installed pyarrow-6.0.0


'1.0.1'

In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from tqdm import tqdm

from datasets import load_dataset

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
To use `datasets`, the module `pyarrow>=6.0.0` is required, and the current version of `pyarrow` doesn't match this condition.
If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`.

In [None]:
!nvidia-smi

In [7]:
torch.manual_seed(42)

<torch._C.Generator at 0x7fe43c145570>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Embedding(50259, 1024)

### Configurations

In [9]:
DATA_PATH = '../input/netflix-shows/netflix_titles.csv'
DATA_HEADER = 'description'

OUTPUT_DIR = './results'
LOGGING_DIR = './logs'

EPOCHS = 4

LOGGING_STEPS = 100
SAVE_STEPS = 1000

TRAIN_BATCH_SIZE = 16 
EVAL_BATCH_SIZE = 16

WARMUP_STEPS = 10

WEIGHT_DECAY = 0.05

REPORT_TO = 'none'

In [10]:
descriptions = pd.read_csv(DATA_PATH)[DATA_HEADER]

In [11]:
descriptions

0       As her father nears the end of his life, filmm...
1       After crossing paths at a party, a Cape Town t...
2       To protect his family from a powerful drug lor...
3       Feuds, flirtations and toilet talk go down amo...
4       In a city of coaching centers known to train I...
                              ...                        
8802    A political cartoonist, a crime reporter and a...
8803    While living alone in a spooky town, a young g...
8804    Looking to survive in a world taken over by zo...
8805    Dragged from civilian life, a former superhero...
8806    A scrappy but poor boy worms his way into a ty...
Name: description, Length: 8807, dtype: object

In [12]:
max_length = max([len(tokenizer.encode(description)) for description in descriptions])

In [13]:
class TrainDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [14]:
dataset = TrainDataset(descriptions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [15]:
# for garbage collection

import gc
gc.collect()

22

In [16]:
torch.cuda.empty_cache()

In [17]:
training_args = TrainingArguments(output_dir=OUTPUT_DIR, num_train_epochs=EPOCHS, logging_steps=LOGGING_STEPS, 
                                  save_steps=SAVE_STEPS, per_device_train_batch_size=TRAIN_BATCH_SIZE, 
                                  per_device_eval_batch_size=EVAL_BATCH_SIZE, warmup_steps=WARMUP_STEPS, 
                                  weight_decay=WEIGHT_DECAY, logging_dir=LOGGING_DIR, report_to = REPORT_TO)


In [18]:
model_trainer = Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})
model_trainer.train()

Step,Training Loss
100,4.7043
200,1.7465
300,1.7273
400,1.7106
500,1.6896
600,1.5112
700,1.5079
800,1.503
900,1.4959
1000,1.4701


TrainOutput(global_step=1984, training_loss=1.6095008850097656, metrics={'train_runtime': 980.9395, 'train_samples_per_second': 2.023, 'total_flos': 4184768857079808.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 56925, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 615231, 'train_mem_gpu_alloc_delta': 4264523264, 'train_mem_cpu_peaked_delta': 413006286, 'train_mem_gpu_peaked_delta': 4275549184})

In [19]:
model_trainer.save_model("/final_model.bin")

### GPT Generated Description

In [20]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [None]:
# fetched_model = AutoModelForSequenceClassification.from_pretrained("/final_model.bin")

In [21]:
all_samples = []

In [22]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=150, top_p=0.95, temperature=1.9, num_return_sequences=100)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [25]:
all_samples = []
for i in tqdm(range(10)):
    sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=150, top_p=0.95, temperature=1.9, num_return_sequences=100)
    all_samples.append(sample_outputs)

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [00:04<00:42,  4.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [00:10<00:43,  5.42s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [00:16<00:40,  5.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [00:21<00:32,  5.41s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [00:27<00:27,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [00:33<00:22,  5.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [00:41<00:19,  6.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 8/10 [00:46<00:12,  6.01s/it]Setting `pad_token

In [29]:
all_samples_list = []
for s in all_samples:
  all_samples_list.extend(s)
len(all_samples_list)  

1000

In [30]:
sample_decode_output = [tokenizer.decode(sample_output, skip_special_tokens=True) for sample_output in all_samples_list]

In [32]:
df = pd.DataFrame(columns=['sentence']) 
df['sentence'] = sample_decode_output
df

Unnamed: 0,sentence
0,??????? The daughter of a prominent oil and ga...
1, Inspired by Thai myths Fierce martial-arts...
2, 18000: The biggest busts in sport entertain...
3,‌Funny Money Radio”” host Jack Dohring joins ...
4,"Â Áne enfants d'esprit gros le monde, françai..."
...,...
995,"________""Killer of ninjas: Ig Igl and brother..."
996,イ members Aijara and Shuso‼ll have fun making...
997,ʒ’ʌ‍ ʑ͜ ̶͡‵͟༽͜ل͢°) on the verge of death str...
998,Xyla seeks to unite the lives of three gene...


In [33]:
df.to_csv('generated.csv', index=False)

### Original Description (Random)

In [None]:
pd.options.display.max_colwidth = 1000
descriptions.sample(10)