<a href="https://colab.research.google.com/github/tinumide/AlphaQuest/blob/main/POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers

In [None]:
!pip install GPUtil

In [2]:
import torch
import numpy as np

from datasets import load_dataset
from transformers import (
    AdamW,
    AutoTokenizer, 
    DataCollatorWithPadding,
    get_scheduler,
    GPT2LMHeadModel,
    GPT2Tokenizer, 
)
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [None]:
dataset = load_dataset("deepmind/code_contests")

In [4]:
dataset = dataset.flatten()

In [5]:
dataset['train'].column_names

['name',
 'description',
 'public_tests.input',
 'public_tests.output',
 'private_tests.input',
 'private_tests.output',
 'generated_tests.input',
 'generated_tests.output',
 'source',
 'difficulty',
 'solutions.language',
 'solutions.solution',
 'incorrect_solutions.language',
 'incorrect_solutions.solution',
 'cf_contest_id',
 'cf_index',
 'cf_points',
 'cf_rating',
 'cf_tags',
 'is_description_translated',
 'untranslated_description',
 'time_limit.seconds',
 'time_limit.nanos',
 'memory_limit_bytes',
 'input_file',
 'output_file']

In [6]:
dataset = dataset.remove_columns(['name',
 'public_tests.input',
 'public_tests.output',
 'private_tests.input',
 'private_tests.output',
 'generated_tests.input',
 'generated_tests.output',
 'source',
 'difficulty',
 'solutions.language',
 'incorrect_solutions.language',
 'incorrect_solutions.solution',
 'cf_contest_id',
 'cf_index',
 'cf_points',
 'cf_rating',
 'cf_tags',
 'is_description_translated',
 'untranslated_description',
 'time_limit.seconds',
 'time_limit.nanos',
 'memory_limit_bytes',
 'input_file',
 'output_file'])

In [7]:
dataset = dataset.filter(lambda example: len(example['solutions.solution']) > 0)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>', add_bos_token = True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def tokenize_data(example):
  return tokenizer(example["description"], example["solutions.solution"], max_length=1024, truncation=True)

In [None]:
tokenized_datasets = dataset.map(tokenize_data, batched=True)

In [11]:
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['description', 'solutions.solution', 'input_ids', 'attention_mask']

In [12]:
tokenized_datasets = tokenized_datasets.remove_columns(["description", "solutions.solution"])

In [62]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=1, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["valid"], batch_size=1, collate_fn=data_collator
)

In [63]:
tokenizer.pad_token = tokenizer.eos_token
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([1, 1024]), 'attention_mask': torch.Size([1, 1024])}

In [65]:
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 1024)

In [66]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [69]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [70]:
progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/26268 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [33]:
prompt = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [36]:
sample_outputs = model.generate(prompt, do_sample=True, top_k=50, 
                                max_length=500, top_p=0.95, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [37]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output)))

0: <|startoftext|> <|startoftext|>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
1: <|startoftext|> <|startoftext|>      

Input

The input is as shown in the first sample in Fig. 2.

The input consists of multiple datasets. Each dataset is denoted with an integer between 1 and 20.

Output

The output of the input is as shown in the third sample in Fig. 3.

It is guaranteed that the result is lexicographically less than 20. The output is guaranteed to be lexicographically less than 25.

Example

Input

10


Output

20

Input

5 6


Output

2