In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

raw_datasets = load_dataset("ag_news")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [4]:
filtered_datasets = raw_datasets.filter(lambda example: example["label"]==2)
filtered_datasets = filtered_datasets.remove_columns("label")

In [5]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
from transformers import AutoModelForCausalLM

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = (
    tokenizer.eos_token
)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [7]:
def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized_datasets = filtered_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
tokenized_datasets

Map: 100%|███████████████████████████████████████████████████████████████| 1900/1900 [00:00<00:00, 12273.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1900
    })
})

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
samples = [tokenized_datasets["train"][i] for i in range(3)]

for sample in samples:
    print(f"input_ids shape: {len(sample['input_ids'])}")

input_ids shape: 37
input_ids shape: 55
input_ids shape: 51


In [10]:
out = data_collator(samples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([3, 55])
attention_mask shape: torch.Size([3, 55])
labels shape: torch.Size([3, 55])


In [21]:
from transformers import TrainingArguments

model_id = "gpt2_agnews_finetuned"
training_args = TrainingArguments(
    model_id,
    push_to_hub=False,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
)

In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"].select(range(5000)),
    eval_dataset=tokenized_datasets["test"],
    
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
trainer.train()

Step,Training Loss,Validation Loss
200,1.7575,3.987741
400,1.7657,3.911576
600,1.8068,3.874471
800,0.9551,4.428445
1000,1.0157,4.243545
1200,1.1994,4.126361


TrainOutput(global_step=1250, training_loss=1.410815948486328, metrics={'train_runtime': 220.0843, 'train_samples_per_second': 45.437, 'train_steps_per_second': 5.68, 'total_flos': 467451445248000.0, 'train_loss': 1.410815948486328, 'epoch': 2.0})

In [24]:
trainer.save_model()

In [19]:
from transformers import pipeline

pipe = pipeline("text-generation", model="./sft_cml4", device=device)
pipe.tokenizer.pad_token_id = 50256

In [25]:
print(pipe("Q1", pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])

Q1 income boosts ASEAN 10Q income has risen, helped by the recent gain in Asia #39;s second- biggest economy, by 144 million, to A\$112 million US a year ago, helping the country's public sector
