In [1]:
%%capture
!pip install -U datasets transformers accelerate

In [16]:
!pip install peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0


In [67]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_ckpt = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id  

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# for name, module in model.named_modules():
#     print(name)

In [68]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(r=4, target_modules='transformer\.h\..*\.attn\.c_.*',)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 221,184 || all params: 124,662,528 || trainable%: 0.17742621102629974


In [32]:
from datasets import load_dataset
dataset = load_dataset("stanfordnlp/imdb")

def tokenize(batch):
    tokenized = tokenizer(batch["text"], max_length=256, truncation=True, padding=True)
    return {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}


ds_train = dataset['train'].shuffle().select(range(10000))
ds_test = dataset['test'].shuffle().select(range(2500))

ds_train = ds_train.map(tokenize, batched=True)
ds_test = ds_test.map(tokenize, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [33]:
import numpy as np
from datasets import load_metric
from transformers import TrainingArguments, Trainer

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    predictions, label_ids = eval_pred.predictions, eval_pred.label_ids
    predictions = predictions.argmax(axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=label_ids)
    f1 = f1_metric.compute(predictions=predictions, references=label_ids, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"], 
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [61]:
from transformers import Trainer, TrainingArguments

training_arguments = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-3,
    weight_decay=1e-5,
    logging_strategy="epoch",
    load_best_model_at_end=True,
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    report_to="none"
) 

trainer = Trainer(
    model=model,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    args=training_arguments,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3304,0.248348,0.898,0.897971
2,0.2508,0.225742,0.9088,0.908759
3,0.2137,0.220402,0.908,0.907959
4,0.1894,0.23226,0.9176,0.91759
5,0.1615,0.228126,0.912,0.911999
6,0.1363,0.276388,0.9016,0.901377
7,0.1162,0.272233,0.9112,0.911188
8,0.1008,0.273334,0.9124,0.9124
9,0.0842,0.302037,0.9112,0.91116
10,0.0723,0.311203,0.908,0.907998


TrainOutput(global_step=3130, training_loss=0.16555457556971345, metrics={'train_runtime': 2366.4893, 'train_samples_per_second': 42.257, 'train_steps_per_second': 1.323, 'total_flos': 1.3098811392e+16, 'train_loss': 0.16555457556971345, 'epoch': 10.0})