In [1]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#import and process data

In [2]:
from datasets import load_dataset

dataset = load_dataset("glue","sst2", split="train[:1200]")
dataset[:11]



{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature ',
  'remains utterly satisfied to remain the same throughout ',
  'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
  "that 's far too tragic to merit such superficial treatment ",
  'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
  'of saucy ',
  "a depressed fifteen-year-old 's suicidal poetry ",
  "are more deeply thought through than in most ` right-thinking ' films ",
  'goes to absurd lengths '],
 'label': [0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0],
 'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
  return tokenizer(examples["sentence"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [4]:
tokenized_datasets = tokenized_datasets.remove_columns(["idx","sentence"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1200
})

# split the tokenized data

In [5]:
train_dataset = tokenized_datasets.shuffle(seed=42).select(range(0,960))
eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(960,1200))



#Initialize the model

In [6]:
vocab_size = tokenizer.vocab_size

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("sshleifer/tiny-gpt2", num_labels=2, vocab_size= vocab_size, pad_token_id=tokenizer.eos_token_id)

Some weights of the model checkpoint at sshleifer/tiny-gpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Metrics

In [8]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

  after removing the cwd from sys.path.


In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", num_train_epochs=50, per_device_train_batch_size=64, per_device_eval_batch_size=64, evaluation_strategy="epoch",logging_dir="test_trainer")

### Trainer

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
import torch
torch.cuda.empty_cache()


In [13]:
trainer.train()

***** Running training *****
  Num examples = 960
  Num Epochs = 50
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.694724,0.470833
2,No log,0.694288,0.483333
3,No log,0.694154,0.483333
4,No log,0.693891,0.483333
5,No log,0.693313,0.504167
6,No log,0.692679,0.520833
7,No log,0.692464,0.525
8,No log,0.692246,0.529167
9,No log,0.692173,0.520833
10,No log,0.692121,0.520833


***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evaluation *****
  Num examples = 240
  Batch size = 64
***** Running Evalua

TrainOutput(global_step=750, training_loss=0.6914107259114584, metrics={'train_runtime': 162.8357, 'train_samples_per_second': 294.776, 'train_steps_per_second': 4.606, 'total_flos': 46006272000.0, 'train_loss': 0.6914107259114584, 'epoch': 50.0})