In [None]:
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [None]:
# Class index <-> label name mappings; both are handed to the model when it is
# loaded so predictions can be reported by name.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    Relies on the notebook-level ``tokenizer`` and enables truncation.
    The tokenizer's output is returned unchanged, which is the shape
    ``Dataset.map(..., batched=True)`` is called with further down.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Turn raw model scores into an accuracy report.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The highest-scoring
    entry along axis 1 is taken as the predicted class and compared with the
    labels using the notebook-level ``accuracy`` metric, whose result is
    returned as-is.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)
    return result

In [None]:
# Tokenizer first: both the dataset mapping and the collator are built from it.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load IMDB and tokenize it batch-wise with preprocess_function.
imdb = load_dataset("imdb")
tokenized_imdb = imdb.map(preprocess_function, batched=True)

# Metric object looked up by compute_metrics at evaluation time.
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Load DistilBERT for two-class sequence classification and attach the label
# mappings. The recorded warning below shows that the classifier weights are
# newly initialized, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyperparameters for fine-tuning. Checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and is rejected by newer
    # Transformers releases, which matters because the install is unpinned.
    eval_strategy="epoch",  # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Wire model, data, collator and metric function together.
# The IMDB "test" split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    # NOTE(review): newer Transformers releases prefer `processing_class=`
    # over `tokenizer=`; kept as-is for the version this notebook was run
    # with -- confirm before upgrading.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2313,0.210296,0.91836
2,0.154,0.245181,0.92916


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2313,0.210296,0.91836
2,0.154,0.245181,0.92916
3,0.0921,0.284398,0.93164


TrainOutput(global_step=4689, training_loss=0.1697925076247736, metrics={'train_runtime': 4764.0807, 'train_samples_per_second': 15.743, 'train_steps_per_second': 0.984, 'total_flos': 9834539051060448.0, 'train_loss': 0.1697925076247736, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset given to the Trainer and display the metrics dict.
trainer.evaluate()

{'eval_loss': 0.284397691488266,
 'eval_accuracy': 0.93164,
 'eval_runtime': 404.3675,
 'eval_samples_per_second': 61.825,
 'eval_steps_per_second': 3.865,
 'epoch': 3.0}

In [None]:
text = "Honestly, don't know how to feel about it"

# Build the final checkpoint directory from the finished training run instead
# of hard-coding the step count, which changes with dataset size, batch size
# and number of epochs. Like the original hard-coded path, this assumes the
# Trainer wrote a checkpoint at its last global step.
checkpoint_dir = f"{training_args.output_dir}/checkpoint-{trainer.state.global_step}"

# Reload tokenizer and model from disk to check that the saved checkpoint is
# usable on its own. Truncation mirrors the preprocessing used for training so
# that over-long inputs do not exceed the model's maximum sequence length.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
inputs = tokenizer(text, truncation=True, return_tensors="pt")

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
with torch.no_grad():  # inference only, no gradients needed
    logits = model(**inputs).logits

# Reduce over the class dimension rather than the flattened tensor, so the
# index is a class id regardless of how the logits are laid out.
# `.item()` still requires a single input text.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

'NEGATIVE'

In [None]:
# Persist the fine-tuned weights/config and the tokenizer files to two
# separate directories relative to the current working directory.
# The tokenizer call is last, so its tuple of written paths is the cell output.
model.save_pretrained(save_directory="./sentiment-model")
tokenizer.save_pretrained(save_directory="./sentiment-tokenizer")

('./sentiment-tokenizer/tokenizer_config.json',
 './sentiment-tokenizer/special_tokens_map.json',
 './sentiment-tokenizer/vocab.txt',
 './sentiment-tokenizer/added_tokens.json',
 './sentiment-tokenizer/tokenizer.json')

In [None]:
# Bundle the saved model and tokenizer directories into zip archives.
# NOTE(review): the save cell writes to paths relative to the working
# directory, while these commands read absolute /content/... paths, so this
# cell only finds the files when the working directory is /content -- confirm
# before running outside that environment.
# NOTE(review): `zip -r` adds to or updates an existing archive rather than
# recreating it (the recorded output shows "updating:" for the model archive);
# delete old archives first if stale entries would be a problem.
!zip -r /content/sentiment_model.zip /content/sentiment-model/
!zip -r /content/sentiment_tokenizer.zip /content/sentiment-tokenizer/

updating: content/sentiment-model/ (stored 0%)
updating: content/sentiment-model/model.safetensors (deflated 8%)
updating: content/sentiment-model/config.json (deflated 45%)
  adding: content/sentiment-tokenizer/ (stored 0%)
  adding: content/sentiment-tokenizer/special_tokens_map.json (deflated 42%)
  adding: content/sentiment-tokenizer/tokenizer_config.json (deflated 76%)
  adding: content/sentiment-tokenizer/tokenizer.json (deflated 71%)
  adding: content/sentiment-tokenizer/vocab.txt (deflated 53%)
