# Imports

In [14]:
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
import numpy as np
import ipywidgets.widgets as widgets
from IPython.display import display
import torch

# HuggingFace Basics
Following tutorial: https://huggingface.co/blog/sentiment-analysis-python
<br>
Pre-trained Sentiment Analysis Models

In [2]:
# Using basic built-in sentiment analysis pipeline
sentiment_pipeline = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
data = ["I love you", "I hate you", "I don't know how I feel about you", "I am neutral about you"]
sentiment_pipeline(data)

[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079},
 {'label': 'NEGATIVE', 'score': 0.9944603443145752},
 {'label': 'NEGATIVE', 'score': 0.9936702251434326}]

In [3]:
# Specifying model to be used
specific_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")
specific_model(data)

[{'label': 'POS', 'score': 0.9916695356369019},
 {'label': 'NEG', 'score': 0.9806600213050842},
 {'label': 'NEG', 'score': 0.604575514793396},
 {'label': 'NEU', 'score': 0.9232634902000427}]

# Building a custom Sentiment Analysis Model

In [4]:
# Load data
imdb = load_dataset("imdb")
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(3000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(3000))

In [5]:
# Preprocess data with DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

In [6]:
# Use data collator to convert training samples to PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Define DistilBERT as base model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define metrics to evaluate model
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Log in to HuggingFace account to manage model repos
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Define training arguments
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
# Define trainer
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [29]:
# Train model
trainer.train()

  0%|          | 0/376 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 15.02 GB, other allocations: 3.10 GB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).