<a href="https://colab.research.google.com/github/toddpglidden/toddpglidden/blob/main/fine_tuning_distilbert_airline_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/blog/sentiment-analysis-python

In [None]:
# Check device
import torch
torch.cuda.is_available()

In [1]:
! pip install datasets
! pip install evaluate

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [3]:
# Bring in data
ds = load_dataset("osanseviero/twitter-airline-sentiment")
ds = ds['train']

README.md:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

Tweets.csv:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14640 [00:00<?, ? examples/s]

In [4]:
ds

Dataset({
    features: ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'],
    num_rows: 14640
})

In [5]:
# Just keep the text and the label (airline sentiment) columns
ds= ds.select_columns(['text', 'airline_sentiment'])
ds = ds.rename_column('airline_sentiment', 'label')

In [6]:
ds

Dataset({
    features: ['text', 'label'],
    num_rows: 14640
})

In [7]:
# Create train and test sets
ds = ds.train_test_split(test_size= 0.2)

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11712
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2928
    })
})

In [9]:
train = ds['train']
test = ds['test']

In [10]:
train, test

(Dataset({
     features: ['text', 'label'],
     num_rows: 11712
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 2928
 }))

In [11]:
# Get the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [12]:
# Preprocess the data
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)

tokenized_train = train.map(preprocess_function, batched=True)
tokenized_test = test.map(preprocess_function, batched=True)

Map:   0%|          | 0/11712 [00:00<?, ? examples/s]

Map:   0%|          | 0/2928 [00:00<?, ? examples/s]

In [13]:
# Create collator (Puts data into tensors and adds padding)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Get pre-trained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Evaluation function
import numpy as np
from evaluate import load

def compute_metrics(eval_pred):
    accuracy_metric = load("accuracy")
    f1_metric = load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": accuracy, "f1": f1}

In [16]:
# Log into HF account
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# Set up trainer
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-distilbert-airline_tweets"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset= train,
   eval_dataset= test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [18]:
# Train the model
trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [None]:
# Evaluation
trainer.evaluate()

In [None]:
# Put the fine-tuned model on the hub
trainer.push_to_hub()

In [None]:
# Get some new reviews
rev1 = imdb['unsupervised'][4782]

In [None]:
rev1

In [None]:
rev2 = imdb['unsupervised'][3748]

In [None]:
rev2

In [None]:
# Feed new reviews to the model
from transformers import pipeline

sentiment_model = pipeline(model="toddglidden/finetuning-sentiment-model-3000-samples")
sentiment_model([rev1['text'], rev2['text']])