# A tutorial on how to build Tweet Sentiment Analyser

## Installing Libraries

In [None]:
# Install torch, transformers and the datasets library
!pip install torch "transformers[sentencepiece]" datasets --quiet

In [2]:
# Install the DVCLive Libraries
!pip install dvc dvclive --quiet

## Download and explore the dataset

In [3]:
# Import the datasets library
from datasets import load_dataset

In [4]:
# Download the dataset
# We will use zeroshot/twitter-financial-news-sentiment
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

Found cached dataset csv (/Users/pupa/.cache/huggingface/datasets/zeroshot___csv/zeroshot--twitter-financial-news-sentiment-ccca0f3c622c5b67/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [6]:
# Explore the dataset as a pandas dataframe
dataset["train"].to_pandas()

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0
...,...,...
9538,The Week's Gainers and Losers on the Stoxx Eur...,2
9539,Tupperware Brands among consumer gainers; Unil...,2
9540,vTv Therapeutics leads healthcare gainers; Myo...,2
9541,"WORK, XPO, PYX and AMKR among after hour movers",2


## Download a pre-trained model from Huggingface

In [7]:
# Import the pytorch and transformers library
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

# Import the DVCLiveCallback for Huggingface
from dvclive.huggingface import DVCLiveCallback

In [8]:
# Set PyTorch device
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [9]:
# Download the tokenizer and the model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, ignore_mismatched_sizes=True)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Fine-tune the model on the dataset

In [10]:
# Prepare the dataset for training
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [11]:
# Import the trainer library
from transformers import Trainer, TrainingArguments

In [12]:
# Make the initial layers of the model untrainable
for param in model.base_model.parameters():
    param.requires_grad = False

In [13]:
# Set the training arguments and only train the last layer
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log & save weights each logging_steps
    use_mps_device=True              # use Apple's Metal Performance Shaders
)

In [14]:
# Define the trainer
trainer = Trainer(
    model=model,                                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,                               # training arguments, defined above
    train_dataset=tokenized_dataset["train"],         # training dataset
    eval_dataset=tokenized_dataset["validation"],     # evaluation dataset
    data_collator=data_collator,                      # data collator
    tokenizer=tokenizer,                              # tokenizer
    callbacks=[DVCLiveCallback(save_dvc_exp=True)],   # DVC callback
)

In [15]:
# Train the model
trainer.train()



  0%|          | 0/897 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.1985, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 1.1863, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.07}
{'loss': 1.1725, 'learning_rate': 3e-06, 'epoch': 0.1}
{'loss': 1.1639, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.13}
{'loss': 1.1571, 'learning_rate': 5e-06, 'epoch': 0.17}
{'loss': 1.1254, 'learning_rate': 6e-06, 'epoch': 0.2}
{'loss': 1.0804, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}
{'loss': 1.0412, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.27}
{'loss': 1.0049, 'learning_rate': 9e-06, 'epoch': 0.3}
{'loss': 0.9778, 'learning_rate': 1e-05, 'epoch': 0.33}
{'loss': 0.9651, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.37}
{'loss': 0.9437, 'learning_rate': 1.2e-05, 'epoch': 0.4}
{'loss': 0.9115, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.43}
{'loss': 0.8695, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.47}
{'loss': 0.8523, 'learning_rate': 1.5e-05, 'epoch': 0.5}
{'loss': 0.797, 'learning_ra

	experiment-tracking-with-dvc.ipynb, experiment-tracking-old.ipynb, results/checkpoint-500/rng_state.pth, results/checkpoint-500/tokenizer_config.json, results/checkpoint-500/special_tokens_map.json, results/checkpoint-500/optimizer.pt, results/checkpoint-500/config.json, results/checkpoint-500/scheduler.pt, results/checkpoint-500/tokenizer.json, results/checkpoint-500/training_args.bin, results/checkpoint-500/vocab.txt, results/checkpoint-500/pytorch_model.bin, results/checkpoint-500/trainer_state.json
	experiment-tracking-with-dvc.ipynb, experiment-tracking-old.ipynb, results/checkpoint-500/rng_state.pth, results/checkpoint-500/tokenizer_config.json, results/checkpoint-500/special_tokens_map.json, results/checkpoint-500/optimizer.pt, results/checkpoint-500/config.json, results/checkpoint-500/scheduler.pt, results/checkpoint-500/tokenizer.json, results/checkpoint-500/training_args.bin, results/checkpoint-500/vocab.txt, results/checkpoint-500/pytorch_model.bin, results/checkpoint-500/t

TrainOutput(global_step=897, training_loss=0.736845396035491, metrics={'train_runtime': 167.5821, 'train_samples_per_second': 170.836, 'train_steps_per_second': 5.353, 'train_loss': 0.736845396035491, 'epoch': 3.0})

In [16]:
# Test on user sample
sample = "BITCOIN is going to the moon"
inputs = tokenizer(sample, return_tensors="pt")

# Get the prediction
print(model(**inputs.to(device))[0])

tensor([[-1.9960,  0.0771,  2.4874]], device='mps:0',
       grad_fn=<LinearBackward0>)


In [17]:
# This means that the model is predicting positive sentiment for this sample

## Changing the model

In [18]:
# Download the tokenizer and the model
checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, ignore_mismatched_sizes=True)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
# Define the new trainer
trainer = Trainer(
    model=model,                                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,                               # training arguments, defined above
    train_dataset=tokenized_dataset["train"],         # training dataset
    eval_dataset=tokenized_dataset["validation"],     # evaluation dataset
    data_collator=data_collator,                      # data collator
    tokenizer=tokenizer,                              # tokenizer
    callbacks=[DVCLiveCallback(save_dvc_exp=True)],   # DVC callback
)

In [20]:
# Train the model
trainer.train()



  0%|          | 0/897 [00:00<?, ?it/s]

{'loss': 1.3807, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 1.3822, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.07}
{'loss': 1.2508, 'learning_rate': 3e-06, 'epoch': 0.1}
{'loss': 1.0582, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.13}
{'loss': 0.8108, 'learning_rate': 5e-06, 'epoch': 0.17}
{'loss': 0.8294, 'learning_rate': 6e-06, 'epoch': 0.2}
{'loss': 0.7439, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}
{'loss': 0.6839, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.27}
{'loss': 0.6834, 'learning_rate': 9e-06, 'epoch': 0.3}
{'loss': 0.6931, 'learning_rate': 1e-05, 'epoch': 0.33}
{'loss': 0.6936, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.37}
{'loss': 0.7056, 'learning_rate': 1.2e-05, 'epoch': 0.4}
{'loss': 0.6868, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.43}
{'loss': 0.6758, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.47}
{'loss': 0.6426, 'learning_rate': 1.5e-05, 'epoch': 0.5}
{'loss': 0.5865, 'learning_r

	experiment-tracking-with-dvc.ipynb, experiment-tracking-old.ipynb, results/checkpoint-500/rng_state.pth, results/checkpoint-500/tokenizer_config.json, results/checkpoint-500/special_tokens_map.json, results/checkpoint-500/optimizer.pt, results/checkpoint-500/config.json, results/checkpoint-500/scheduler.pt, results/checkpoint-500/tokenizer.json, results/checkpoint-500/training_args.bin, results/checkpoint-500/vocab.txt, results/checkpoint-500/pytorch_model.bin, results/checkpoint-500/trainer_state.json
	experiment-tracking-with-dvc.ipynb, experiment-tracking-old.ipynb, results/checkpoint-500/rng_state.pth, results/checkpoint-500/tokenizer_config.json, results/checkpoint-500/special_tokens_map.json, results/checkpoint-500/optimizer.pt, results/checkpoint-500/config.json, results/checkpoint-500/scheduler.pt, results/checkpoint-500/tokenizer.json, results/checkpoint-500/training_args.bin, results/checkpoint-500/vocab.txt, results/checkpoint-500/pytorch_model.bin, results/checkpoint-500/t

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TrainOutput(global_step=897, training_loss=0.4050648745352874, metrics={'train_runtime': 1186.6768, 'train_samples_per_second': 24.125, 'train_steps_per_second': 0.756, 'train_loss': 0.4050648745352874, 'epoch': 3.0})