# Sentiment Analysis using cardiffnlp/twitter-roberta-base-sentiment-latest

In [1]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig


MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
sentiment_task("Covid cases are increasing fast!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'negative', 'score': 0.7235770225524902}]

In [13]:
sentiment_task("SocialMediaFeed9@DigitalDaily: Nvidia's stock feels the heat as expiry of essential patents looms. Will innovation cool the stock? #IPexpiry #InvestorAlert")

[{'label': 'neutral', 'score': 0.7895824909210205}]

In [15]:
import pandas as pd

In [16]:
data = pd.read_csv("training.csv")

In [17]:
data.head(5)

Unnamed: 0,SocialMediaFeed,NVDA,ING,SAN,PFE,CSCO
0,@PharmaNews: Pfizer faces backlash over possib...,0.0,0.0,0.0,-0.029512,0.0
1,@BusinessReport: A recent study found that mos...,0.0,0.0,0.0,0.0,0.0
2,@HardwareHubs: NVIDIA's contributions to a maj...,0.026125,0.0,0.0,0.0,0.0
3,@HealthWatch: Johnson & Johnson faces lawsuits...,0.0,0.0,0.0,0.0,0.0
4,@IndustryInsider: Magnificent Honary faces pro...,0.0,0.0,0.0,0.0,0.0


In [21]:
sentiments = [sentiment_task(x) for x in data["SocialMediaFeed"]]

In [27]:
for i in range(10):
    print(data["SocialMediaFeed"][i], sentiments[i])

@PharmaNews: Pfizer faces backlash over possible closure of regional office. #PharmaNews #RegionalOffice [{'label': 'negative', 'score': 0.5853261351585388}]
@BusinessReport: A recent study found that most CEOs only read business books. That explains a lot. #CEOReads #BusinessBooks [{'label': 'neutral', 'score': 0.6953707933425903}]
@HardwareHubs: NVIDIA's contributions to a major industry collaboration have given the stock a boost. #IndustryCollaboration #GraphicsChip [{'label': 'positive', 'score': 0.9469926953315735}]
@HealthWatch: Johnson & Johnson faces lawsuits over product safety concerns. #Lawsuits #ProductSafety [{'label': 'negative', 'score': 0.5267253518104553}]
@IndustryInsider: Magnificent Honary faces production delays. #ProductionDelays #IndustryNews [{'label': 'negative', 'score': 0.7787243127822876}]
@SocialMediaRumor: Unverified sources hint at Facebook's possible data breach. #DataBreach [{'label': 'negative', 'score': 0.625958263874054}]
@USFastFoodNews: McDonald's 

# Classification of Tweets

In [93]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

df = pd.DataFrame(data)

In [94]:
# Derive the 'label' column
def get_label(row):
    for col in ['NVDA', 'ING', 'SAN', 'PFE', 'CSCO']:
        if row[col] != 0.0:
            return col
    return 'None'

df['label'] = df.apply(get_label, axis=1)

In [95]:
# Encode labels
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

In [96]:
# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /home/ec2-user/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /home/ec2-user/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/ec2-user/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005b

In [98]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['SocialMediaFeed'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


  0%|          | 0/2 [00:00<?, ?ba/s]

In [99]:
# Convert tokenized dataset to pandas DataFrame
tokenized_df = tokenized_datasets.to_pandas()

# Keep only necessary columns
tokenized_df = tokenized_df[['input_ids', 'attention_mask', 'encoded_label']]

# Rename 'encoded_label' to 'labels' so that the Trainer recognizes it as the label column
tokenized_df = tokenized_df.rename(columns={'encoded_label': 'labels'})

# Split the DataFrame into train and test
train_df, test_df = train_test_split(tokenized_df, test_size=0.2)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [100]:
# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/ec2-user/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers":

In [101]:
# Define training arguments and initialize Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    output_dir='./results',
    overwrite_output_dir=True,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [102]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {'accuracy': (p.predictions.argmax(-1) == p.label_ids).mean()}  # Optional: compute accuracy
)

# Train the model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 945
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 357


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3898,0.643219,0.772152
2,0.5043,0.438393,0.839662
3,0.1952,0.458809,0.852321


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 237
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-119
Configuration saved in ./results/checkpoint-119/config.json
Model weights saved in ./results/checkpoint-119/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 237
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-238
Configuration saved in ./results/checkpoint-238/config.json
Mo

TrainOutput(global_step=357, training_loss=0.5400188894164997, metrics={'train_runtime': 1319.0449, 'train_samples_per_second': 2.149, 'train_steps_per_second': 0.271, 'total_flos': 186486657799680.0, 'train_loss': 0.5400188894164997, 'epoch': 3.0})

In [103]:
# Evaluate the model
results = trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 237
  Batch size = 8


In [104]:
print(results)

{'eval_loss': 0.4588092565536499, 'eval_accuracy': 0.8523206751054853, 'eval_runtime': 24.9199, 'eval_samples_per_second': 9.51, 'eval_steps_per_second': 1.204, 'epoch': 3.0}


In [109]:
def predict(tweet, model, tokenizer, label_encoder):
    # Tokenize the tweet
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=128, padding="max_length")

    # Get model's prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted label ID
    predicted_label_id = torch.argmax(logits, dim=1).item()

    # Decode the label ID to get the company name
    predicted_label = label_encoder.inverse_transform([predicted_label_id])[0]

    return predicted_label

# Example usage
tweet = "SocialMediaFeed15@TechBuzz: Unverified reports suggest can achive their sustainibility goals "
predicted_company = predict(tweet, model, tokenizer, label_encoder)
print(f"The tweet is about: {predicted_company}")


The tweet is about: None
