<a href="https://colab.research.google.com/github/veerashayyagari/llms-in-prod/blob/main/twitter_roberta_base_airline_tweets_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Perform Twitter Airline Data Sentiment Analysis

* Dataset: [Twitter US Airline Sentiment (Kaggle)](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment)
* Reference, NLP for absolute beginners: [Kaggle notebook](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners)
* Model, twitter-roberta-base-sentiment: [Hugging Face](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment)

In [None]:
!pip install datasets transformers accelerate



In [None]:
import pandas as pd

In [None]:
# Load the raw airline-tweet CSV. The path is relative to the notebook's working
# directory, so the file has to be present under sample_data/ before this cell runs.
data = pd.read_csv("sample_data/kaggle_airline_sentiment_tweets.csv")

In [None]:
# Inspect which sentiment classes occur in the raw data (shown as the cell output).
data['airline_sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [None]:
# List the distinct airlines that appear in the 'airline' column.
data['airline'].unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

In [None]:
# Limit the analysis to the tweet text and its sentiment label (keep it simple initially).
# .copy() gives `df` its own data instead of a slice of `data`, so columns added to `df`
# in later cells do not trigger pandas' SettingWithCopyWarning.
df = data[['text', 'airline_sentiment']].copy()
df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [None]:
# Class-balance check: number of tweets per sentiment class.
df['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [None]:
# try filtering out neutral sentiment
# df = df[df['airline_sentiment'] != 'neutral']
# df['airline_sentiment'].value_counts()

In [None]:
# convert sentiment to numerical labels
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
# LabelEncoder numbers the classes in sorted order (negative=0, neutral=1, positive=2).
# Apparently transformers expects the target column to be called 'labels', hence the name.
# assign() returns a new frame instead of writing a column into `df` in place, so this
# cell raises no SettingWithCopyWarning and gives the same result when re-run.
df = df.assign(labels=label_enc.fit_transform(df['airline_sentiment']))

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['labels'] = label_enc.fit_transform(df['airline_sentiment'])


Unnamed: 0,text,airline_sentiment,labels
0,@VirginAmerica What @dhepburn said.,neutral,1
1,@VirginAmerica plus you've added commercials t...,positive,2
2,@VirginAmerica I didn't today... Must mean I n...,neutral,1
3,@VirginAmerica it's really aggressive to blast...,negative,0
4,@VirginAmerica and it's a really big bad thing...,negative,0


In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every piece that starts with '@' and is
    longer than one character becomes '@user'; every piece that starts with
    'http' becomes 'http'. The pieces are then re-joined with single spaces.
    """
    def _mask(piece):
        # A lone '@' is not a mention and is left untouched.
        if len(piece) > 1 and piece.startswith('@'):
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))

In [None]:
# Add the masked tweet text as a new column. assign() builds a new frame rather than
# writing into `df` in place, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run.
df = df.assign(cleaned_tweet=df['text'].apply(preprocess))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweet'] = df['text'].apply(preprocess)


Unnamed: 0,text,airline_sentiment,labels,cleaned_tweet
0,@VirginAmerica What @dhepburn said.,neutral,1,@user What @user said.
1,@VirginAmerica plus you've added commercials t...,positive,2,@user plus you've added commercials to the exp...
2,@VirginAmerica I didn't today... Must mean I n...,neutral,1,@user I didn't today... Must mean I need to ta...
3,@VirginAmerica it's really aggressive to blast...,negative,0,@user it's really aggressive to blast obnoxiou...
4,@VirginAmerica and it's a really big bad thing...,negative,0,@user and it's a really big bad thing about it


In [None]:
# Transformers uses a Dataset object for storing data; build one from the two
# columns used for training (the cleaned tweet text and the integer label).
from datasets import Dataset
ds = Dataset.from_pandas(df[['cleaned_tweet', 'labels']])
ds

Dataset({
    features: ['cleaned_tweet', 'labels'],
    num_rows: 14640
})

In [None]:
# Remove any local `cardiffnlp` directory left over from an earlier run.
# NOTE(review): presumably needed because a later cell saves the model under a path
# that starts with "cardiffnlp/" — confirm.
! rm -rf cardiffnlp

In [None]:
# use twitter-roberta-base-sentiment model for tokenizing
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint, built from the task name.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# load and save the pre-trained model
from transformers import AutoModelForSequenceClassification

# Load the sequence-classification model for the same checkpoint identifier.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Save the loaded model to a local path equal to MODEL.
# NOTE(review): a local directory named like the hub id may be picked up by later
# from_pretrained(MODEL) calls instead of the hub copy — confirm this is intended.
model.save_pretrained(MODEL)

In [None]:
# Sanity check: show how the tokenizer splits one cleaned tweet into tokens.
tokenizer.tokenize(ds['cleaned_tweet'][2])

['@',
 'user',
 'ĠI',
 'Ġdidn',
 "'t",
 'Ġtoday',
 '...',
 'ĠMust',
 'Ġmean',
 'ĠI',
 'Ġneed',
 'Ġto',
 'Ġtake',
 'Ġanother',
 'Ġtrip',
 '!']

In [None]:
# defining a function to tokenize
def tokenize_tweet(x):
  """Run the notebook's tokenizer on the 'cleaned_tweet' field of `x`."""
  tweets = x['cleaned_tweet']
  return tokenizer(tweets)

In [None]:
# Tokenize the whole dataset with tokenize_tweet (applied in batches), then show the
# first row to confirm the tokenizer outputs were added next to the original fields.
tokenized_ds = ds.map(tokenize_tweet, batched=True)
tokenized_ds[0]

Map:   0%|          | 0/14640 [00:00<?, ? examples/s]

{'cleaned_tweet': '@user What @user said.',
 'labels': 1,
 'input_ids': [0, 1039, 12105, 653, 787, 12105, 26, 4, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Split the tokenized dataset into training and validation subsets.
# The result is a DatasetDict with a 'train' and a 'test' entry; the fixed seed
# keeps the split reproducible across runs.
dsd = tokenized_ds.train_test_split(0.25, seed=42) # 75-25 split
dsd

DatasetDict({
    train: Dataset({
        features: ['cleaned_tweet', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10980
    })
    test: Dataset({
        features: ['cleaned_tweet', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3660
    })
})

In [None]:
# define metrics for evaluating how good the model is
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` holds one score per class
            along its last axis; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (plain Python floats).
        Both are 0.0 when there are no samples.

    The metrics are computed directly with NumPy instead of calling
    ``load_metric`` twice on every evaluation: that helper is deprecated and
    re-loads the metric on each call. The F1 value uses the "weighted" average:
    per-class F1, each class weighted by its number of true samples.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    if labels.size == 0:
        # Nothing to score; avoid a division by zero below.
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    weighted_f1_sum = 0.0
    # Classes that never occur in `labels` have zero support and add nothing to the
    # weighted average, so iterating over the true classes is sufficient.
    for cls in np.unique(labels):
        is_pred = predictions == cls
        is_true = labels == cls
        tp = np.count_nonzero(is_pred & is_true)
        fp = np.count_nonzero(is_pred & ~is_true)
        fn = np.count_nonzero(~is_pred & is_true)
        # tp + fn >= 1 for every class taken from `labels`, so the denominator is > 0.
        weighted_f1_sum += (2 * tp / (2 * tp + fp + fn)) * (tp + fn)

    f1 = float(weighted_f1_sum / labels.size)
    return {"accuracy": accuracy, "f1": f1}

In [None]:
# training the model
from transformers import TrainingArguments, Trainer
# Optimisation hyper-parameters consumed by the TrainingArguments cell.
learning_rate = 4e-5
epochs = 6
batch_size = 128  # should be good enough for this GPU

In [None]:
# Training configuration handed to the Trainer.
# NOTE(review): no evaluation schedule is set here, so metrics are presumably only
# computed when trainer.evaluate() is called explicitly — confirm.
training_args = TrainingArguments(
   # directory that receives the saved checkpoints
   output_dir="out",
   learning_rate=learning_rate,
   # learning-rate schedule: warmup over the first 10% of training, then cosine
   warmup_ratio=0.1,
   lr_scheduler_type='cosine',
   per_device_train_batch_size=batch_size,
   # evaluation uses twice the training batch size
   per_device_eval_batch_size=batch_size*2,
   num_train_epochs=epochs,
   weight_decay=0.01,
   # write a checkpoint at the end of every epoch
   save_strategy="epoch",
   # mixed-precision training; NOTE(review): presumably requires a GPU runtime — confirm
   fp16=True,
   # do not report to any experiment-tracking integration
   report_to='none'
)

In [None]:
# Wire the model, training configuration, data splits, tokenizer and metric
# function into a Trainer.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=dsd['train'],
   # the held-out 'test' split doubles as the evaluation set
   eval_dataset=dsd['test'],
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the trailing ';' keeps the return value out of the cell output.
trainer.train();



Step,Training Loss
500,0.034


In [None]:
# Evaluate on the Trainer's eval_dataset (dsd['test']); the resulting metrics dict,
# including the values from compute_metrics, is shown as the cell output.
trainer.evaluate()

{'eval_loss': 0.9233771562576294,
 'eval_accuracy': 0.8546448087431694,
 'eval_f1': 0.8532651309862623,
 'eval_runtime': 6.0028,
 'eval_samples_per_second': 609.713,
 'eval_steps_per_second': 2.499}