<a href="https://colab.research.google.com/github/suryaR-15/twitter-sentiment-analysis/blob/main/Tweet_Sentiment_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install necessary dependencies for colab environment
! pip install datasets
! pip install torch
! pip install wordcloud
! pip install accelerate -U
! pip install transformers[torch]

In [5]:
# import packages
from datasets import Dataset, load_dataset
import pandas as pd
import torch
torch.cuda.is_available()

False

In [3]:
# Import training data
data = load_dataset('csv', data_files='train.csv', split='train')
data

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment'],
    num_rows: 27481
})

In [6]:
# convert dataset object to pandas dataframe for EDA
df = pd.DataFrame(data)
df = df.dropna()
df.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive


In [8]:
# Prepare data to train the model on selected text / tokenized text
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

if 'textID' in df.columns:
  df = df.drop(columns=['textID', 'text'])
  df = df.rename(columns={'selected_text': 'text', 'sentiment': 'label'})

id2label = {0: "negative", 1: "positive", 2: "neutral"}
label2id = {"negative": 0, "positive": 1, "neutral": 2}

df['label'] = df['label'].replace({'negative': 0, 'positive': 1, 'neutral': 2})

df_train, df_eval = train_test_split(df, test_size=0.2)
display(df_train)

dataset_train = Dataset.from_pandas(df_train)
dataset_eval = Dataset.from_pandas(df_eval)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(data):
   return tokenizer(data["text"], truncation=True)

tokenized_train = dataset_train.map(preprocess_function, batched=True)
tokenized_eval = dataset_eval.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                           num_labels=3, id2label=id2label, label2id=label2id)

Unnamed: 0,text,label
17036,"Is Watching Britains Got Talent, & Is biting h...",2
17679,I`m not happy,0
21916,my n95 hacked no signing needed.. thx OPDA,2
3829,early phonograph industry? done.. just the res...,2
15675,miss,0
...,...,...
17249,and now! ITS #starwarsday!!!! To celebrate i s...,2
22276,World`s Happiest Places,1
25202,kill,0
268,"I waited, listening to wind blowing through th...",2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/21984 [00:00<?, ? examples/s]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# function to compute the metrics accuracy and f1-score

import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels,average='micro')["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [None]:
# log into huggingface with my access token to run training
! pip install huggingface_hub

from huggingface_hub import notebook_login
notebook_login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir='twitter-sentiment-analysis-distilbert',
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_eval,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Step,Training Loss
500,0.3367
1000,0.3254
1500,0.3021
2000,0.2275
2500,0.2365
3000,0.1837
3500,0.1634
4000,0.1638
4500,0.1004
5000,0.0975


TrainOutput(global_step=13740, training_loss=0.10467222878679612, metrics={'train_runtime': 1042.778, 'train_samples_per_second': 210.821, 'train_steps_per_second': 13.176, 'total_flos': 1921701566054880.0, 'train_loss': 0.10467222878679612, 'epoch': 10.0})

In [None]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.8305980563163757,
 'eval_accuracy': 0.8882823871906841,
 'eval_f1': 0.8882823871906841,
 'eval_runtime': 7.2334,
 'eval_samples_per_second': 759.806,
 'eval_steps_per_second': 47.557,
 'epoch': 10.0}

In [None]:
trainer.push_to_hub()

events.out.tfevents.1708511530.28dca892d3fe.480.12:   0%|          | 0.00/457 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suryaR-15/twitter-sentiment-analysis-distilbert/commit/20ec8272037234e78db5c6284584a078c588a7f6', commit_message='End of training', commit_description='', oid='20ec8272037234e78db5c6284584a078c588a7f6', pr_url=None, pr_revision=None, pr_num=None)