In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [65]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel session
# (deprecations, pandas chained-assignment warnings, ...); consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the pretrained multilingual BERT sentiment classifier; the tokenizer and the
# model are both fetched under the same checkpoint name.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Read the labelled tweets from the data folder (path is relative to the notebook's working directory)
input_path = '../data/twitter_sentiment_data.csv'
df = pd.read_csv(input_path)

In [60]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [None]:
!pip install tqdm

In [None]:
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm # this package is for a progress bar when running BERT

In [None]:
# Dataset wrapper that tokenizes one text per lookup; grouping items into batches is
# done by whatever DataLoader consumes this dataset, not by the class itself.
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes a single text on each index lookup.

    Parameters
    ----------
    texts : iterable of str-like
        Texts to encode (list, NumPy array, pandas Series, ...). The values are
        copied into a list in iteration order.
    tokenizer : callable
        Called as ``tokenizer(text, max_length=..., padding='max_length',
        truncation=True, return_tensors='pt')`` and expected to return a mapping
        with ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_length : int, default 512
        Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # Materialize to a plain list so that __getitem__ is always positional.
        # Integer indexing on a pandas Series is label-based, so a Series whose index
        # is not 0..n-1 (e.g. after filtering rows) would raise KeyError or return
        # the wrong row.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    # __len__ and __getitem__ are the two methods a map-style Dataset must provide
    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns a dict with flattened ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        # str() guards against non-string entries (e.g. NaN in a CSV column)
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # flatten() collapses each tensor to 1-D — presumably dropping the leading batch
        # dimension of size 1 that return_tensors='pt' adds — so per-item tensors can be stacked
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [None]:
def predict_sentiments(texts, model, tokenizer, device, batch_size=32, max_length=512):
    """Classify every text and return the winning probability and label for each.

    Parameters
    ----------
    texts : iterable of str-like
        Texts to classify; passed to ``TweetDataset``.
    model : torch.nn.Module
        Classifier called as ``model(input_ids=..., attention_mask=...)`` whose
        output exposes ``.logits``. Predicted class index ``i`` is reported with
        the label stored under key ``i + 1`` of the 1-5 map below.
    tokenizer : callable
        Tokenizer handed to ``TweetDataset``.
    device : torch.device
        Device the model and every batch are moved to.
    batch_size : int, default 32
        Number of texts per forward pass.
    max_length : int, default 512
        Token length passed to ``TweetDataset``, which pads with
        ``padding='max_length'``; a smaller value means less padding per text.

    Returns
    -------
    tuple[list[float], list[str]]
        ``(sentiment_scores, sentiment_categories)`` in input order. Each score is
        the softmax probability of the predicted class (a confidence value, not a
        polarity), and each category is the label of that class.

    Raises
    ------
    KeyError
        If the model predicts a class index outside 0-4.
    """
    model = model.to(device)
    model.eval()

    dataset = TweetDataset(texts, tokenizer, max_length=max_length)
    # shuffle=False keeps the outputs aligned with the order of `texts`
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # category mapping - can be changed
    sentiment_map = {
        1: "Highly Negative",
        2: "Negative",
        3: "Neutral",
        4: "Positive",
        5: "Highly Positive"
    }

    sentiment_scores = []
    sentiment_categories = []

    # inference only: no gradients are needed
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing tweets"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # a single reduction yields both the top probability and its class index
            top = torch.max(probabilities, dim=1)

            # Move each result tensor to the CPU once per batch before converting to
            # Python values; calling .item() per element would force one
            # device-to-host synchronization per text.
            batch_scores = top.values.cpu().tolist()
            batch_classes = top.indices.cpu().tolist()

            sentiment_scores.extend(batch_scores)
            # class indices are 0-based while the label map is keyed 1-5
            sentiment_categories.extend(sentiment_map[class_idx + 1] for class_idx in batch_classes)

    return sentiment_scores, sentiment_categories

In [None]:
# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Classify every tweet message (the slow step of the notebook), then attach the
# model's confidence and predicted label as new columns on df
tweet_messages = df['message']
scores, categories = predict_sentiments(tweet_messages, model, tokenizer, device)
df['sentiment_score'], df['sentiment_category'] = scores, categories

Processing tweets: 100%|██████████| 1374/1374 [05:04<00:00,  4.51it/s]


In [76]:
# Preview the first five rows, now including the two prediction columns
df.head(n=5)

Unnamed: 0,sentiment,message,tweetid,sentiment_score,sentiment_category
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840,0.37214,Negative
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641,0.293222,Highly Negative
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256,0.882682,Highly Positive
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904,0.356755,Highly Positive
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153,0.276978,Positive


In [None]:
# Persist the classified tweets; index=False keeps the row index out of the CSV
output_path = "../data/sentiment_data_classified.csv"  # relative path from /scripts to /data
df.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Split the messages by their original label in the `sentiment` column
# (1 -> positive, -1 -> negative, 2 -> news) and drop the columns the sub-frames
# no longer need.
# drop() is chained without inplace=True so each sub-frame is a fresh DataFrame:
# an in-place drop on a filtered slice is the chained-assignment pattern pandas
# warns about, and `axis=1` is redundant once `columns=` is given.
unused_columns = ['sentiment', 'tweetid']

df_positive = df.loc[df['sentiment'] == 1].drop(columns=unused_columns)
df_negative = df.loc[df['sentiment'] == -1].drop(columns=unused_columns)
df_news = df.loc[df['sentiment'] == 2].drop(columns=unused_columns)