# BERT Sentiment Analysis

To label the news dataset, we used a BERT model called $\textit{bert-base-multilingual-uncased-sentiment}$, which is fine-tuned for sentiment analysis.
Note: we had to run this code on Google Colab, as it required over 8 GB of RAM and no team member's machine had enough computing power.

In [None]:
# Model and data-handling imports
import joblib
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use the GPU when one is available (e.g. on Colab), otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  # predicts 1 to 5 stars
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Move the model weights to the selected device so inputs and model live together.
model.to(device)

# NOTE: joblib.load is pickle-based and can execute arbitrary code while loading;
# only load dataset files that come from a trusted source.
data = joblib.load("data/Data Files/News_dataset.joblib")

python(59817) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [None]:
# Register tqdm's pandas integration so that `progress_apply` is available
# on pandas objects (used below to show a progress bar while labeling).
tqdm.pandas()
def classify_sentiment(text):
    """
    Predict a star rating for a piece of text with the pre-trained model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text (str): Text whose sentiment should be rated.

    Returns:
        int: One-based index of the highest-scoring class
            (e.g., 1 to 5 stars).
    """
    # Tokenize into PyTorch tensors; over-long inputs are truncated
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Put every input tensor on the device used for the run (the Colab GPU when present)
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        # Raw, unnormalized class scores from the model
        logits = model(**model_inputs).logits

        # Normalize the scores into probabilities over the classes
        probabilities = F.softmax(logits, dim=-1)

        # Zero-based index of the most probable class
        best_class = torch.argmax(probabilities).item()

    # Class indices start at 0 while star ratings start at 1
    return best_class + 1

In [None]:
# Rate every article with the model; progress_apply reports progress via tqdm
data['sentiment'] = data['article'].progress_apply(classify_sentiment)

# Work on a copy without the columns that are dropped before aggregating
aggregate = data.drop(
    ['description', 'title', 'Unnamed: 0', 'url', 'source', 'article'],
    axis=1,
)
# Reduce the parsed timestamps to plain calendar dates
aggregate['date'] = pd.to_datetime(aggregate['date']).dt.date

# Average the ratings of each date ("mean sentiment") and save them to a .csv
sentiment_table = aggregate['sentiment'].groupby(aggregate['date']).mean()
sentiment_table.to_csv('data/Data Files/sentiment_data.csv')