Sentiment analysis on social media posts, adapted from https://www.kaggle.com/code/robikscube/sentiment-analysis-python-youtube-tutorial/notebook. Written for the course Data Science for Smart Environments (16-1-2024).

# RoBERTa Model
A pre-trained model that takes into account not only the sentiment of a specific word, but also the context around that word. It is based on Google’s BERT model, released in 2018.


In [None]:
# Mount Google Drive so the datasets stored there can be read (and results written back)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Import packages, model and data

In [5]:
#Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import numpy as np
import nltk
from tqdm.notebook import tqdm

In [6]:
#Load required packages for the roberta model
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [7]:
# Load the tokenizer and the sequence-classification model that belong to the
# checkpoint named by MODEL; both are used as module-level globals by the
# scoring function further down.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [8]:
# Load the data
# Read the preprocessed posts of both platforms and cast the text column to
# str straight away, so every value handed to the tokenizer is a string.
df_facebook = pd.read_csv(
    '/content/drive/MyDrive/SE/Smart_Environments/Data_Facebook/preprocessed_facebook_data.csv'
).astype({'text': str})
df_twitter = pd.read_csv(
    '/content/drive/MyDrive/SE/Smart_Environments/Data_Twitter/preprocessed_twitter_data.csv'
).astype({'text': str})

# Rename username column twitter (currently disabled)
#df_twitter.rename(columns={'username': 'profileId'}, inplace=True)

# Split the Facebook posts into one dataframe per news website
news_source = df_facebook['news_source']
df_volkskrant = df_facebook[news_source == 'volkskrant']
df_nu = df_facebook[news_source == 'NU']
df_nos = df_facebook[news_source == 'NOS']

## Functions used in the sentiment analysis

In [9]:
# Score one text with the RoBERTa model and return the scores as a dictionary
def polarity_scores_roberta(example):
    """Return the RoBERTa sentiment scores for a single text.

    The text is tokenized into PyTorch tensors with the module-level
    ``tokenizer`` and passed through the module-level ``model``. The first
    row of the first model output is converted to a NumPy array and
    normalised with softmax.

    Args:
        example: The text to score.

    Returns:
        A dict mapping ``'roberta_neg'``, ``'roberta_neu'`` and
        ``'roberta_pos'`` to the softmax scores at positions 0, 1 and 2.
    """
    model_inputs = tokenizer(example, return_tensors='pt')
    raw_output = model(**model_inputs)

    # detach() drops the autograd graph so the tensor can be turned into NumPy
    probabilities = softmax(raw_output[0][0].detach().numpy())

    labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
    return {label: probabilities[position] for position, label in enumerate(labels)}

In [10]:
# Run the RoBERTa scoring over every row of a dataframe
def sentiment_roberta(df):
    """Score the ``text`` of every row in ``df`` and collect the results by id.

    Rows are processed one at a time with a progress bar. When scoring a row
    raises ``RuntimeError``, a message with the row's id is printed and the
    row is left out of the result.

    Args:
        df: Dataframe with a ``text`` and an ``id`` column.

    Returns:
        A dict mapping each row's ``id`` to the score dict returned by
        ``polarity_scores_roberta``. A later row with the same id overwrites
        an earlier one.
    """
    scores_by_id = {}
    rows = tqdm(df.iterrows(), total=len(df))
    for _, post in rows:
        try:
            post_text = post['text']
            myid = post['id']
            scores = polarity_scores_roberta(post_text)
        except RuntimeError:
            print(f'Broke for id {myid}')
        else:
            # store a copy so the result does not share the scorer's dict
            scores_by_id[myid] = dict(scores)
    return scores_by_id

In [11]:
# Function weighted sentiment (merge 3 sentiment values)
def weighted_sentiment(df, w_neg=-1, w_neu=0, w_pos=1):
    """Add a ``total_sentiment`` column combining the three RoBERTa scores.

    The total is the weighted sum
    ``w_neg * roberta_neg + w_neu * roberta_neu + w_pos * roberta_pos``,
    computed for all rows in a single vectorized expression.

    Args:
        df: Dataframe with the columns ``roberta_neg``, ``roberta_neu`` and
            ``roberta_pos``. It is modified in place.
        w_neg: Weight of the negative score (default -1).
        w_neu: Weight of the neutral score (default 0).
        w_pos: Weight of the positive score (default 1).

    Returns:
        The same dataframe object, with ``total_sentiment`` added.

    Raises:
        KeyError: If ``df`` has rows but lacks one of the score columns.
    """
    score_columns = ('roberta_neg', 'roberta_neu', 'roberta_pos')

    # A frame without any rows and without score columns (e.g. when no post
    # could be scored) is handed back untouched instead of raising.
    if len(df) == 0 and not all(col in df.columns for col in score_columns):
        return df

    # One vectorized pass; no per-row loop is needed for column arithmetic.
    df['total_sentiment'] = (w_neg * df['roberta_neg']
                             + w_neu * df['roberta_neu']
                             + w_pos * df['roberta_pos'])
    return df

In [12]:
# Function to combine previous functions and give output in df
def results_sentiment_roberta(df):
    """Score every post in ``df`` and return the scores joined with its columns.

    Steps: run ``sentiment_roberta`` on ``df``, turn the resulting
    ``{id: scores}`` dict into a dataframe with one row per id, add the
    ``total_sentiment`` column via ``weighted_sentiment``, and left-join the
    original columns of ``df`` on ``id``.

    Args:
        df: Dataframe with an ``id`` and a ``text`` column.

    Returns:
        Dataframe with ``id``, the three RoBERTa score columns,
        ``total_sentiment`` and the remaining columns of ``df``. Only ids
        present in the scoring result appear.
    """
    # run function for sentiment analysis
    res = sentiment_roberta(df)

    # transform res to dataframe: ids are the dict keys, so after transposing
    # they form the index; move them back into a regular 'id' column
    res_df = pd.DataFrame(res).T.reset_index().rename(columns={'index': 'id'})

    # run function total sentiment
    results_df = weighted_sentiment(res_df)

    # merge with the initial df; the key is named explicitly so the join does
    # not silently switch to other columns the two frames happen to share
    return results_df.merge(df, how='left', on='id')

## Call functions

In [None]:
# Call the sentiment analysis as a function for each data source.
# Only the Twitter run is active. The three Facebook runs (one per news
# source) are commented out; uncomment a line to create that result frame.
# NOTE(review): the export cell refers to results_volkskrant, results_nu and
# results_nos, which exist only when the matching line here has been run.

results_twitter = results_sentiment_roberta(df_twitter)
#results_volkskrant = results_sentiment_roberta(df_volkskrant)
#results_nu = results_sentiment_roberta(df_nu)
#results_nos = results_sentiment_roberta(df_nos)

In [None]:
# Export output
# Make base paths (single leading slash, matching the Drive mount point)
base_path = '/content/drive/MyDrive/SE/Smart_Environments/'
fac_path = base_path + 'Data_Facebook/Sentiment_Analysis/'

# Destination file per result variable
exports = {
    'results_twitter': base_path + 'Sentiment_Twitter.csv',
    'results_volkskrant': fac_path + 'Sentiment_volkskrant.csv',
    'results_nu': fac_path + 'Sentiment_nu.csv',
    'results_nos': fac_path + 'Sentiment_nos.csv',
}

# Write output to .csv
# Only results that were actually computed in this session are written, so
# the cell does not fail with NameError when some analysis calls were skipped.
for var_name, csv_path in exports.items():
    result = globals().get(var_name)
    if result is None:
        print(f'Skipping {var_name}: no result available')
        continue
    result.to_csv(csv_path, index=False)