## Imports and Data Sources

In [1]:
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Load the headlines keyed by date and drop the 'Unnamed: 0' and 'Label'
# columns in one chained expression (no in-place mutation).
news_df = (
    pd.read_csv('data/news.csv')
    .set_index('Date')
    .drop(columns=['Unnamed: 0', 'Label'])
)

In [3]:
# Checkpoint name shared by the tokenizer and the classifier so that both are
# loaded from the same pretrained weights.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [4]:
df = news_df
chunk_size = 513
# Ceiling division: a trailing partial chunk still counts as one chunk.
num_chunks = -(-len(df) // chunk_size)

# Consecutive positional slices of at most `chunk_size` rows each; a slice that
# runs past the end of the frame is simply shorter.
chunks = [
    df.iloc[start:start + chunk_size]
    for start in range(0, len(df), chunk_size)
]

In [5]:
# Break into chunks for debugging
# Bind an independent copy of every chunk to a module-level name df1, df2, ...
# Iterating over `chunks` itself (instead of a hard-coded count) avoids an
# IndexError when the data yields fewer chunks and avoids silently ignoring
# extra chunks when it yields more.
# NOTE(review): the cells below reference df1..df8 explicitly, so they still
# assume the data splits into exactly eight chunks.
for i, part in enumerate(chunks):
    globals()[f"df{i+1}"] = part.copy()

In [39]:
# Average a list of score dicts into a single dict of per-key means
def aggregate_dicts_to_mean(dict_list, verbose = False):
    """Return the per-key arithmetic mean over a list of dicts.

    Args:
        dict_list: Iterable of dicts mapping a key to a numeric value.
        verbose: When true, print the resulting dict of means.

    Returns:
        A dict mapping every key seen in ``dict_list`` to the mean of the
        values recorded for that key. A key missing from some dicts is
        averaged over the dicts that contain it. An empty ``dict_list``
        yields an empty dict.
    """
    sums = {}
    counts = {}

    # Accumulate by key as encountered, so neither an empty list nor a key
    # that is absent from the first dict raises.
    for d in dict_list:
        for key, value in d.items():
            sums[key] = sums.get(key, 0.0) + value
            counts[key] = counts.get(key, 0) + 1

    means = {key: sums[key] / counts[key] for key in sums}
    if verbose:
        print(f"{means}")
    return means

In [40]:
# Get sentiment score from headline
def polarity_scores_roberta(example, max_length=512):
    """Score one piece of text with the module-level RoBERTa classifier.

    Args:
        example: Text to score; passed straight to the module-level
            ``tokenizer``.
        max_length: Upper bound on the number of tokens fed to the model.
            Longer inputs are truncated instead of failing inside the model.
            NOTE(review): 512 is assumed to be the checkpoint's limit —
            confirm against ``model.config.max_position_embeddings``.

    Returns:
        Dict with keys ``'neg'``, ``'neu'`` and ``'pos'`` holding the softmax
        of the first, second and third output logits respectively.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring torch into scope.
    import torch

    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits for the batch; [0] selects the single example.
    scores = softmax(output[0][0].numpy())

    scores_dict = {
        'neg' : scores[0],
        'neu' : scores[1],
        'pos' : scores[2],
    }

    return scores_dict

In [41]:
headline_cols = 25
polarity_scores = []

# Get mean sentiment of all headlines for that day
def mean_roberta_score(row, max_iterations=headline_cols):
    """Score one day's headlines and append their mean to ``polarity_scores``.

    Exactly one entry is appended per call, so the accumulated list stays
    aligned with the rows/columns the function is applied to — even when some
    headlines (including the last one) fail to score.

    Args:
        row: Object exposing ``.values``, the sequence of headline texts for
            one day (e.g. a pandas Series).
        max_iterations: Maximum number of leading headlines to score.

    Returns:
        None. The result is recorded by appending to the module-level
        ``polarity_scores`` list: the mean of the successfully scored
        headlines, or NaN for every class when none could be scored.
    """
    values = row.values
    scores = []
    # Never index past the end of a short row.
    for i in range(min(max_iterations, len(values))):
        text = values[i]
        try:
            scores.append(polarity_scores_roberta(text))
        except RuntimeError:
            print(f'Error with: {text} on row: {row} for index: {i}.')

    # Aggregate after the loop so a failure on the final headline cannot skip
    # the append and shift every later day's scores by one position.
    if scores:
        result = aggregate_dicts_to_mean(scores)
    else:
        result = {'neg': float('nan'), 'neu': float('nan'), 'pos': float('nan')}
    polarity_scores.append(result)

In [42]:
# Sentiment test
# Hand-written sample sentences for eyeballing the scorers.
text1, text2, text3 = (
    'Blah blah blah, I am so sleepy... >:(',
    'I believe the weather will windy today.',
    'WHAT IS SUP EVERYONE, AWESOME!!!',
)

# VADER scores for the third sample (displayed as the cell output).
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(text3)

{'neg': 0.0, 'neu': 0.446, 'pos': 0.554, 'compound': 0.7163}

In [10]:
# Score the same sample sentence with the RoBERTa model, for comparison with
# the VADER result from the previous cell.
polarity_scores_roberta(text3)

{'neg': 0.004427868, 'neu': 0.02399209, 'pos': 0.9715801}

In [11]:
# Chunk 5 missing headlines
# Rows with at least one missing headline. This expression is not the last
# one in the cell, so the notebook does not display its value.
df5[df5.isnull().any(axis=1)]

# 2009-09-15: fill Top23-Top25 with the same-rank headline from nearby days.
prev_day = df5.loc['2009-09-11', 'Top23']
prev_day1 = df5.loc['2009-09-14', 'Top24']
next_day = df5.loc['2009-09-16', 'Top25']

df5.loc['2009-09-15', 'Top23'] = prev_day
df5.loc['2009-09-15', 'Top24'] = prev_day1
df5.loc['2009-09-15', 'Top25'] = next_day

# 2009-12-24: fill Top24/Top25 from the surrounding days in December.
# NOTE(review): these lookups previously read 2009-09-23 and 2009-09-28, three
# months before the row being filled. Every other fill in this notebook uses
# neighbouring dates, so the month is assumed to have been a typo — confirm
# that both dates exist in the index and carry Top24/Top25 values.
prev_day2 = df5.loc['2009-12-23', 'Top24']
next_day2 = df5.loc['2009-12-28', 'Top25']

df5.loc['2009-12-24', 'Top24'] = prev_day2
df5.loc['2009-12-24', 'Top25'] = next_day2

In [12]:
# Chunk 6 missing headlines
# Fill Top24/Top25 on 2011-04-21 with the same-rank headlines taken from
# 2011-04-20 and 2011-04-25 respectively.
prev_day, next_day = df6.loc['2011-04-20', 'Top24'], df6.loc['2011-04-25', 'Top25']
for column, filler in (('Top24', prev_day), ('Top25', next_day)):
    df6.loc['2011-04-21', column] = filler

In [13]:
# Chunk 1: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df1 = df1.transpose()
df1.apply(mean_roberta_score, axis='index')

Date
2000-01-03    None
2000-01-04    None
2000-01-05    None
2000-01-06    None
2000-01-07    None
              ... 
2002-01-24    None
2002-01-25    None
2002-01-28    None
2002-01-29    None
2002-01-30    None
Length: 513, dtype: object

In [14]:
# Chunk 1: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [15]:
# Chunk 1: transpose back to one row per date and attach the score columns.
df1 = df1.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [16]:
# Chunk 2: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df2 = df2.transpose()
df2.apply(mean_roberta_score, axis='index')

Date
2002-01-31    None
2002-02-01    None
2002-02-04    None
2002-02-05    None
2002-02-06    None
              ... 
2004-03-26    None
2004-03-29    None
2004-03-30    None
2004-03-31    None
2004-04-01    None
Length: 513, dtype: object

In [17]:
# Chunk 2: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [18]:
# Chunk 2: transpose back to one row per date and attach the score columns.
df2 = df2.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [19]:
# Chunk 3: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df3 = df3.transpose()
df3.apply(mean_roberta_score, axis='index')

Date
2004-04-02    None
2004-04-05    None
2004-04-06    None
2004-04-07    None
2004-04-08    None
              ... 
2006-04-11    None
2006-04-12    None
2006-04-13    None
2006-04-17    None
2006-04-18    None
Length: 513, dtype: object

In [20]:
# Chunk 3: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [21]:
# Chunk 3: transpose back to one row per date and attach the score columns.
df3 = df3.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [22]:
# Chunk 4: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df4 = df4.transpose()
df4.apply(mean_roberta_score, axis='index')

Date
2006-04-19    None
2006-04-20    None
2006-04-21    None
2006-04-24    None
2006-04-25    None
              ... 
2008-05-07    None
2008-05-08    None
2008-05-09    None
2008-05-12    None
2008-05-13    None
Length: 513, dtype: object

In [23]:
# Chunk 4: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [24]:
# Chunk 4: transpose back to one row per date and attach the score columns.
df4 = df4.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [25]:
# Chunk 5: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df5 = df5.transpose()
df5.apply(mean_roberta_score, axis='index')

Date
2008-05-14    None
2008-05-15    None
2008-05-16    None
2008-05-19    None
2008-05-20    None
              ... 
2010-05-20    None
2010-05-21    None
2010-05-24    None
2010-05-25    None
2010-05-26    None
Length: 513, dtype: object

In [26]:
# Chunk 5: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [27]:
# Chunk 5: transpose back to one row per date and attach the score columns.
df5 = df5.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [28]:
# Chunk 6: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df6 = df6.transpose()
df6.apply(mean_roberta_score, axis='index')

Date
2010-05-27    None
2010-05-28    None
2010-06-01    None
2010-06-02    None
2010-06-03    None
              ... 
2012-06-01    None
2012-06-04    None
2012-06-05    None
2012-06-06    None
2012-06-07    None
Length: 513, dtype: object

In [29]:
# Chunk 6: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [30]:
# Chunk 6: transpose back to one row per date and attach the score columns.
df6 = df6.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [31]:
# Chunk 7: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df7 = df7.transpose()
df7.apply(mean_roberta_score, axis='index')

Date
2012-06-08    None
2012-06-11    None
2012-06-12    None
2012-06-13    None
2012-06-14    None
              ... 
2014-06-18    None
2014-06-19    None
2014-06-20    None
2014-06-23    None
2014-06-24    None
Length: 513, dtype: object

In [32]:
# Chunk 7: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [33]:
# Chunk 7: transpose back to one row per date and attach the score columns.
df7 = df7.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [34]:
# Chunk 8: reset the shared accumulator, then score each day's headlines.
# After the transpose every column is one date, so the column-wise apply hands
# mean_roberta_score one day's headlines at a time.
polarity_scores = []
df8 = df8.transpose()
df8.apply(mean_roberta_score, axis='index')

Date
2014-06-25    None
2014-06-26    None
2014-06-27    None
2014-06-30    None
2014-07-01    None
              ... 
2016-06-27    None
2016-06-28    None
2016-06-29    None
2016-06-30    None
2016-07-01    None
Length: 510, dtype: object

In [35]:
# Chunk 8: split the per-day mean scores into one list per sentiment class.
neg_vals, neu_vals, pos_vals = (
    [day_mean[label] for day_mean in polarity_scores]
    for label in ('neg', 'neu', 'pos')
)

In [36]:
# Chunk 8: transpose back to one row per date and attach the score columns.
df8 = df8.T.assign(negative=neg_vals, neutral=neu_vals, positive=pos_vals)

In [37]:
# Stack the eight scored chunks back into a single frame. pd.concat keeps each
# chunk's own index by default, so the Date labels are preserved.
dfs = [df1, df2, df3, df4, df5, df6, df7, df8]
df = pd.concat(dfs)
df.shape

(4101, 28)

In [38]:
# Persist the headlines plus sentiment columns; to_csv writes the index (the
# dates) by default.
df.to_csv('data/5_news_headlines_sentiment.csv')