In [43]:
# Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [44]:
# Input files used by this notebook; change these to point at other datasets.

# Historical S&P 500 index prices (the series whose future values are predicted)
stocksFile = "S&P 500 Historical Data.csv"

# Reddit news headlines (the text source from which sentiment is derived)
newsFile = "RedditNews.csv"

# Load News Dataset and Convert Text Data to Sentiment Values

In [45]:
# Read the headlines CSV named in the configuration cell into a DataFrame
news = pd.read_csv(filepath_or_buffer=newsFile)

# Show the loaded frame
display(news)

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...
...,...,...
73603,2008-06-08,b'Man goes berzerk in Akihabara and stabs ever...
73604,2008-06-08,b'Threat of world AIDS pandemic among heterose...
73605,2008-06-08,b'Angst in Ankara: Turkey Steers into a Danger...
73606,2008-06-08,"b""UK: Identity cards 'could be used to spy on ..."


In [46]:
# Keep only the first five news headlines for any given date.
#
# Rows are selected by their position inside each Date group, so the result
# is correct whatever the frame's index labels are (it does not rely on the
# index being the default 0..n-1 range). Rows keep their original order and
# index labels. Dates are grouped wherever they occur in the file, not only
# when they are adjacent; rows with a missing Date form one group of their own.
HEADLINES_PER_DATE = 5

news = (
    news
    .groupby("Date", sort=False, dropna=False)
    .head(HEADLINES_PER_DATE)
    .copy()  # independent frame, so later column assignments are unambiguous
)
news.head(20)

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...
25,2016-06-30,Jamaica proposes marijuana dispensers for tour...
26,2016-06-30,Stephen Hawking says pollution and 'stupidity'...
27,2016-06-30,Boris Johnson says he will not run for Tory pa...
28,2016-06-30,Six gay men in Ivory Coast were abused and for...
29,2016-06-30,Switzerland denies citizenship to Muslim immig...


In [47]:
# Convert the Date column to pandas datetime values
parsed_dates = pd.to_datetime(news["Date"])
news["Date"] = parsed_dates

# Show the frame with the converted column
display(news)

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...
...,...,...
73583,2008-06-08,b'Nim Chimpsky: The tragedy of the chimp who t...
73584,2008-06-08,"b""Canada: Beware slippery slope' to censorship..."
73585,2008-06-08,b'EU Vice-President Luisa Morgantini and the I...
73586,2008-06-08,"b""Israeli minister: Israel will attack Iran if..."


In [48]:
# Fetch the VADER lexicon that the sentiment analyzer reads its word scores from
nltk.download("vader_lexicon")

# Build the VADER analyzer used for every sentiment score in this notebook
sentiment_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [49]:
# Two sample sentences used to sanity-check the analyzer.
pos_text = "I love this product! Its amazing!"

# NOTE: despite the variable name, this sentence is not negative. The recorded
# output scores it as mildly positive (neg 0.0, compound 0.2732), so the pair
# really contrasts a strongly positive text with a weakly positive one.
neg_text = "I am doing well."

# Score each sample; polarity_scores returns the neg/neu/pos/compound values
pos_sentiment_score = sentiment_analyzer.polarity_scores(pos_text)
neg_sentiment_score = sentiment_analyzer.polarity_scores(neg_text)

# Print one line per sample, in the same order as above
for sample, scores in (
    (pos_text, pos_sentiment_score),
    (neg_text, neg_sentiment_score),
):
    print(f'Score for {sample}: {scores}')

Score for I love this product! Its amazing!: {'neg': 0.0, 'neu': 0.259, 'pos': 0.741, 'compound': 0.8619}
Score for I am doing well.: {'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.2732}


In [50]:
# Define function to get compound sentiment score
def getSentiment(row):
    """Return the VADER compound sentiment score for one headline row.

    Some headlines in the dataset are stored as the text of a Python bytes
    literal, e.g. ``b'Man goes berzerk ...'`` or ``b"UK: Identity cards ..."``.
    That wrapper is removed before scoring so the ``b'`` prefix does not stick
    to the first word of the headline.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the headline text under the key ``'News'``.

    Returns
    -------
    The ``'compound'`` entry of ``sentiment_analyzer.polarity_scores`` for
    the (unwrapped) headline.
    """
    headline = row['News']

    # Unwrap only when the b-prefix, an opening quote and the matching closing
    # quote are all present; anything else is scored exactly as stored.
    if (
        isinstance(headline, str)
        and len(headline) >= 3
        and headline[0] == 'b'
        and headline[1] in ('"', "'")
        and headline[-1] == headline[1]
    ):
        headline = headline[2:-1]

    return sentiment_analyzer.polarity_scores(headline)['compound']

# Score every row's headline, then overwrite the text column with the scores
headline_scores = news.apply(getSentiment, axis=1)
news["News"] = headline_scores

# Show the frame now that News holds compound scores instead of text
display(news)

Unnamed: 0,Date,News
0,2016-07-01,-0.5574
1,2016-07-01,-0.0516
2,2016-07-01,0.5719
3,2016-07-01,-0.8658
4,2016-07-01,-0.2960
...,...,...
73583,2008-06-08,-0.6597
73584,2008-06-08,0.0000
73585,2008-06-08,-0.4019
73586,2008-06-08,-0.1764


In [53]:
# Normalize sentiment scores

# Rename first so this cell can be re-run safely: rename() ignores a missing
# 'News' key by default, so on a second run the frame is left as it is and the
# scores are still found under 'Scores'.
news = news.rename(columns={'News': 'Scores'})

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the raw scores and store the rescaled values next to them
news['Normalized_Scores'] = scaler.fit_transform(news[['Scores']])

# Display updated normalized scores
display(news)

Unnamed: 0,Date,Scores,Normalized_Scores
0,2016-07-01,-0.5574,0.219861
1,2016-07-01,-0.0516,0.477043
2,2016-07-01,0.5719,0.794071
3,2016-07-01,-0.8658,0.063050
4,2016-07-01,-0.2960,0.352774
...,...,...,...
73583,2008-06-08,-0.6597,0.167845
73584,2008-06-08,0.0000,0.503280
73585,2008-06-08,-0.4019,0.298927
73586,2008-06-08,-0.1764,0.413586


In [54]:
# Write the scored headlines to disk, leaving the row index out of the file
news.to_csv(path_or_buf="news_sentiment_data.csv", index=False)