# Importing Required Libraries

In [1]:
# Libraries for data collection and manipulation
import pandas as pd
#import numpy as np
import datetime as dt
import yfinance as yf

# Libraries for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)


# Libraries for Time Series Analysis
from prophet import Prophet

# Importing helper functions
# The helper module lives outside the notebook's folder, so that folder is put
# on sys.path. Set the SENTIMENT_SNP_DIR environment variable to point at a
# different checkout; the original location is kept as the fallback.
import os
import sys
HELPER_DIR = os.environ.get(
    'SENTIMENT_SNP_DIR',
    'C:/Users/Woon/Desktop/Columbia/Applied Analytics/Term3/Sentiment_SNP',
)
if HELPER_DIR not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.insert(1, HELPER_DIR)
import helper_functions
# NOTE(review): the star import hides where later names (scrape_yahoo_news,
# generate_ngrams, horizontal_bar_chart, count_words, ...) come from.
from helper_functions import *


# Libraries for NLP
from collections import defaultdict
from transformers import pipeline



# Libraries for Webscraping
from urllib.request import Request, urlopen
from urllib.error import HTTPError
import socket
import urllib


# Environment settings:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/plotly.
import warnings
warnings.filterwarnings("ignore")

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# pandas display options: None removes the column / row / sequence-item limits,
# cell text is cut at 500 characters, and wide frames wrap over several lines.
pd.set_option('display.max_column', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('expand_frame_repr', True)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



# Data Collection

In [2]:
# Ask which ticker to analyse. Surrounding whitespace is removed because the
# symbol is interpolated verbatim into the request URLs built further down.
text_input = input("Type in the Ticker: ").strip()
if not text_input:
    raise ValueError("No ticker entered; re-run this cell and type a symbol.")
print("You entered:", text_input)

You entered: xom


## Scrape Yahoo Finance News

In [3]:
# Stamp the report with a single clock reading so the printed date and time
# always describe the same instant.
now = dt.datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Report Date = ", now)
print("Report Time =", current_time)

# Yahoo Finance quote page for the chosen ticker.
YAHOO_NEWS_URL = f'https://finance.yahoo.com/quote/{text_input}?p={text_input}&.tsrc=fin-srch'
# scrape_yahoo_news is not defined in this notebook; presumably it comes from
# helper_functions and returns a DataFrame with 'headline' and 'content'
# columns (the columns the following cells read) -- TODO confirm.
news_df = scrape_yahoo_news(YAHOO_NEWS_URL)

Report Date =  2023-04-26 17:07:33.753845
Report Time = 17:07:33


# Feature Engineering

In [8]:
# Score the headline and the abstract of every article with a pretrained
# sentiment classifier and store the label and its score as new columns.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


def _label_and_score(texts):
    """Classify each text and return ``(labels, scores)`` as two parallel lists.

    Every text is cut to its first 512 characters before it is handed to
    ``sentiment_pipeline``; only the first result of each call is kept.
    """
    labels, scores = [], []
    for text in texts:
        result = sentiment_pipeline(text[:512])[0]
        labels.append(result['label'])
        scores.append(result['score'])
    return labels, scores


# Sentiment of the headline
news_df['Sentiment'], news_df['Score'] = _label_and_score(news_df['headline'])

# Sentiment of the abstract
news_df['Sentiment2'], news_df['Score2'] = _label_and_score(news_df['content'])

In [None]:
# Keep only the articles whose headline AND abstract were both classified with
# a score above 0.8. One combined mask replaces filtering twice in a chain.
# reset_index() keeps the original row labels in a new 'index' column.
high_confidence = (news_df['Score'] > 0.8) & (news_df['Score2'] > 0.8)
news_df2 = news_df[high_confidence].reset_index()

In [None]:
# Preview the first 20 rows of the confidence-filtered news table
news_df2.head(20)

In [None]:
# Number of leading rows of news_df2 to include in the first sentiment chart
text_input2 = int(input("How many rows of news?: "))

# Sentiment

In [None]:
# Sentiment distribution of the first `text_input2` filtered articles
latest_news = news_df2[:text_input2]
sns.histplot(
    data=latest_news,
    x='Sentiment',
    hue='Sentiment',
    hue_order=['POSITIVE', 'NEGATIVE'],
)

In [None]:
# Sentiment distribution across all filtered articles
sns.histplot(
    data=news_df2,
    x='Sentiment',
    hue='Sentiment',
    hue_order=['POSITIVE', 'NEGATIVE'],
)

# Headline Trigram

In [None]:
# Count how often each trigram occurs across all scraped headlines and chart
# the 20 most frequent ones.
# generate_ngrams and horizontal_bar_chart are not defined in this notebook;
# presumably they come from helper_functions -- TODO confirm.
freq_dict = defaultdict(int)
for sent in news_df["headline"]:
    for word in generate_ngrams(sent, 3):
        freq_dict[word] += 1

# Ascending sort followed by a reversal (not reverse=True) so that trigrams
# with equal counts keep the same relative order as before.
ranked_trigrams = sorted(freq_dict.items(), key=lambda x: x[1])[::-1]
# Naming the columns in the constructor also works when no trigram was found;
# assigning .columns on an empty frame raises a length-mismatch error.
fd_sorted = pd.DataFrame(ranked_trigrams, columns=["word", "wordcount"])
trace0 = horizontal_bar_chart(fd_sorted.head(20), 'green')

# Single-panel figure that hosts the bar chart
fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04,
                          subplot_titles=["Frequent words"])
fig.append_trace(trace0, 1, 1)
fig['layout'].update(height=600, width=600, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots');

# Content Trigram

In [None]:
# Count how often each trigram occurs across all article abstracts and chart
# the 20 most frequent ones.
# generate_ngrams and horizontal_bar_chart are not defined in this notebook;
# presumably they come from helper_functions -- TODO confirm.
freq_dict = defaultdict(int)
for sent in news_df["content"]:
    for word in generate_ngrams(sent, 3):
        freq_dict[word] += 1

# Ascending sort followed by a reversal (not reverse=True) so that trigrams
# with equal counts keep the same relative order as before.
ranked_trigrams = sorted(freq_dict.items(), key=lambda x: x[1])[::-1]
# Naming the columns in the constructor also works when no trigram was found;
# assigning .columns on an empty frame raises a length-mismatch error.
fd_sorted = pd.DataFrame(ranked_trigrams, columns=["word", "wordcount"])
trace0 = horizontal_bar_chart(fd_sorted.head(20), 'green')

# Single-panel figure that hosts the bar chart
fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04,
                          subplot_titles=["Frequent words"])
fig.append_trace(trace0, 1, 1)
fig['layout'].update(height=600, width=600, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots');

# Finviz

In [None]:
# Download the Finviz quote page for the ticker and collect the text of every
# link on it, split on "." into fragments.
url = f"https://finviz.com/quote.ashx?t={text_input}&ty=c&ta=1&p=d"

headers = {"User-Agent": "Mozilla/5.0"}
# NOTE(review): this cookie dict is defined but never attached to the request.
cookies = {"CONSENT": "YES+cb.20210720-07-p0.en+FX+410"}

req = Request(url, headers=headers)

# Created before the request so that the following cells still find the name
# (as an empty list) when the download fails.
sentence2 = []

try:
    # An explicit timeout is what makes the socket.timeout handler reachable;
    # the context manager closes the HTTP response once it has been read.
    with urlopen(req, timeout=30) as response:
        contents = response.read()
    # BeautifulSoup is not imported in the visible import cell; presumably it
    # arrives through `from helper_functions import *` -- TODO confirm.
    soup = BeautifulSoup(contents, features="html.parser")

    for tag in soup.find_all('a'):
        sentence = tag.text.split(".")
        sentence2.append(sentence)

except urllib.error.HTTPError as err:
    print(err.code)

except socket.timeout:
    print("socket timeout")

In [None]:
# One row per scraped link; the columns hold the "."-separated pieces of its text
df = pd.DataFrame(sentence2)

In [None]:
# Reduce the frame to the first text fragment of each link ('Headline') plus a
# 'Sentiment' column pre-filled with -2 as a placeholder value.
headlines = df[0]
df = pd.DataFrame({
    'Headline': headlines.tolist(),
    'Sentiment': [-2] * len(headlines),
})

In [None]:
# Word count of every headline (count_words is not defined in this notebook;
# presumably it comes from helper_functions -- TODO confirm).
df['word_count'] = [count_words(headline) for headline in df['Headline']]

In [None]:
# Classify each headline (first 512 characters only) and keep the first result
# returned by the pipeline for it.
results = [sentiment_pipeline(headline[:512])[0] for headline in df['Headline']]
sentiment = [result['label'] for result in results]
sentiment_score = [result['score'] for result in results]
df['Sentiment'] = sentiment
df['Score'] = sentiment_score

In [None]:
# Keep only entries with more than 10 words. The explicit copy makes the
# column assignment below write to an independent frame instead of a slice.
df = df[df['word_count'] > 10].copy()
df['Headline_Lower'] = df['Headline'].str.lower()

In [None]:
# Sentiment distribution of the Finviz headlines scored above 0.8
confident_headlines = df[df['Score'] > 0.8]
sns.histplot(
    data=confident_headlines,
    x='Sentiment',
    hue='Sentiment',
    hue_order=['POSITIVE', 'NEGATIVE'],
)

In [None]:
# Count how often each trigram occurs across the lower-cased Finviz headlines
# and chart the 10 most frequent ones.
# generate_ngrams and horizontal_bar_chart are not defined in this notebook;
# presumably they come from helper_functions -- TODO confirm.
freq_dict = defaultdict(int)
for sent in df["Headline_Lower"]:
    for word in generate_ngrams(sent, 3):
        freq_dict[word] += 1

# Ascending sort followed by a reversal (not reverse=True) so that trigrams
# with equal counts keep the same relative order as before.
ranked_trigrams = sorted(freq_dict.items(), key=lambda x: x[1])[::-1]
# Naming the columns in the constructor also works when no trigram was found;
# assigning .columns on an empty frame raises a length-mismatch error.
fd_sorted = pd.DataFrame(ranked_trigrams, columns=["word", "wordcount"])
trace0 = horizontal_bar_chart(fd_sorted.head(10), 'green')

# Single-panel figure that hosts the bar chart
fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04,
                          subplot_titles=["Frequent words"])
fig.append_trace(trace0, 1, 1)
fig['layout'].update(height=600, width=600, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots');

# Forecast

In [None]:
#Past 3 year data
spy_ohlc_df = yf.download(text_input, start=dt.datetime.today() - dt.timedelta(days=1095), end=dt.datetime.today())
df=spy_ohlc_df.reset_index()

In [None]:
# Build a gap-free daily calendar covering the downloaded history.
start_time = pd.Timestamp(df['Date'].iloc[0])
end_time = pd.Timestamp(df['Date'].iloc[-1])
# Step of one calendar day. Deriving the step from the first two rows breaks
# when those rows straddle a weekend or holiday: the grid then advances several
# days at a time and the right-merge below discards most trading days.
time_range = pd.date_range(start_time, end_time, freq='D')

time_list = pd.to_datetime(time_range.tolist())
index = range(len(time_list))

df_dates = pd.DataFrame(index, columns=['Date'])
df_dates['Date'] = time_list

# The right-merge keeps every calendar date; dates without a quote come out as
# NaN and are then forward-filled with the most recent known values.
df = pd.merge(df, df_dates, on='Date', how='right')

# DataFrame.ffill() replaces the deprecated fillna(method='ffill', inplace=True).
df = df.ffill()

In [None]:
# fig, ax = plt.subplots()
# fig.set_size_inches(15,6)
# sns.lineplot(x = 'Date', y = 'Close', data = df)
# plt.show()

In [None]:
# Training frame for Prophet: date and closing price, relabelled 'ds' and 'y'.
# (The "spy" in the variable name is historical; the data belongs to whichever
# ticker was downloaded into `df`.)
spy_data = df[['Date', 'Close']]
spy_data.columns = ['ds', 'y']

# Fresh Prophet model with default settings
model = Prophet()

# Fit on the relabelled price history
model.fit(spy_data)

# 365 future periods only; the historical dates are left out of the frame
future_dates = model.make_future_dataframe(periods=365, include_history=False)

# Predictions for those future dates
forecast = model.predict(future_dates)

In [None]:
# fig = model.plot(forecast, xlabel='Date', ylabel='SPY Price')
# fig.set_size_inches(15,6)
# plt.title('Forecasted Price')
# plt.xlabel('dt')
# plt.ylabel('Price')
# plt.show()

# One Year Outlook

In [None]:
# Forecast chart. The y-axis label follows the ticker that was entered instead
# of always reading "SPY".
model.plot(forecast, xlabel='Date', ylabel=f'{text_input.upper()} Price')
# Horizontal window: two years back to one year ahead of today
plt.xlim(dt.datetime.today() - dt.timedelta(days=730), dt.datetime.today() + dt.timedelta(days=365))
# Vertical window: 75% to 125% of the mean predicted value
plt.ylim(forecast['yhat'].mean()*0.75, forecast['yhat'].mean()*1.25)

# Component plots produced by the fitted model for the same forecast
fig = model.plot_components(forecast)
plt.show()

In [None]:
# Past year (365 days) of price history for the chosen ticker
window_start = dt.date.today() - dt.timedelta(days=365)
window_end = dt.date.today()
spy_ohlc_df = yf.download(text_input, start=window_start, end=window_end)
df1 = spy_ohlc_df.reset_index()

# spy_ohlc_df = yf.download(text_input, start=dt.date.today() - dt.timedelta(days=7), end=dt.date.today())
# df1=spy_ohlc_df.reset_index()

In [None]:
# Build a gap-free daily calendar covering the downloaded history.
start_time = pd.Timestamp(df1['Date'].iloc[0])
end_time = pd.Timestamp(df1['Date'].iloc[-1])
# Step of one calendar day. Deriving the step from the first two rows breaks
# when those rows straddle a weekend or holiday: the grid then advances several
# days at a time and the right-merge below discards most trading days.
time_range = pd.date_range(start_time, end_time, freq='D')

time_list = pd.to_datetime(time_range.tolist())
index = range(len(time_list))

df_dates = pd.DataFrame(index, columns=['Date'])
df_dates['Date'] = time_list

# The right-merge keeps every calendar date; dates without a quote come out as
# NaN and are then forward-filled with the most recent known values.
df1 = pd.merge(df1, df_dates, on='Date', how='right')

# DataFrame.ffill() replaces the deprecated fillna(method='ffill', inplace=True).
df1 = df1.ffill()

In [None]:
# fig, ax = plt.subplots()
# fig.set_size_inches(15,6)
# sns.lineplot(x = 'Datetime', y = 'Close', data = df1)
# plt.show()

In [None]:
# Training frame for Prophet: date and closing price, relabelled 'ds' and 'y'.
# (The "spy" in the variable name is historical; the data belongs to whichever
# ticker was downloaded into `df1`.)
spy_data = df1[['Date', 'Close']]
spy_data.columns = ['ds', 'y']

# Fresh Prophet model with default settings
model = Prophet()

# Fit on the relabelled price history
model.fit(spy_data)

# 30 future periods only; the historical dates are left out of the frame
future_dates = model.make_future_dataframe(periods=30, include_history=False)

# Predictions for those future dates
forecast = model.predict(future_dates)


# 30 Days Outlook

In [None]:
# Forecast chart. The y-axis label follows the ticker that was entered instead
# of always reading "SPY".
model.plot(forecast, xlabel='Date', ylabel=f'{text_input.upper()} Price')
# Horizontal window: 60 days back to 30 days ahead of today
plt.xlim(dt.date.today() - dt.timedelta(days=60), dt.date.today() + dt.timedelta(days=30))
# Vertical window: 90% to 110% of the mean predicted value
plt.ylim(forecast['yhat'].mean()*0.9, forecast['yhat'].mean()*1.1)

# Component plots produced by the fitted model for the same forecast
fig = model.plot_components(forecast)
plt.show()

In [None]:
# Past 7 days of 1-minute bars for the chosen ticker
window_start = dt.date.today() - dt.timedelta(days=7)
window_end = dt.date.today()
spy_ohlc_df = yf.download(text_input, start=window_start, end=window_end, interval='1m')
df1 = spy_ohlc_df.reset_index()

# spy_ohlc_df = yf.download(text_input, start=dt.date.today() - dt.timedelta(days=7), end=dt.date.today())
# df1=spy_ohlc_df.reset_index()

In [None]:
# Regular one-minute grid from the first to the last downloaded bar.
start_time = pd.Timestamp(df1['Datetime'].iloc[0])
end_time = pd.Timestamp(df1['Datetime'].iloc[-1])
# The bars were requested at a 1-minute interval, so the step is fixed at one
# minute instead of being inferred from the first two rows, which gives a
# coarser grid whenever the second bar is missing.
time_range = pd.date_range(start_time, end_time, freq='min')

time_list = time_range.tolist()

In [None]:
# Wrap the minute grid in a one-column frame so it can be merged on 'Datetime'
time_list = pd.to_datetime(time_list)
index = range(len(time_list))

df_dates = pd.DataFrame({'Datetime': time_list}, index=index)

In [None]:
# Align the bars to the full minute grid: the right-merge keeps every grid
# timestamp, and minutes without a bar come out as NaN rows.
df1 = df1.merge(df_dates, on='Datetime', how='right')

In [None]:
# Carry the last known values forward into the minutes that had no bar.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill', inplace=True).
df1 = df1.ffill()

In [None]:
# fig, ax = plt.subplots()
# fig.set_size_inches(15,6)
# sns.lineplot(x = 'Datetime', y = 'Close', data = df1)
# plt.show()

In [None]:
# Training frame for Prophet: timestamp and closing price, relabelled 'ds' and 'y'.
# (The "spy" in the variable name is historical; the data belongs to whichever
# ticker was downloaded into `df1`.)
spy_data = df1[['Datetime', 'Close']].copy()
spy_data.columns = ['ds', 'y']

# Prophet refuses timezone-aware timestamps, which intraday downloads commonly
# carry. Drop the timezone but keep the local wall-clock times.
if isinstance(spy_data['ds'].dtype, pd.DatetimeTZDtype):
    spy_data['ds'] = spy_data['ds'].dt.tz_localize(None)

# Fresh Prophet model with default settings
model = Prophet()

# Fit on the relabelled minute-level history
model.fit(spy_data)

# 2880 future one-minute steps (two days), history excluded. 'min' is the
# current pandas alias for minutes; 'T' is deprecated.
future_dates = model.make_future_dataframe(periods=2880, freq='min', include_history=False)

# Predictions for those future timestamps
forecast = model.predict(future_dates)


In [None]:
# fig = model.plot(forecast, xlabel='Date', ylabel='SPY Price')
# fig.set_size_inches(15,6)
# plt.title('Forecasted Price')
# plt.xlabel('dt')
# plt.ylabel('Price')
# plt.show()

# One Day Outlook

In [None]:
# Peek at the first rows of the minute-level forecast
forecast.head()

In [None]:
# Forecast chart. The y-axis label follows the ticker that was entered instead
# of always reading "SPY".
model.plot(forecast, xlabel='Date', ylabel=f'{text_input.upper()} Price')
# Horizontal window: two days back to one day ahead of today
plt.xlim(dt.date.today() - dt.timedelta(days=2), dt.date.today() + dt.timedelta(days=1))
# Vertical window: 98% to 102% of the mean predicted value
plt.ylim(forecast['yhat'].mean()*0.98, forecast['yhat'].mean()*1.02)

# Component plots produced by the fitted model for the same forecast
fig = model.plot_components(forecast)
plt.show()