In [1]:
import pymongo as pm
import requests
import pandas as pd 
pd.options.plotting.backend = "plotly"
# yahoo_fin documentation: https://theautomatic.net/yahoo_fin-documentation/
from yahoo_fin import stock_info as si 
from pandas_datareader import DataReader
import numpy as np

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import plotly.graph_objects as go

import nltk

from datetime import date
from newspaper import Article

  from pandas.util.testing import assert_frame_equal


In [44]:
# spaCy language model: run the commented command once per environment
# before calling spacy.load("en_core_web_sm").
# !python -m spacy download en_core_web_sm

# VADER lexicon: needs to be downloaded once for nltk's VADER sentiment
# analyzer to work.
nltk.download('vader_lexicon')
# 'punkt' data: needs to be downloaded once for the newspaper package to work.
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yegor/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yegor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Getting the data

In [3]:
import plotly.io as pio

# Route pandas' .plot() calls through plotly.
pd.options.plotting.backend = "plotly"

# Render plotly figures inline in the notebook.
# (To list the available renderers, evaluate `pio.renderers` as the last
# expression of its own cell; in the middle of a cell it displays nothing.)
pio.renderers.default = 'notebook'

In [4]:
def plot_ticker_data(ticker_data, xaxis_rangeslider_visible=True):
    """
    Build a candlestick chart from price data.

    Parameters
    ----------
    ticker_data : object exposing ``index``, ``open``, ``high``, ``low``
        and ``close`` attributes (the notebook passes the result of
        ``si.get_data``); ``index`` is used for the x axis.
    xaxis_rangeslider_visible : bool, default True
        Forwarded to the figure layout; pass False to hide the range slider
        under the chart.

    Returns
    -------
    The plotly figure. It is returned rather than shown, so the caller (or
    the notebook, when the call is the last expression of a cell) decides
    when to render it.
    """
    candles = go.Candlestick(
        x=ticker_data.index,
        open=ticker_data.open,
        high=ticker_data.high,
        low=ticker_data.low,
        close=ticker_data.close,
    )

    figure = go.Figure(data=[candles])
    figure.update_layout(xaxis_rangeslider_visible=xaxis_rangeslider_visible)
    return figure

In [None]:
# Example: download the price history of one ticker with yahoo_fin and draw it
# with plot_ticker_data (the returned figure is the cell's output).
ticker = 'MSFT'
# NOTE(review): the dates look like MM/DD/YYYY strings — confirm the accepted
# format against the yahoo_fin documentation.
start_date = '01/01/2015'
end_date = '05/31/2020'
ticker_data = si.get_data(ticker, start_date=start_date, end_date=end_date)
plot_ticker_data(ticker_data)

In [5]:
# Start the local MongoDB service through Homebrew. The pymongo cells in this
# notebook connect to mongodb://localhost:27017, so it must be running first.
!brew services start mongodb-community@4.2

Service `mongodb-community` already started, use `brew services restart mongodb-community` to restart.


In [None]:
# Stop the local MongoDB service.
# NOTE(review): under "Run All" this executes before the cells further down
# that connect to mongodb://localhost:27017 — run it manually, as the last step.
!brew services stop mongodb-community@4.2

In [6]:
def get_recommendation(ticker):
    """
    Obtains the Yahoo Finance mean analyst recommendation for a 'ticker'.

    Queries the quoteSummary endpoint and reads
    ``result['financialData']['recommendationMean']['fmt']`` from the first
    result.

    Parameters
    ----------
    ticker : str
        Ticker symbol; it is inserted verbatim into the request URL.

    Returns
    -------
    The formatted recommendation taken from the response (the notebook's
    saved output shows strings such as '2.20'), or the integer sentinel 6
    when the HTTP status is not OK or the payload does not contain the field.
    Note the two different types: callers that compare the value numerically
    must convert it first.

    Raises
    ------
    requests.RequestException
        Connection-level failures of ``requests.get`` are not handled here.
    """
    # Sentinel for "no recommendation available".
    # NOTE(review): presumably chosen because it lies outside the 1-5 score
    # range described in the notebook text — confirm.
    no_recommendation = 6

    lhs_url = 'https://query2.finance.yahoo.com/v10/finance/quoteSummary/'
    # NOTE(review): the crumb value is hard-coded in the query string — confirm
    # the endpoint still accepts it.
    rhs_url = '?formatted=true&crumb=swg7qs5y9UP&lang=en-US&region=US&' \
              'modules=upgradeDowngradeHistory,recommendationTrend,' \
              'financialData,earningsHistory,earningsTrend,industryTrend&' \
              'corsDomain=finance.yahoo.com'

    url = lhs_url + ticker + rhs_url
    r = requests.get(url)
    if not r.ok:
        # Return right away: there is no point in parsing an error response.
        return no_recommendation

    try:
        result = r.json()['quoteSummary']['result'][0]
        return result['financialData']['recommendationMean']['fmt']
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not valid JSON; KeyError/IndexError/TypeError:
        # an expected key is missing, 'result' is empty, or a level is None.
        return no_recommendation

In [7]:
def get_and_store_recommendations(ticker, dt=None):
    """
    Retrieves the Yahoo recommendation for a 'ticker' and stores it to MongoDB.

    A ``{'date', 'recommendation'}`` entry is added with ``$addToSet`` to the
    'recommendations' array of the ticker's document in the
    'recommendations' collection of the 'news' database. Storing an identical
    entry twice is therefore a no-op; an entry with the same date but a
    different recommendation is still added. The document is created when the
    ticker has none yet (upsert).

    Parameters
    ----------
    ticker : str
        Ticker symbol; used both for the lookup and as the document key.
    dt : str, optional
        Date label stored with the recommendation. Defaults to today's date
        formatted as YYYY-MM-DD.

    Returns
    -------
    dict
        The update document that was sent, i.e.
        ``{'recommendations': {'date': ..., 'recommendation': ...}}``.
    """
    day = date.today().strftime('%Y-%m-%d') if dt is None else dt

    doc = {
        'recommendations': {
            'date' : day,
            'recommendation' : get_recommendation(ticker)
        }
    }

    # This function is called once per ticker in a loop; the context manager
    # closes the client's connections on exit instead of leaking one client
    # per call.
    with pm.MongoClient('mongodb://localhost:27017') as client:
        client['news']['recommendations'].update_one(
            {'ticker' : ticker},
            {'$addToSet': doc},
            upsert = True
        )

    print(f"Saved {ticker}: {doc}")
    return doc

In [None]:
# Run the external scrape_news.py script as a separate Python process:
# its multiprocessing doesn't work inside Jupyter (it can also be launched
# from a console).
# Takes about 5 hours.
! python scrape_news.py

In [8]:
# Get the list of S&P 500 tickers from yahoo_fin; the loop that stores the
# recommendations iterates over it.
tickers = si.tickers_sp500()

In [9]:
# Retrieve and store the current recommendation of every ticker.
# Each call fetches the recommendation over HTTP, upserts it into MongoDB and
# prints one "Saved ..." line; the returned document is not needed here.
for ticker in tickers: 
    get_and_store_recommendations(ticker)

Saved A: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.20'}}
Saved AAL: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.30'}}
Saved AAP: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved AAPL: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.00'}}
Saved ABBV: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved ABC: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.40'}}
Saved ABMD: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved ABT: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved ACN: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved ADBE: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.00'}}
Saved ADI: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.00'}}
Saved ADM: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved ADP: {'recommendatio

Saved CMI: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.70'}}
Saved CMS: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.40'}}
Saved CNC: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.70'}}
Saved CNP: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.50'}}
Saved COF: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved COG: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.50'}}
Saved COO: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.00'}}
Saved COP: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved COST: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.20'}}
Saved COTY: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.00'}}
Saved CPB: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.10'}}
Saved CPRT: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved CRM: {'recommendati

Saved GOOGL: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.80'}}
Saved GPC: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved GPN: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved GPS: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.20'}}
Saved GRMN: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved GS: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.20'}}
Saved GWW: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.70'}}
Saved HAL: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.70'}}
Saved HAS: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved HBAN: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.70'}}
Saved HBI: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved HCA: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.80'}}
Saved HD: {'recommendatio

Saved MHK: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved MKC: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.40'}}
Saved MKTX: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved MLM: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.20'}}
Saved MMC: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.60'}}
Saved MMM: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.20'}}
Saved MNST: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.20'}}
Saved MO: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.10'}}
Saved MOS: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.50'}}
Saved MPC: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved MRK: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.90'}}
Saved MRO: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.90'}}
Saved MS: {'recommendations

Saved SEE: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.40'}}
Saved SHW: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved SIVB: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved SJM: {'recommendations': {'date': '2020-06-04', 'recommendation': '3.20'}}
Saved SLB: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved SLG: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.80'}}
Saved SNA: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.40'}}
Saved SNPS: {'recommendations': {'date': '2020-06-04', 'recommendation': '1.80'}}
Saved SO: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.90'}}
Saved SPG: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.60'}}
Saved SPGI: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.00'}}
Saved SRE: {'recommendations': {'date': '2020-06-04', 'recommendation': '2.30'}}
Saved STE: {'recommendatio

In [2]:
# Notebook-level handle on the 'recommendations' collection of the local
# 'news' database; the query/update cells below use `collection`.
client = pm.MongoClient('mongodb://localhost:27017')
collection = client['news']['recommendations']

In [11]:
# Remove the news items with the erroneous datetime '1900-01-01' from the
# 'news' array of every document (the empty filter matches all documents).
res = collection.update_many(
    { },
    { '$pull': { 'news' : { 'datetime' : '1900-01-01'}}}
)
# Show the raw result of the update as the cell output.
res.raw_result

In [10]:
# Look up the document whose 'news' array contains an item with this URL
# ('news.url' matches against the 'url' field of the array elements).
url = 'https://finance.yahoo.com/news/ackmans-pershing-square-exits-starbucks-180954467.html'
collection.find_one({
    'news.url' : url
})

{'_id': ObjectId('5ed818f6e8e6fce52ecdf7d9'),
 'recommendations': [{'date': '2020-06-03', 'recommendation': '2.20'},
  {'date': '2020-06-04', 'recommendation': '2.20'}],
 'ticker': 'A',
 'news': [{'datetime': 'Jun-02-20',
   'url': 'https://finance.yahoo.com/news/agilents-shares-march-higher-continue-121412368.html',
   'title': "Agilent's Shares March Higher, Can It Continue?",
   'text': 'As of late, it has definitely been a great time to be an investor in Agilent Technologies, Inc. A. The stock has moved higher by 20.6% in the past month, while it is also above its 20 Day SMA too. This combination of strong price performance and favorable technical, could suggest that the stock may be on the right path.\n\nWe certainly think that this might be the case, particularly if you consider A’s recent earnings estimate revision activity. From this look, the company’s future is quite favorable; as A has earned itself a Zacks Rank #2 (Buy), meaning that its recent run may continue for a bit lo

In [8]:
# NOTE(review): scratch check. In this notebook `res` is only assigned by the
# `$pull` update cell, which carries a later execution count than this cell —
# confirm what `res` referred to here, or delete the cell.
not res

True

For the purpose of this analysis, the only recommendation values of importance are 1–1.5 (buy), 3 (hold), and 4.5–5 (sell). These are the scores that signify the highest chance for an event to take place, and thus are the best indicators.
As a result, I will create three new dataframes, named “hold_df”, “buy_df”, and “sell_df”, which will then be concatenated into “new_df”.

In [None]:
# NOTE(review): `df` is not created by any cell above this one in the
# notebook — confirm where the recommendations frame comes from.

# The recommendations saved by this notebook are formatted strings such as
# '2.20'; comparing those against numbers fails, so coerce to numeric first.
# Values that cannot be parsed become NaN and match none of the filters.
recommendation_value = pd.to_numeric(df.recommendation, errors='coerce')

hold_df = df[recommendation_value == 3]
buy_df = df[recommendation_value <= 1.5]
sell_df = df[recommendation_value >= 4.5]

# Stack the three groups and move the index back into a regular column.
df_list = [hold_df, buy_df, sell_df]
new_df = pd.concat(df_list).reset_index(level=0)
new_df

It is now time to fetch the News and Twitter feed. Once both feeds are successfully fetched, sentiment analysis for each stock will be individually conducted for each platform and then the two results will be added and divided by two.
So the final sentiment score will be calculated as follows:
Final Score = (Twitter Sentiment Score + News Feed Sentiment Score) / 2

finviz is going to be used to parse the news data into a Pandas dataframe.

The problem with the data is that, in its current form, it cannot be used by any model. Thus, I will group the headlines of each company into a single string, one string per company.

# Sentiment

In [55]:
# Score the stored news of one ticker with VADER and save the result.
ticker = 'AAPL'
news = collection.find_one({'ticker': ticker})['news']
vader = SentimentIntensityAnalyzer()
for n in news:
    polarity = vader.polarity_scores(n['text'])
    print(n['url'])
    print(polarity)

    # The update document used to be an undefined `doc` left over from other
    # cells; build it here so the cell runs on a fresh kernel.
    # NOTE(review): the 'sentiments' field name and layout mirror how
    # 'recommendations' entries are stored — confirm the intended schema.
    doc = {
        'sentiments': {
            'url': n['url'],
            'scores': polarity,
        }
    }

    collection.update_one(
        {'ticker' : ticker},
        {'$addToSet': doc},
    )
    # Only the first article is processed for now (trial run).
    break

https://finance.yahoo.com/news/focus-amid-pandemic-investors-bet-003000363.html
{'neg': 0.013, 'neu': 0.927, 'pos': 0.059, 'compound': 0.9838}


In [None]:
# Collapse the headlines of each ticker into one string per ticker.
# - Join with a space so that the last word of one headline and the first
#   word of the next do not fuse into a single token before scoring.
# - `agg` has no in-place mode: the stray `Inplace=True` keyword was at best
#   ignored and at worst forwarded to `str.join`, which takes no keyword
#   arguments.
# NOTE(review): `parsed_and_scored_news` must already exist; no cell in this
# notebook creates it.
parsed_and_scored_news = (
    parsed_and_scored_news
    .groupby(['ticker'], as_index=False)
    .agg({'headline': ' '.join})
)
parsed_and_scored_news

In [None]:
vader = SentimentIntensityAnalyzer()

# One polarity dict per row of the 'headline' column.
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()

# Reuse the frame's own index: DataFrame.join aligns rows on index labels, so
# a fresh 0..n-1 index would attach scores to the wrong rows (or produce NaN)
# whenever the frame is not indexed 0..n-1.
scores_df = pd.DataFrame(scores, index=parsed_and_scored_news.index)

# rsuffix keeps the join from failing on overlapping column names, e.g. when
# this cell is run a second time.
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')

In [None]:
# Inspect the frame after the sentiment score columns were joined onto it.
parsed_and_scored_news

In [None]:
import spacy

# Load the small English pipeline (installed with
# `python -m spacy download en_core_web_sm`) and run it on a sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)

# Print every entity found in the sentence with its character offsets and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
import pandas as pd

In [None]:
# List the contents of the stocks_latest directory (the price CSV read in
# this notebook lives there).
!ls stocks_latest

In [None]:
# Load the dataset summary and preview its first rows.
# NOTE(review): `df` is rebound to the stock prices by the following
# read_csv cell, so this frame is only inspected here.
df = pd.read_csv('dataset_summary.csv')
df.head()

In [None]:
# Load the latest stock prices, parsing the 'date' column as datetimes so it
# can serve as a time index, and preview the first rows.
df = pd.read_csv('stocks_latest/stock_prices_latest.csv', parse_dates=['date'])
df.head()

In [None]:
# Summary of the frame (columns, dtypes, non-null counts) — useful to check
# that 'date' was parsed as a datetime column.
df.info()

In [None]:
# Keep only the rows of the selected symbol(s); `stock` is a list so more
# symbols can be added to the filter.
stock = ['A']
A = df[df.symbol.isin(stock)]

In [None]:
# Index the rows by date, in chronological order. `A` is a filtered slice of
# `df`; rebinding it to a new frame instead of mutating it in place avoids
# pandas' SettingWithCopyWarning.
A = A.set_index('date').sort_index()

In [None]:
# Preview the first rows of the date-indexed frame for symbol 'A'.
A.head()

In [None]:
# Closing price from 2015 onwards. `figsize` is a matplotlib argument, and the
# notebook sets pandas' default plotting backend to plotly, which rejects it —
# so request the matplotlib backend explicitly for this plot.
A.truncate(before='2015-01-01')['close'].plot(backend='matplotlib', figsize=(16,12))

In [None]:
# Group the price rows by (date, symbol) pair — note that, despite the name,
# the grouping key is not the symbol alone.
by_symbol = df.groupby(['date','symbol'])

In [None]:
# Displays only the GroupBy object's repr; no aggregation has been computed yet.
by_symbol