# Clean

In [4]:
def clean_and_tokenize_text(text):
    """
    Cleans `text` from punctuation and special symbols
    """    
    # need to adjust this for better cleaning
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    lower_alpha_tokens = [w for w in word_tokenize(text.lower()) if w.isalpha()]
    no_stop = [t for t in lower_alpha_tokens if t not in set(stopwords.words('english'))]
    
    wnl = WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(t) for t in no_stop]
    return lemmatized

def quality_check(article):
    """
    Checks for quality of the `article` and returns False if it doesnt' pass the 
    QA check
    """
    
    # need to find ways for better QA
    if article and len(article) > 150: return True
    return False

def concat_news(c, ticker, period=1):
    """
    Reads all news from the `DB` for the `ticker`, 
    concats by the `period`.
    
    Input
    -----
    c     : collection, MongoDB
    ticker: str, ticker name
    period: int, default 1 (day)
    
    Output
    ------
    resulting_dictionary: dict, key–day, value–all news for that day
    Counter(all_tokens)
    """
    from collections import Counter
    
    news = c.find_one({
        'ticker' : ticker
    })['news']
    
    resulting_dictionary = {}
    
    all_tokens = []
    
    for n in news:
        try:
            date = str(pd.to_datetime(n['datetime']).date())

            tokens = clean_and_tokenize_text(n['text'])

            text = ' '.join(tokens)

            all_tokens.extend(tokens)

            if quality_check(text):
                if date in resulting_dictionary.keys():
                    resulting_dictionary[date] = resulting_dictionary[date] + text
                else:
                    resulting_dictionary[date] = text
        except Exception as e:
            print(f"Cannot process url: {n['url']}\nError: {e}")
    
    return resulting_dictionary, Counter(all_tokens)

# Prototype

Take 1 stock (AAPL), do sentiment and compare to stock price time series

In [2]:
# connect to DB
import pymongo as pm

# Start MongoDB
# !brew services start mongodb-community@4.2

# Stop MongoDB
# !brew services stop mongodb-community@4.2

client = pm.MongoClient('mongodb://localhost:27017')
c = client['news']['recommendations']

In [5]:
from yahoo_fin import stock_info as si 
import pandas as pd
from datetime import date

# create dataset for predicting pricing for a company

# set the company ticker
ticker = 'AAPL'

# set interval
start_date = '01/01/2019'
end_date = date.today()

# obtain historical stock pricing data
t_data = si.get_data(ticker, start_date=start_date, end_date=end_date)

# get the dictionary with all news per period (1 day) and BOW (bag of words)
t_news, t_bow = concat_news(c, ticker, period=1)

# convert to pandas df
t_df = pd.DataFrame.from_dict(d, orient='index', columns=['text'])
t_df.index = pd.to_datetime(t_df.index)

# combine news with stock price 'adjclose'
t_combined = pd.concat([t_df, t_data.adjclose], axis=1)

# fill adjclose for over the weekends and holiday
# logic is the price on monday close is the result of 
# the news over the weekend hence backfill
t_combined.adjclose = t_combined.adjclose.fillna(method='backfill')

# drop rows without the stock price
t_combined = t_combined[t_combined.adjclose.notna()]
t_combined.tail(20)

Cannot process url: http://www.ft.com/cms/s/9c847838-ca25-43cf-a962-0d3588d8d34d,s01=1.html?ftcamp=traffic/partner/feed_headline/us_yahoo/auddev
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/the-stock-market-is-headed-for-a-huge-quarter-is-a-tech-bubble-in-the-making-51592948400?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/mercedes-benz-nvidia-sign-deal-to-make-cars-more-like-iphones-51592933400?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/apple-stock-tim-cook-keynote-wwdc-speech-51592929033?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.reuters.com/article/uk-goldman-sachs-probe-idUKKBN1XK00D
Error: 'datetime'
Cannot process url: https://www.marketwatch.com/story/dow-trades-at-session-low-after-fed-statement-with-less-than-a-half-hour-left-in-

NameError: name 'd' is not defined

In [None]:
t_combined.to_csv(f'to_predict/{ticker}.csv', index_label='date')

In [8]:
!pip install pyppdf
import pyppdf.patch_pyppeteer

Collecting pyppdf
  Downloading pyppdf-0.0.12.tar.gz (25 kB)
Collecting litereval>=0.0.9
  Downloading litereval-0.0.11.tar.gz (20 kB)
Building wheels for collected packages: pyppdf, litereval
  Building wheel for pyppdf (setup.py) ... [?25ldone
[?25h  Created wheel for pyppdf: filename=pyppdf-0.0.12-py3-none-any.whl size=11540 sha256=9c444c42d301f955112f32337b10a60c6fa01bc7289a329a638734014b5bc42e
  Stored in directory: /Users/yegor/Library/Caches/pip/wheels/2c/78/f3/e0f458aa03b76e800123f501f5974ef673ed4a060a15211f3c
  Building wheel for litereval (setup.py) ... [?25ldone
[?25h  Created wheel for litereval: filename=litereval-0.0.11-py3-none-any.whl size=6014 sha256=cbd080a5ea877468b73739bc98090127cdf0fc0647c5b99aaa5c949ad02b62fa
  Stored in directory: /Users/yegor/Library/Caches/pip/wheels/e5/2b/75/5f8b2fe9245ff1e4453c3d0d54a4fe19349da6f35aad50ed8d
Successfully built pyppdf litereval
Installing collected packages: litereval, pyppdf
Successfully installed litereval-0.0.11 pyppdf-0

In [9]:
import asyncio
from pyppeteer import launch

browser = await launch()

[W:pyppeteer.chromium_downloader] start patched secure https chromium download.
Download may take a few minutes.
100%|██████████| 86759503/86759503 [00:02<00:00, 36157765.61it/s]
[W:pyppeteer.chromium_downloader] 
chromium download done.
[W:pyppeteer.chromium_downloader] chromium extracted to: /Users/yegor/Library/Application Support/pyppeteer/local-chromium/588429


In [10]:
browser.close()

<coroutine object Browser.close at 0x7fc7c5f2f740>

## Count number of news per ticker

In [104]:
res = c.aggregate([
    {'$unwind':'$news'},
#     {'$project':{'_id':0,'ticker':1,'total_news':{'$size':'$news'}}},
    {'$group':{'_id':'$ticker', 'total_news': {'$sum':1}}},
    {'$sort':{'total_news':-1}},
#     {'$limit':10}
], allowDiskUse=True)

# for r in res:
#     print(r)
pd.DataFrame(list(res))

Unnamed: 0,_id,total_news
0,FOX,6099
1,ADS,1671
2,C,972
3,AAPL,918
4,D,892
...,...,...
500,WAB,17
501,WAT,17
502,VRSN,16
503,BRK.B,9
