In [23]:
def clean_and_tokenize_text(text):
    """
    Lower-cases `text`, tokenizes it and returns the lemmatized content words.

    Tokens that are not purely alphabetic (punctuation, numbers, symbols) and
    English stop words are dropped before lemmatization.

    Input
    -----
    text: str, raw text to clean

    Output
    ------
    lemmatized: list of str, lemmatized tokens in their original order
    """
    # need to adjust this for better cleaning
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    lower_alpha_tokens = [w for w in word_tokenize(text.lower()) if w.isalpha()]

    # Build the stop-word set once; rebuilding it inside the comprehension
    # re-reads the whole stop-word list for every single token.
    stop_words = set(stopwords.words('english'))
    no_stop = [t for t in lower_alpha_tokens if t not in stop_words]

    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in no_stop]

def quality_check(article, min_length=150):
    """
    Checks the quality of the `article` and returns False if it doesn't pass
    the QA check.

    Input
    -----
    article: str or None, text to check
    min_length: int, default 150; the article must be strictly longer than
        this many characters to pass

    Output
    ------
    bool, True when `article` is non-empty and longer than `min_length`
    """
    # need to find ways for better QA
    # bool() first so that None / '' short-circuit before len() is called.
    return bool(article) and len(article) > min_length

def concat_news(ticker, period=1):
    """
    Reads all news stored in the DB for the `ticker`, cleans every article and
    concatenates the cleaned texts per calendar day.

    Articles that fail `quality_check` are left out of the per-day text, but
    their tokens are still counted in the returned bag of words. An article
    that raises while being processed is reported with `print` and skipped.

    Input
    -----
    ticker: str, ticker symbol whose news document is looked up
    period: int, default 1 (day). NOTE: currently not used by the
        implementation; news is always grouped per calendar day.

    Output
    ------
    resulting_dictionary: dict, key–day ('YYYY-MM-DD'), value–all cleaned news
        for that day, separated by a single space
    Counter(all_tokens): token counts over every processed article

    Both outputs are empty when no document (or no news) exists for `ticker`.
    """
    from collections import Counter

    document = collection.find_one({'ticker': ticker})
    # find_one returns None when nothing matches; treat that as "no news"
    # rather than failing with an opaque TypeError on None['news'].
    news = (document or {}).get('news') or []

    texts_by_date = {}
    all_tokens = []

    for n in news:
        try:
            date = str(pd.to_datetime(n['datetime']).date())
            tokens = clean_and_tokenize_text(n['text'])
            text = ' '.join(tokens)

            all_tokens.extend(tokens)

            if quality_check(text):
                texts_by_date.setdefault(date, []).append(text)
        except Exception as e:
            # The record may be the very thing that is malformed, so do not
            # assume it has a 'url' key while reporting the failure.
            url = n.get('url', 'unknown') if isinstance(n, dict) else 'unknown'
            print(f"Cannot process url: {url}\nError: {e}")

    # Join with a space: plain concatenation would fuse the last token of one
    # article with the first token of the next.
    resulting_dictionary = {
        day: ' '.join(texts) for day, texts in texts_by_date.items()
    }

    return resulting_dictionary, Counter(all_tokens)

In [3]:
def collect_news_urls(ticker):
    """Grabs news URLs from Yahoo news.

    Opens the Yahoo Finance quote page for `ticker` in headless Firefox,
    scrolls to the bottom repeatedly until the number of `//h3/a` links stops
    growing, and collects the distinct `href` values of those links.

    Any error is logged and swallowed, so the function returns whatever URLs
    were gathered before the failure (possibly none).

    Parameters
    ----------
    ticker: str, ticker symbol to collect news for

    Output
    ------
    url: list, list of URLs
    """

    import logging
    import time
    from random import randint

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options

    js = """var scrollingElement = (document.scrollingElement || document.body);
                    scrollingElement.scrollTop = scrollingElement.scrollHeight;"""

    url = f'https://finance.yahoo.com/quote/{ticker}'
    urls_list = set()
    # Defined before the try so the cleanup below can tell whether the driver
    # ever started.
    browser = None

    try:
        options = Options()
        options.add_argument("--headless")
        # No explicit executable_path: 'geckodriver' is already the Selenium 3
        # default, and the keyword no longer exists in recent Selenium 4.
        browser = webdriver.Firefox(options=options)
        browser.get(url)
        logging.info(f'Headless Firefox Initialized for URL: {url}')
        browser.execute_script(js)
        time.sleep(randint(1,10))
        # find_elements(By.XPATH, ...) works on Selenium 3 and 4, unlike the
        # removed find_elements_by_xpath helper.
        items_list = browser.find_elements(By.XPATH, '//h3/a')
        num_of_items = 0
        # Keep scrolling until a scroll no longer loads additional links.
        while num_of_items != len(items_list):
            num_of_items = len(items_list)
            browser.execute_script(js)
            time.sleep(randint(3,5))
            items_list = browser.find_elements(By.XPATH, '//h3/a')

        logging.info(f'Found {len(items_list)} urls for {ticker}')

        for item in items_list:
            href = item.get_attribute('href')
            # Anchors without an href yield None; skip them.
            if href:
                urls_list.add(href)

        logging.debug(f'Found {len(urls_list)} urls.')
    except Exception as e:
        logging.error(f'Extracting URLs failed. Error:\n{e}')
    finally:
        # Always release the browser, but only if it was actually created;
        # otherwise a failed start-up would raise UnboundLocalError here and
        # mask the real error.
        if browser is not None:
            browser.quit()

    return list(urls_list)

In [6]:
import logging
import sys

# Shared log line layout: timestamp, a pipe, then the message.
LOG_FORMAT = '%(asctime)s| %(message)s'

# Module logger and console handler. NOTE: the handler is configured here but
# not attached to `logger` in this cell; the messages below go through the
# root logger set up by basicConfig.
logger = logging.getLogger(__name__)
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
c_handler.setFormatter(logging.Formatter(LOG_FORMAT))

# Root logger: INFO and above, short two-digit-year timestamps.
logging.basicConfig(
    format=LOG_FORMAT
    ,datefmt='%y-%m-%d %H:%M:%S'
    ,level=logging.INFO)

logging.info('testing')
# ticker = 'AAPL'
# t_urls = collect_news_urls(ticker)
# t_urls[:5]

20-07-01 12:50:44| testing


In [2]:
from yahoo_fin import stock_info as si 
import pymongo as pm
import pandas as pd
from datetime import date

# create dataset for predicting pricing for a company
# NOTE: this cell needs `concat_news` to be defined first; run the cell that
# defines it before this one.

# set the company ticker
ticker = 'AAPL'

# set interval
start_date = '01/01/2019'
end_date = date.today()

# obtain historical stock pricing data
t_data = si.get_data(ticker, start_date=start_date, end_date=end_date)

# Start MongoDB
# !brew services start mongodb-community@4.2

# Stop MongoDB
# !brew services stop mongodb-community@4.2

# connect to DB
client = pm.MongoClient('mongodb://localhost:27017')
collection = client['news']['recommendations']

# get the dictionary with all news per period (1 day) and BOW (bag of words)
t_news, t_bow = concat_news(ticker, period=1)

# convert to pandas df (one row per day, indexed by the day)
t_df = pd.DataFrame.from_dict(t_news, orient='index', columns=['text'])
t_df.index = pd.to_datetime(t_df.index)

# combine news with stock price 'adjclose'
t_combined = pd.concat([t_df, t_data['adjclose']], axis=1)

# back-filling only makes sense on a chronologically ordered index
t_combined = t_combined.sort_index()

# fill adjclose for over the weekends and holiday
# logic is the price on monday close is the result of 
# the news over the weekend hence backfill
t_combined['adjclose'] = t_combined['adjclose'].bfill()

# drop rows without the stock price
t_combined = t_combined[t_combined['adjclose'].notna()]
t_combined.tail(20)

NameError: name 'concat_news' is not defined

In [71]:
# Write the combined news/price frame to to_predict/<ticker>.csv, with the
# index stored in a 'date' column. The folder is created first because to_csv
# does not create missing directories.
from pathlib import Path

output_dir = Path('to_predict')
output_dir.mkdir(parents=True, exist_ok=True)
t_combined.to_csv(output_dir / f'{ticker}.csv', index_label='date')