# Prepare CSV for AutoML

# Clean

In [1]:
def clean_and_tokenize_text(text):
    """
    Tokenizes `text` and returns its cleaned, lemmatized tokens.

    The text is lower-cased and split with NLTK's `word_tokenize`. Tokens
    that are not purely alphabetic (punctuation, numbers, mixed symbols)
    and English stopwords are dropped; what is left is lemmatized with
    `WordNetLemmatizer`.

    Input
    -----
    text: str, raw text to clean

    Output
    ------
    lemmatized: list of str, surviving tokens in their original order
    """
    # need to adjust this for better cleaning
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    # build the stopword set once per call instead of once per token
    english_stopwords = set(stopwords.words('english'))

    lower_alpha_tokens = [w for w in word_tokenize(text.lower()) if w.isalpha()]
    no_stop = [t for t in lower_alpha_tokens if t not in english_stopwords]

    wnl = WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(t) for t in no_stop]
    return lemmatized

def quality_check(article):
    """
    Tells whether `article` passes the QA check.

    Returns True only when `article` is non-empty and its length is
    greater than 250; returns False otherwise (including for None).
    """

    # need to find ways for better QA
    if not article:
        return False
    return len(article) > 250

def concat_news(ticker, period=1):
    """
    Reads all news from the `DB` for the `ticker` and concatenates the
    cleaned article texts per calendar day.

    Each news item is tokenized with `clean_and_tokenize_text`; its tokens
    are joined back with single spaces and, if the result passes
    `quality_check`, it is appended to the text of its day. Articles of
    the same day are separated by a single space so that tokens of
    neighbouring articles do not fuse together. Items that cannot be
    processed are reported with `print` and skipped.

    Input
    -----
    ticker: str, ticker whose document is looked up in the global `collection`
    period: int, default 1 (day)
        NOTE: currently not used; news is always grouped by single day.

    Output
    ------
    resulting_dictionary: dict, key–day ('YYYY-MM-DD' string),
        value–all news for that day
    Counter(all_tokens): token frequencies over every processed item,
        including items that did not pass the quality check
    """
    from collections import Counter, defaultdict

    # NOTE(review): presumably `find_one` yields None for an unknown ticker,
    # in which case the subscript below raises TypeError — confirm.
    news = collection.find_one({
        'ticker' : ticker
    })['news']

    # day -> list of article texts, in the order the articles were read
    texts_by_day = defaultdict(list)

    all_tokens = []

    for n in news:
        try:
            day = str(pd.to_datetime(n['datetime']).date())

            tokens = clean_and_tokenize_text(n['text'])

            text = ' '.join(tokens)

            all_tokens.extend(tokens)

            if quality_check(text):
                texts_by_day[day].append(text)
        except Exception as e:
            # best effort: one malformed item must not abort the whole run;
            # `.get` keeps the report itself from failing when 'url' is missing
            print(f"Cannot process url: {n.get('url')}\nError: {e}")

    resulting_dictionary = {
        day: ' '.join(texts) for day, texts in texts_by_day.items()
    }

    return resulting_dictionary, Counter(all_tokens)

# Creating the dataset for AutoML

In [2]:
from yahoo_fin import stock_info as si 
import pymongo as pm
import pandas as pd
from datetime import date

# create dataset for predicting pricing for a company

# set the company ticker
ticker = 'AAPL'

# set interval
start_date = '01/01/2017'
end_date = date.today()

# obtain historical stock pricing data
t_data = si.get_data(ticker, start_date=start_date, end_date=end_date)

# Start MongoDB
# !brew services start mongodb-community@4.2

# Stop MongoDB
# !brew services stop mongodb-community@4.2

# connect to DB
client = pm.MongoClient('mongodb://localhost:27017')
collection = client['news']['recommendations']

# get the dictionary with all news per period (1 day) and BOW (bag of words)
t_news, t_bow = concat_news(ticker, period=1)

# convert to pandas df
t_df = pd.DataFrame.from_dict(t_news, orient='index', columns=['text'])
t_df.index = pd.to_datetime(t_df.index)

# combine news with stock price 'adjclose'
# sort by date explicitly: the backfill below is only meaningful on a
# chronologically ordered index
t_combined = pd.concat([t_df, t_data.adjclose], axis=1).sort_index()

# fill adjclose for over the weekends and holiday
# logic is the price on monday close is the result of
# the news over the weekend hence backfill
# this is done BEFORE dropping the rows without news, so a non-trading day
# always receives the close of the next trading day, even when that
# trading day itself has no news
t_combined['adjclose'] = t_combined['adjclose'].bfill()

# drop rows without the news
t_combined = t_combined[t_combined.text.notna()]

# drop rows without the stock price
# (news dated after the last available price has nothing to backfill from)
t_combined = t_combined[t_combined.adjclose.notna()]


t_combined.tail(20)

Cannot process url: http://www.ft.com/cms/s/9c847838-ca25-43cf-a962-0d3588d8d34d,s01=1.html?ftcamp=traffic/partner/feed_headline/us_yahoo/auddev
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/the-stock-market-is-headed-for-a-huge-quarter-is-a-tech-bubble-in-the-making-51592948400?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/mercedes-benz-nvidia-sign-deal-to-make-cars-more-like-iphones-51592933400?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.barrons.com/articles/apple-stock-tim-cook-keynote-wwdc-speech-51592929033?siteid=yhoof2
Error: 'NoneType' object has no attribute 'date'
Cannot process url: https://www.reuters.com/article/uk-goldman-sachs-probe-idUKKBN1XK00D
Error: 'datetime'
Cannot process url: https://www.marketwatch.com/story/dow-trades-at-session-low-after-fed-statement-with-less-than-a-half-hour-left-in-

Unnamed: 0,text,adjclose
2020-06-13,dow jones industrial average continued rally s...,342.98999
2020-06-14,warren buffett trade portfolio put protect dow...,342.98999
2020-06-15,secret retail investor stepping participation ...,342.98999
2020-06-16,douglas busvine berlin reuters germany sought ...,352.079987
2020-06-17,earlier week apple nasdaq aapl released result...,351.589996
2020-06-18,printed google logo placed apple macbook illus...,351.730011
2020-06-19,two main headline berkshire hathaway nyse nyse...,349.720001
2020-06-20,dow jones future wo begin trading sunday eveni...,358.869995
2020-06-21,bloomberg opinion rhetoric repeated frequently...,358.869995
2020-06-22,share microsoft nike trading higher monday aft...,358.869995


In [3]:
# write one CSV per ticker; the date index is stored as the 'date' column
from pathlib import Path

# `to_csv` does not create missing directories, so make sure it exists
output_dir = Path('to_predict')
output_dir.mkdir(parents=True, exist_ok=True)

t_combined.to_csv(output_dir / f'{ticker}.csv', index_label='date')