In [108]:
# Author: Michael Djaballah
# Last edited: 5:49 PM June 3, 2020
# Last edited by: Michael Djaballah

import yfinance as yf
import pandas as pd
from datetime import datetime
import os
from time import sleep
from dateutil.relativedelta import relativedelta

In [109]:
# Notebook-level settings. Apart from date_format (passed to build_snp further
# down), the functions defined below carry these same values as their own
# keyword defaults rather than reading these globals.
data_path = 'data/'        # directory prefix; functions concatenate it directly, so keep the trailing slash
date_format = '%B %d, %Y'  # strptime format, e.g. 'January 1, 2020'
start = '2000-01-01'       # same value as get_data's default start
interval = '1mo'           # same value as get_data's default interval
prefix = 'monthly/'        # sub-directory appended to data_path; same value as get_data's default

In [110]:
# Functions to maintain and call the S&P 500 from a current date
# Author Michael Djaballah
# Time last edited: 5:56 PM June 1, 2020
# Last edited by: Michael Djaballah

# Input: optional data_path (directory prefix to save into; default 'data/')
# Output: None; saves CSVs containing the current makeup of the S&P 500
# and its historical additions and removals
# data_path is changeable depending on desired save location
def get_snp_store(data_path='data/'):
    """Scrape the S&P 500 Wikipedia page and save two CSV snapshots.

    Writes ``snp_current.csv`` (the first table on the page) and
    ``snp_hist.csv`` (the second table reduced to ``Date``, ``Added`` and
    ``Removed`` ticker columns) under ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory prefix; file names are appended by plain concatenation,
        so it must end with a path separator. Created if missing.

    Returns
    -------
    None
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    tables = pd.read_html(wiki_url)
    constituents, changes = tables[0], tables[1]

    # The change table has two-level column headers; keep the date plus the
    # ticker sub-column of the "Added" and "Removed" groups.
    change_log = pd.DataFrame(changes['Date'])
    change_log['Added'] = changes['Added', 'Ticker']
    change_log['Removed'] = changes['Removed', 'Ticker']

    os.makedirs(data_path, exist_ok=True)

    for frame, file_name in ((constituents, 'snp_current.csv'),
                             (change_log, 'snp_hist.csv')):
        frame.to_csv(data_path + file_name, index=False)


# Input: a date in string form with its corresponding format:
# Ex: 'January 1, 2020', '%B %d, %Y'
# Output: a list containing the S&P 500 at the input date
def build_snp(date, date_format, data_path='data/'):
    """Reconstruct the S&P 500 ticker list as of ``date``.

    Starts from the membership saved in ``snp_current.csv`` and walks the
    change log in ``snp_hist.csv`` from the top, undoing each change dated on
    or after ``date``: a ticker that was added is taken out, and a ticker
    that was removed is put back.

    Parameters
    ----------
    date : str
        Target date, e.g. ``'January 1, 2020'``.
    date_format : str
        ``strptime`` format used for BOTH ``date`` and the ``Date`` column of
        ``snp_hist.csv``, e.g. ``'%B %d, %Y'``.
    data_path : str
        Directory prefix holding the two CSVs (must end with a separator).

    Returns
    -------
    list of str
        Tickers in the index at ``date``; order is not defined.

    Notes
    -----
    The loop stops at the first row older than ``date``, so the change log
    must be ordered newest-first.
    """
    curr = pd.read_csv(data_path + 'snp_current.csv')
    hist = pd.read_csv(data_path + 'snp_hist.csv')

    as_of = datetime.strptime(date, date_format)

    members = set(curr['Symbol'])

    for i in range(len(hist)):
        change = hist.iloc[i]
        if datetime.strptime(change['Date'], date_format) < as_of:
            break

        joined = change['Added']    # entered the index on this date -> undo by removing
        left = change['Removed']    # left the index on this date -> undo by restoring

        members.discard(joined)
        # Missing entries come back from read_csv as NaN; pd.isna recognises
        # both Python and NumPy float NaNs, unlike an exact type comparison.
        if not pd.isna(left):
            members.add(left)

    return list(members)

In [121]:
# Functions to obtain data from yfinance
# Author Michael Djaballah
# Time last edited: 5:40 PM June 3, 2020
# Last edited by: Michael Djaballah

# Takes a list of tickers as strings
# Output is newly saved CSVs, one CSV per company (tickers already saved are skipped)
# Saved in the data_path + prefix directory
# data_path is changeable depending on desired save location
def get_data(tickers, data_path = 'data/', start = '2000-01-01', interval = '1mo', prefix = 'monthly/', return_bad_tickers=False):
    """Download price history for each ticker not already saved on disk.

    For every ticker without a ``<ticker>.csv`` in ``data_path + prefix``,
    fetches history through ``yf.Ticker(...).history``, drops rows containing
    missing values, and writes the result to that file. Sleeps 4 seconds
    after each download.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to make available.
    data_path, prefix : str
        Concatenated to form the output directory (created if missing).
    start, interval : str
        Forwarded to ``history``.
    return_bad_tickers : bool
        When True, return the list described below instead of None.

    Returns
    -------
    list of (str, int) or None
        ``(ticker, row_count)`` for each ticker downloaded in this call whose
        cleaned history has fewer than 90 rows; None unless
        ``return_bad_tickers`` is set.
    """
    out_dir = data_path + prefix
    os.makedirs(out_dir, exist_ok=True)

    # Snapshot of what is already on disk; those tickers are skipped.
    cached = set(os.listdir(out_dir))

    short_history = []

    for symbol in tickers:
        file_name = symbol + '.csv'
        if file_name in cached:
            continue

        history = yf.Ticker(symbol).history(start=start, interval=interval)
        history = history.dropna(axis=0)
        history.to_csv(out_dir + file_name)

        n_rows = len(history)
        if n_rows < 90:
            short_history.append((symbol, n_rows))
        sleep(4)

    return short_history if return_bad_tickers else None

In [123]:
# Functions to manipulate and extract desired data from data saved with "get_data"
# Author Michael Djaballah
# Time last edited: 5:45 PM June 3, 2020
# Last edited by: Michael Djaballah


# Takes a ticker as a string
# Output is either a dataframe with desired data, or False, indicating that there was not enough data to build with the desired offset
# data_path is changeable depending on desired save location
def check_ticker(ticker, offset, data_path='data/', prefix='monthly/'):
    """Load a saved ticker CSV if it has at least ``offset`` rows.

    Parameters
    ----------
    ticker : str
        Ticker symbol; the file read is ``data_path + prefix + ticker + '.csv'``.
    offset : int
        Minimum number of rows required.
    data_path, prefix : str
        Directory pieces, joined by plain concatenation.

    Returns
    -------
    pandas.DataFrame or bool
        The loaded frame, or ``False`` when it has fewer than ``offset`` rows.
    """
    csv_path = ''.join([data_path, prefix, ticker, '.csv'])
    frame = pd.read_csv(csv_path)
    return frame if len(frame) >= offset else False


# Takes a list of tickers as strings, desired features from those tickers, the test depth and historical depth
# Output is a dictionary of data frames: key = ticker string, value = dataframe
# data_path is changeable depending on desired save location
def build_portfolio(tickers, features, test_depth, hist_depth, data_path='data/', start='2000-01-01', interval='1mo', prefix='monthly/'):
    """Build a ``{ticker: DataFrame}`` mapping restricted to the chosen columns.

    Ensures every ticker has a saved CSV (via ``get_data``), then keeps only
    the tickers whose file has at least
    ``test_depth + hist_depth + 60 + 6`` rows.

    Parameters
    ----------
    tickers : list of str
        Ticker symbols to include.
    features : list of str
        Column names to keep; ``'Date'`` is always prepended.
    test_depth, hist_depth : int
        Contribute to the minimum row count required of each ticker.
    data_path, start, interval, prefix
        Forwarded to ``get_data``; ``data_path`` and ``prefix`` are also used
        to read the saved files back.

    Returns
    -------
    dict
        Ticker -> DataFrame with columns ``['Date'] + features``. Tickers
        with too little history are omitted.
    """
    # NOTE(review): the meaning of the extra 60 + 6 rows is not stated in
    # this file -- confirm with the author before changing it.
    offset = test_depth + hist_depth + 60 + 6

    get_data(tickers, data_path=data_path, start=start, interval=interval, prefix=prefix)

    columns = ['Date'] + list(features)

    ticker_dict = {}

    for ticker in tickers:
        # Read from the same location get_data just wrote to; without these
        # keywords check_ticker would fall back to 'data/monthly/'.
        ticker_df = check_ticker(ticker, offset, data_path=data_path, prefix=prefix)
        if ticker_df is not False:
            ticker_dict[ticker] = ticker_df[columns]

    return ticker_dict


# Takes a portfolio (from 'build_portfolio'), a desired ticker, date desired to predict on, depth desired
# Output is a dataframe with one row of the desired features from previous dates
# Can change 'keep_pred' to True if training or False if predicting
# Target value is present in 'Target' column if 'keep_pred' = True
def build_feature_vector(portfolio, ticker, date, hist_depth, target='Close', keep_pred=True):
    """Flatten a window of one ticker's history into a single-row DataFrame.

    The window spans the rows whose ``Date`` label lies between
    ``date - hist_depth months`` and ``date`` (both ends inclusive). Every
    row except the last becomes columns named ``'<column> <k>'`` with ``k``
    counting from 1; the last row contributes only ``target``, stored as
    ``'Target'``, and only when ``keep_pred`` is True.

    Parameters
    ----------
    portfolio : dict
        Mapping of ticker -> DataFrame containing a ``'Date'`` column, as
        produced by ``build_portfolio``.
    ticker : str
        Key into ``portfolio``.
    date : str
        End of the window, formatted ``'%Y-%m-%d'``.
    hist_depth : int
        Number of months to look back from ``date``.
    target : str
        Column supplying the ``'Target'`` value.
    keep_pred : bool
        True to include ``'Target'`` (training); False to omit it (predicting).

    Returns
    -------
    pandas.DataFrame
        One row; ``'Target'`` is the last column when present. Empty if no
        rows fall inside the window.

    Raises
    ------
    ValueError
        If ``date`` does not match ``'%Y-%m-%d'``.
    """
    ticker_df = portfolio[ticker]

    end_dt = datetime.strptime(date, '%Y-%m-%d')
    start_dt = pd.Timestamp(end_dt) - pd.DateOffset(months=hist_depth)
    start_date = start_dt.strftime('%Y-%m-%d')

    # Dates are compared as strings, so the label slice is inclusive on both ends.
    window = ticker_df.set_index('Date').loc[start_date:date].reset_index(drop=True)

    # Previously this dict was created under a different name than the one
    # the loop wrote to, which raised NameError on a fresh kernel.
    new_df_dict = {}

    last_row = len(window) - 1
    for i in range(len(window)):
        for col in window.columns:
            if i < last_row:
                new_df_dict[col + ' ' + str(i + 1)] = [window[col].iloc[i]]
            elif col == target and keep_pred:
                new_df_dict['Target'] = [window[col].iloc[i]]

    new_df = pd.DataFrame.from_dict(new_df_dict)

    # Keep 'Target' as the final column; skip when it was never produced
    # (empty window or target column absent).
    if keep_pred and 'Target' in new_df.columns:
        new_df = new_df[[col for col in new_df.columns if col != 'Target'] + ['Target']]

    return new_df

In [124]:
# Fetch AAPL and MSFT with get_data's defaults (written under 'data/monthly/');
# tickers whose CSV already exists there are skipped.
get_data(['AAPL', 'MSFT'])

In [125]:
# Keep Close and Volume for each ticker; with test_depth=3 and hist_depth=24
# a ticker needs at least 3 + 24 + 60 + 6 = 93 saved rows to be included.
port = build_portfolio(['MSFT', 'AAPL'], ['Close', 'Volume'], 3, 24)

In [126]:
# Training-style vector for MSFT: rows from 12 months before 2015-01-01 up to
# that date; the final row in the window supplies 'Target' (keep_pred defaults to True).
build_feature_vector(port, 'MSFT', '2015-01-01', 12)

Unnamed: 0,Close 1,Volume 1,Close 2,Volume 2,Close 3,Volume 3,Close 4,Volume 4,Close 5,Volume 5,...,Volume 8,Close 9,Volume 9,Close 10,Volume 10,Close 11,Volume 11,Close 12,Volume 12,Target
0,32.88,930226200.0,33.29,705304500.0,35.89,778425700.0,35.37,746112500.0,35.84,574362900.0,...,513919700.0,41.13,860827300.0,41.66,853260900.0,42.42,523005900.0,41.47,626771200.0,36.07


In [119]:
# Reconstruct index membership as of January 1, 2015. build_snp reads
# snp_current.csv and snp_hist.csv, which get_snp_store() writes, so that
# function must have been run at least once beforehand.
universe = build_snp('January 1, 2015', date_format)

In [104]:
# Display the reconstructed ticker list (order is arbitrary: it comes from a set).
universe

['GL',
 'DG',
 'NDAQ',
 'JNJ',
 'MON',
 'RTN',
 'MAC',
 'ATI',
 'OI',
 'ADP',
 'HWM',
 'XLNX',
 'PEAK',
 'RL',
 'COG',
 'CNP',
 'DUK',
 'PG',
 'AEE',
 'RCL',
 'ORCL',
 'EBAY',
 'EA',
 'MLM',
 'V',
 'NOC',
 'F',
 'BKR',
 'MUR',
 'ADBE',
 'COV',
 'D',
 'DTV',
 'PGR',
 'NVDA',
 'ALL',
 'DFS',
 'WRK',
 'DLTR',
 'AMGN',
 'VIAB',
 'SNA',
 'GME',
 'ABC',
 'FMC',
 'CI',
 'ADM',
 'ESRX',
 'SWN',
 'PLL',
 'APH',
 'L',
 'MCK',
 'WYNN',
 'GT',
 'XRAY',
 'PEP',
 'IQV',
 'RHT',
 'VZ',
 'HCBK',
 'PCAR',
 'WM',
 'SNDK',
 'POM',
 'CVX',
 'NSC',
 'MAT',
 'DIS',
 'AMCR',
 'FITB',
 'OXY',
 'JPM',
 'GRMN',
 'MNK',
 'SIAL',
 'TT',
 'RIG',
 'KSS',
 'MPC',
 'FTI',
 'VNO',
 'WU',
 'BF.B',
 'AIZ',
 'ABT',
 'FLIR',
 'URBN',
 'MDLZ',
 'AFL',
 'AME',
 'EXC',
 'GNW',
 'KSU',
 'EW',
 'PRGO',
 'VRTX',
 'BRK.B',
 'ROST',
 'AVP',
 'JOY',
 'BKNG',
 'TROW',
 'IP',
 'TRIP',
 'GM',
 'LHX',
 'BLL',
 'PRU',
 'CMA',
 'DISCK',
 'REGN',
 'NFX',
 'HOT',
 'EIX',
 'NTAP',
 'SJM',
 'SE',
 'DISCA',
 'AVY',
 'TDC',
 'GOOGL',
 'CELG',