# Analysis of quantitative market variables of US Equities around earnings seasons
### Authored by Sarang Balan under guidance of Jean-Phillipe Maltais

This Python notebook conducts a quantitative and qualitative analysis of the market variables of public US companies in order to discover, analyse and understand potential patterns in the markets during earnings seasons.

### Importing Libraries

In [1]:
### importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import yfinance as yf
import html5lib
import datetime
import time
from datetime import date
from datetime import datetime
from datetime import timedelta



In [22]:
import eventlet
# NOTE(review): monkey_patch() presumably replaces blocking standard-library modules with
# eventlet's cooperative versions; nothing else visible in this notebook uses eventlet
# directly, so confirm this cell is still required.
eventlet.monkey_patch()

In [2]:
from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    """Context manager that discards everything printed while it is active.

    ``sys.stdout`` is pointed at the null device for the duration of the
    ``with`` body and is restored to its previous stream afterwards, even
    when the body raises.
    """
    saved_stream = sys.stdout
    with open(os.devnull, "w") as sink:
        sys.stdout = sink
        try:
            yield
        finally:
            # Always hand the original stream back before the sink is closed
            sys.stdout = saved_stream

### Collecting earnings dates

The yfinance library accesses earnings reports of public companies from the Yahoo Finance website. Note, this library and API access is not officially related to Yahoo, and the data is of acceptable quality for the purposes of this project. 

First step is to create a python function that takes in tickers and start/end date and outputs precise dates of the earnings 

In [3]:
def get_earnings_dates(ticker: str, start_date: str, end_date: str):
    """
    Return the earnings release dates of a stock that fall inside a date window, together
    with the EPS surprise reported for each of them.

    The calendar is read from ``yf.Ticker(ticker).earnings_dates``. Its index is compared
    as timezone-naive timestamps, so the bounds are expected to be timezone-naive as well.

    Arguments:
        ticker (str): Ticker of the Equity
        start_date (str/datetime): Exclusive lower bound of the window, either a
            'YYYY-MM-DD' string or a datetime
        end_date (str/datetime): Inclusive upper bound of the window, either a
            'YYYY-MM-DD' string or a datetime

    returns:
        list_dates: list of timezone-naive timestamps of the earnings releases in the window
        myepslist: list of the 'Surprise(%)' values for those releases, in the same order

    raises:
        ValueError: if no earnings calendar is available for the ticker, or a string bound
            is not in 'YYYY-MM-DD' format

    """

    def as_datetime(value):
        # Bounds may already be datetime objects; only strings need parsing
        if isinstance(value, datetime):
            return value
        return datetime.strptime(value, '%Y-%m-%d')

    stock = yf.Ticker(ticker)

    df = stock.earnings_dates
    # NOTE(review): yfinance appears to return None when it has no calendar for a symbol;
    # fail with a clear message instead of an AttributeError further down.
    if df is None:
        raise ValueError(f'No earnings dates available for {ticker}')

    start_date = as_datetime(start_date)
    end_date = as_datetime(end_date)

    # Compare on a tz-naive copy of the index rather than overwriting df.index, so the
    # frame handed out by yfinance is left untouched.
    naive_index = df.index.tz_localize(None)

    mask = (naive_index > start_date) & (naive_index <= end_date)

    list_dates = list(naive_index[mask])
    myepslist = list(df.loc[mask, 'Surprise(%)'])

    return list_dates, myepslist
    

### Collecting stock prices

Now, given a list of earnings release dates, we need a function that pulls in hourly stock prices from T - 2 to T + 2 trading days, where T is the market closing time on the day of the earnings release.

In [4]:
def get_prices(ticker_history, index_history, t_date: datetime, find_percentage_change: bool = True):
    """
    Collect the close prices of a stock and of an index for the five calendar-day offsets
    T-2 .. T+2 around an earnings release.

    For every offset the rows of that calendar day are selected from both histories with
    a 'YYYY-MM-DD' label. When the stock history has no rows for a day before or after T
    (e.g. a weekend), the day two further days away from T is used instead. That fallback
    is applied once and is not re-checked, so the result can hold fewer than five days.

    Arguments:
        ticker_history: price history of the stock; must support ``.loc['YYYY-MM-DD']``
            and expose a ``Close`` column (callers pass the hourly frame returned by
            ``Ticker.history``)
        index_history: price history of the index, with the same layout
        t_date: Date of earnings release; must provide ``to_pydatetime()``
            (e.g. a pandas Timestamp)
        find_percentage_change (bool): if True, every value is divided by the first value
            of its series (1.0 means unchanged); if False, raw close prices are returned

    return:
        pricelist: list of stock closes (or ratios to the first close)
        indexlist: list of index closes (or ratios to the first close)

    raises:
        IndexError: if find_percentage_change is True and no close was collected for the
            stock or for the index
    """

    def day_label(offset):
        # 'YYYY-MM-DD' label of the calendar day `offset` days away from the release date
        return (t_date + timedelta(days=offset)).to_pydatetime().date().strftime('%Y-%m-%d')

    mylist = []
    indexlist = []

    # Loop over T-2 to T+2 days
    for delta in range(-2, 3):
        label = day_label(delta)
        mydf = ticker_history.loc[label]
        myindexdf = index_history.loc[label]

        # No stock rows for this day: move two more days away from T and try again.
        # The release day itself (delta == 0) has no fallback.
        if mydf.empty and delta != 0:
            label = day_label(delta - 2 if delta < 0 else delta + 2)
            mydf = ticker_history.loc[label]
            myindexdf = index_history.loc[label]

        mylist.extend(mydf.Close)
        indexlist.extend(myindexdf.Close)

    if find_percentage_change:
        pricelist = [x / mylist[0] for x in mylist]
        indexlist = [x / indexlist[0] for x in indexlist]
    else:
        # Raw prices requested: hand the collected closes back unchanged
        pricelist = mylist

    return pricelist, indexlist

### Creating a function that retrieves US Treasury Bond data

One very important market variable is the current level of US Treasury bond yields. It is a useful indicator of risk-free interest rates as well as of economic confidence.

In [5]:
def get_treasury_yield(period: int):
    """
    Retrieve the full available history of a US Treasury yield index from yfinance.

    Arguments:
        period (int): maturity in years; one of 5, 10 or 30

    returns:
        hist: the result of ``Ticker.history(period="max")`` for the selected yield symbol

    raises:
        KeyError: if `period` is not one of the supported maturities
    """

    bond_dict = {
        5: '^FVX',
        10: '^TNX',
        30: '^TYX',  # 30-year Treasury yield index symbol on Yahoo Finance
    }

    if period not in bond_dict:
        raise KeyError(f'Unsupported period {period!r}; expected one of {sorted(bond_dict)}')

    bill = yf.Ticker(bond_dict[period])

    return bill.history(period="max")

In [6]:
# Ticker,  Prices from T-2 to T+2, SPX Index change from T-2 to T+2,  Treasury bill 10 yr yield, Treasury bill 10yr - 2yr, Sector, Rating, EPS consensus

### Retrieving Sector Data

Sector of the particular Equity can be retrieved from the following code

In [7]:
def get_sector(ticker: str):
    """Return the 'sector' entry of the yfinance info dictionary for `ticker`.

    A KeyError propagates if the info dictionary has no 'sector' entry.
    """
    info = yf.Ticker(ticker).info
    return info['sector']

### Retrieving past quarter EPS

Retrieving EPS data of the year-over-year quarter. 

In [8]:
# Build a yfinance Ticker handle for MSFT and show it as the cell result
stock = yf.Ticker('MSFT')

stock

yfinance.Ticker object <MSFT>

### Creating a section of our final database

This section collects all the information pertaining to one company's past 7 or 8 earnings seasons and places it in a DataFrame of standard format. The code written here can then be reused for the tickers of other companies.

In [35]:
# Columns of the final dataset: release dates, ticker, Treasury yield fields, sector,
# EPS surprise, and the stock / index price windows around each release.
colnames = [
    'date', 'ticker', 'TB10yr', 'TBslope',
    'sector', 'EPS', 'stock_price', 'snp_price',
]
# Every column starts out as its own empty list
mydict = dict((column, []) for column in colnames)
mydict

{'date': [],
 'ticker': [],
 'TB10yr': [],
 'TBslope': [],
 'sector': [],
 'EPS': [],
 'stock_price': [],
 'snp_price': []}

In [17]:
# Load the constituent list and keep every row whose sector is not 'Financials'
stocklist = pd.read_csv('constituents.csv')
stocklist = stocklist.loc[stocklist['Sector'].ne('Financials')]
stocklist

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A. O. Smith,Industrials
2,ABT,Abbott Laboratories,Health Care
3,ABBV,AbbVie,Health Care
4,ABMD,Abiomed,Health Care
...,...,...,...
499,XYL,Xylem,Industrials
500,YUM,Yum! Brands,Consumer Discretionary
501,ZBRA,Zebra Technologies,Information Technology
502,ZBH,Zimmer Biomet,Health Care


In [18]:
# Fetch the 10-year history first, then the 5-year history
TenYearYields, FiveYearYields = (
    get_treasury_yield(period=years) for years in (10, 5)
)

In [19]:
# Two years of hourly ^GSPC data; passed to get_prices as index_history in the loops below
indexhist = yf.download('^GSPC', interval='1h', period='2y')

[*********************100%***********************]  1 of 1 completed


In [24]:
# Fill mydict for one slice of the constituent list.
#
# All data for a ticker is gathered first and written to mydict only once every
# step has succeeded, so a failing ticker never leaves partial rows behind.
# NOTE: 'date', 'EPS' and 'sector' receive one entry per ticker, while 'ticker',
# 'stock_price' and 'snp_price' receive one entry per earnings release.
failedlist = []

mytickers = yf.Tickers(list(stocklist['Symbol'])[15:80])


for tick, stock in mytickers.tickers.items():
    start_time = time.time()
    print(tick)

    try:
        # Fetch the earnings calendar first: if it fails, the hourly history is never downloaded
        mylist, myepslist = get_earnings_dates(ticker=tick, start_date='2020-08-26', end_date='2022-08-26')

        # get historical market data
        hist = stock.history(period='2y', interval='1h')

        # One (pricelist, indexlist) pair per earnings release of this ticker.
        # Iterating mylist directly keeps the dates tied to this ticker even when
        # earlier tickers failed and added nothing to mydict['date'].
        with suppress_stdout():
            windows = [
                get_prices(ticker_history=hist, index_history=indexhist, t_date=item)
                for item in mylist
            ]

        sector = get_sector(tick)

        print('Price data recieved')

    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt still stops the loop
        print(f'bruh {tick}')
        failedlist.append(tick)
        continue

    mydict['date'].append(mylist)
    mydict['EPS'].append(myepslist)

    for pricelist, indexlist in windows:
        mydict['ticker'].append(tick)
        mydict['stock_price'].append(pricelist)
        mydict['snp_price'].append(indexlist)

    mydict['sector'].append(sector)

    #mydict['TB10yr'].append(tenyear)
    #mydict['TBslope'].append(np.subtract(tenyear,fiveyear ))

    print("--- %s seconds ---" % (time.time() - start_time))
    
    

ALK
Price data recieved
--- 23.483407497406006 seconds ---
ALB
bruh ALB
ARE
bruh ARE
ALGN
bruh ALGN
ALLE
bruh ALLE
LNT
bruh LNT
GOOGL
bruh GOOGL
GOOG
bruh GOOG
MO
bruh MO
AMZN
bruh AMZN
AMCR
bruh AMCR
AEE
bruh AEE
AAL
bruh AAL


KeyboardInterrupt: 

In [33]:
# Cache the earnings dates and EPS surprises per ticker in mylistd / myepsd.
# NOTE(review): mylistd and myepsd must already exist as dicts when this cell runs;
# they are not created here so that earlier results are kept between runs.
mytickers = yf.Tickers(list(stocklist['Symbol'])[100:200])


for tick in mytickers.tickers:
    print(tick)
    # Already cached by an earlier run of this cell
    if tick in mylistd:
        continue
    try:
        mylist, myepslist = get_earnings_dates(ticker=tick, start_date='2020-08-26', end_date='2022-08-26')
    except Exception as err:
        # Report why the ticker was dropped instead of skipping it silently;
        # KeyboardInterrupt is not caught and still stops the loop.
        print(f'failed {tick}: {err!r}')
        continue
    mylistd[tick] = mylist
    myepsd[tick] = myepslist
    print(f'retrieved {tick}')

CMCSA
retrieved CMCSA
CAG
retrieved CAG
COP
retrieved COP
ED
retrieved ED
STZ
retrieved STZ
CPRT
retrieved CPRT
GLW
retrieved GLW
CTVA
retrieved CTVA
COST
retrieved COST
CTRA
retrieved CTRA
CCI
retrieved CCI
CSX
retrieved CSX
CMI
retrieved CMI
CVS
retrieved CVS
DHI
retrieved DHI
DHR
retrieved DHR
DRI
DVA
DE
DAL
XRAY
DVN
DXCM
FANG
DLR
DISCA
DISCK
DISH
DG
DLTR
D
DPZ
DOV
DOW
DTE
DUK
DRE
DD
retrieved DD
DXC
retrieved DXC
EMN
retrieved EMN
ETN
retrieved ETN
EBAY
retrieved EBAY
ECL
retrieved ECL
EIX
retrieved EIX
EW
EA
LLY
EMR
ENPH
ETR
EOG
EFX
EQIX
EQR
ESS
EL
ETSY
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FB
retrieved FB
FAST
retrieved FAST
FRT
retrieved FRT
FDX
retrieved FDX
FIS
retrieved FIS
FE
retrieved FE
FISV
retrieved FISV
FLT
retrieved FLT
FMC
F
FTNT
FTV
FBHS
FOXA
FOX
FCX
GPS
GRMN
IT
GNRC
GD
GE
GIS
GM
GPC
GILD
GPN
HAL
HBI
HAS
retrieved HAS
HCA
retrieved HCA
PEAK
retrieved PEAK
HSIC
retrieved HSIC
HES
retrieved HES
HPE
retrieved HPE


In [34]:
import pickle

# Persist the cached earnings dates and EPS surprises.
# The with-blocks close each file even if pickle.dump raises.
with open("mylist.obj", "wb") as filehandler:
    pickle.dump(mylistd, filehandler)


with open("myeps.obj", "wb") as filehandler:
    pickle.dump(myepsd, filehandler)

In [42]:
# Fill mydict from the cached earnings dates (mylistd) and EPS surprises (myepsd).
#
# All data for a ticker is gathered first and written to mydict only once every
# step has succeeded, so a failing ticker never leaves partial rows behind.
failedlist = []

# Tickers that already have price rows in mydict. Built once: every key of mylistd is
# visited a single time, so the set does not need updating inside the loop.
already_done = set(mydict['ticker'])


for tick, mylist in mylistd.items():
    start_time = time.time()

    if tick in already_done:
        print('skipped')
        continue

    print(tick)
    try:
        #Data of each stock and index
        stock = yf.Ticker(str(tick))

        # get historical market data
        hist = stock.history(period='2y', interval='1h')
        myepslist = myepsd[tick]

        # One (pricelist, indexlist) pair per earnings release of this ticker.
        # Iterating mylist directly avoids indexing mydict['date'] by loop position,
        # which goes out of step once a ticker is skipped or fails.
        with suppress_stdout():
            windows = [
                get_prices(ticker_history=hist, index_history=indexhist, t_date=item)
                for item in mylist
            ]

        sector = get_sector(tick)

        print('Price data recieved')

    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt still stops the loop
        print(f'bruh {tick}')
        failedlist.append(tick)
        continue

    mydict['date'].append(mylist)
    mydict['EPS'].append(myepslist)

    for pricelist, indexlist in windows:
        mydict['ticker'].append(tick)
        mydict['stock_price'].append(pricelist)
        mydict['snp_price'].append(indexlist)

    mydict['sector'].append(sector)

    #mydict['TB10yr'].append(tenyear)
    #mydict['TBslope'].append(np.subtract(tenyear,fiveyear ))

    print("--- %s seconds ---" % (time.time() - start_time))
    
    

skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
BAX
Price data recieved
--- 13.26800274848938 seconds ---
BDX
Price data recieved
--- 12.954616785049438 seconds ---
BBY
Price data recieved
--- 13.341441869735718 seconds ---
BIO
Price data recieved
--- 9.21352767944336 seconds ---
TECH
Price data recieved
--- 9.140979051589966 seconds ---
BIIB
Price data recieved
--- 8.912170648574829 seconds ---
BA
Price data recieved
--- 8.113578081130981 seconds ---
CDW
Price data recieved
--- 7.538984298706055 seconds ---
CE
Price data recieved
--- 9.08642053604126 seconds ---
CNC
Price data recieved
--- 10.06001091003418 seconds ---
CNP
Price data recieved
--- 9.34192180633545 seconds ---
CDAY
Price data recieved
--- 8.281807661056519 seconds ---
CF
Price data recieved
--- 7.714886426925659 seconds ---

In [37]:
# Number of tickers with cached earnings dates
len(mylistd)

104

In [43]:
# Number of distinct tickers that have at least one price window stored in mydict
len(set(mydict['ticker']))

93

In [44]:
import pickle

# Persist the collected dataset; the with-block closes the file even if pickle.dump raises
with open("mydict.obj", "wb") as filehandler:
    pickle.dump(mydict, filehandler)
