# Pulling stock data and key milestones (earnings presentations)
# Yahoo Finance API (yahoo_fin) + Beautiful Soup

In [1]:
# Initial imports
import os
import requests # not in use for pulling stock price
import requests_html # not in use for pulling stock price
import pandas as pd
from pathlib import Path

from dotenv import load_dotenv
import alpaca_trade_api as tradeapi

# Yahoo_fin api to pull adjusted close price
import yahoo_fin.stock_info as si
from datetime import date, timedelta
import datetime
# from yahoo_earnings_calendar import YahooEarningsCalendar

# Alternative to pulling the data for earnings calls without yahoo earnings, which does not work
import json
from bs4 import BeautifulSoup

In [2]:
# Adjusted closing prices come from the yahoo_fin API. Alpaca was tried first,
# but its prices are not adjusted for stock splits.

# Inputs for the yahoo_fin request
ticker_list = ['TSLA', 'QQQ']
start_date = '2010-06-29'
end_date = date.today()  # e.g. '2021-01-30'
index_as_date = True
interval = '1d'

# Final table: one adjusted-close column per ticker
closing_prices_df = pd.DataFrame()

# Raw yahoo_fin response for each ticker, keyed by ticker symbol
historical_data = {}

# One request per ticker; the same pass fills both the raw store and the
# adjusted-close table.
for ticker in ticker_list:
    ticker_history = si.get_data(
        ticker,
        start_date=start_date,
        end_date=end_date,
        index_as_date=index_as_date,
        interval=interval,
    )
    historical_data[ticker] = ticker_history
    closing_prices_df[ticker] = ticker_history['adjclose']

# Give the index a title
closing_prices_df.index.name = 'date'


In [3]:
# date_from = datetime.datetime.strptime('Jun 29 2020  10:00AM', '%b %d %Y %I:%M%p')
# date_to = datetime.datetime.strptime('Jan 19 2021  10:00AM', '%b %d %Y %I:%M%p')
# type(date_from)
# date_to

In [4]:
# Function to pull earnings dates for any stock
# Uses Beautiful Soup to scrape data from Yahoo Finance

def my_earnings_dates(symbol, start_date, end_date):
    """Scrape the earnings-call dates of one symbol from Yahoo Finance.

    Requests the Yahoo Finance earnings calendar page for ``symbol`` and reads
    every ``<td aria-label="Earnings Date">`` cell. The first ``<span>`` of a
    cell is parsed as the call timestamp and the second ``<span>`` is kept as
    the time-zone label.

    Parameters
    ----------
    symbol : str
        Ticker to look up, e.g. ``'TSLA'``.
    start_date, end_date : str or datetime.date
        Inclusive bounds; ``str(value)`` must look like ``'YYYY-MM-DD'``.
        Both bounds are parsed as midnight, so a call later in the day on
        ``end_date`` itself falls outside the range.

    Returns
    -------
    pandas.DataFrame
        Columns ``'date'`` (string formatted ``'%Y-%m-%d %I:%M%p'``) and
        ``'time zone'``, sorted by ``'date'``. The frame is empty, with both
        columns present, when no call falls inside the range.

    Raises
    ------
    ValueError
        If a bound or a scraped timestamp does not match its expected format.
    """
    # Build the URL from the raw arguments before they are converted below
    url = f"https://finance.yahoo.com/calendar/earnings?from={start_date}&to={end_date}&symbol={symbol}"

    # Datetime versions of the bounds, used only for the range filter
    start_date = datetime.datetime.strptime(str(start_date), '%Y-%m-%d')
    end_date = datetime.datetime.strptime(str(end_date), '%Y-%m-%d')

    # Fetch the page; naming the parser keeps parsing the same on every
    # machine and avoids bs4's "no parser was explicitly specified" warning
    response_data = requests.get(url)
    soup = BeautifulSoup(response_data.text, 'html.parser')

    date_list = []
    time_zone_list = []

    for cell in soup.find_all('td', attrs={'aria-label': 'Earnings Date'}):
        spans = cell.find_all('span')
        call_time = datetime.datetime.strptime(spans[0].text, '%b %d, %Y, %I %p')

        if start_date <= call_time <= end_date:
            date_list.append(call_time.strftime('%Y-%m-%d %I:%M%p'))
            time_zone_list.append(spans[1].text)

    # Use the per-row list of time zones (not the last loop value), so each
    # row keeps its own zone and an empty result no longer raises NameError
    earnings_dates = pd.DataFrame({'date': date_list, 'time zone': time_zone_list})

    return earnings_dates.sort_values(by=['date'])

# Pull earnings dates for one ticker.
# start_date and end_date are the values defined with the price inputs above.
symbol = 'TSLA'
earnings_dates_df = my_earnings_dates(symbol, start_date, end_date)

# Keep the scraped timestamp under its own name; it is dropped further down,
# once the trading day each call should be attributed to has been derived.
earnings_dates_df = earnings_dates_df.rename(columns={'date': 'date original'})
earnings_dates_df['date original'] = pd.to_datetime(earnings_dates_df['date original'])

# Calls at or after the market-hour limit (16:00:00) count for the following day
min_hour = 16
min_minute = 0
min_second = 0
market_close_seconds = min_hour * 3600 + min_minute * 60 + min_second

# Compare the whole time of day against the limit. Testing hour, minute and
# second separately (hour >= 16 AND minute > 0 AND second > 0) never matched
# calls stamped on the hour, e.g. 16:00:00 or 17:00:00.
call_time = earnings_dates_df['date original']
seconds_into_day = (
    call_time.dt.hour * 3600 + call_time.dt.minute * 60 + call_time.dt.second
)
after_close = seconds_into_day >= market_close_seconds

# Strip the time of day so 'date' holds midnight timestamps, then move
# after-close calls to the next calendar day.
# NOTE(review): the daily price index this is joined to later appears to be
# midnight-stamped — confirm. The next calendar day may also not be a trading
# day, in which case that join would add a row without prices.
call_day = call_time.dt.normalize()
earnings_dates_df['date'] = call_day.where(~after_close, call_day + timedelta(days=1))

# Flag column: always 1 here, since every row in this frame is an earnings call
earnings_dates_df['earnings flag'] = 1

# Drop the helper fields and index by the attributed day
earnings_dates_df = (
    earnings_dates_df
    .drop(columns=['time zone', 'date original'])
    .set_index('date')
)

earnings_dates_df.head()

Unnamed: 0_level_0,earnings flag
date,Unnamed: 1_level_1
2010-11-09,1
2011-02-15,1
2011-05-04,1
2011-08-03,1
2011-11-02,1


In [5]:
# Attach the earnings-call flag to the price table. The outer join keeps every
# date that appears in either frame.
closing_prices_df = closing_prices_df.join(earnings_dates_df, how='outer')

# Dates without an earnings call come back as N/A: turn them into 0 and store
# the flag as an integer 0/1 column.
closing_prices_df['earnings flag'] = (
    closing_prices_df['earnings flag']
    .fillna(0)
    .astype('int')
)

closing_prices_df.head(5)



Unnamed: 0_level_0,TSLA,QQQ,earnings flag
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-29,4.778,39.031284,0
2010-06-30,4.766,38.437302,0
2010-07-01,4.392,38.329292,0
2010-07-02,3.84,38.221321,0
2010-07-06,3.222,38.338306,0


In [6]:
# Switch for the export: set to False to skip writing the file
save_csv = True

if save_csv:
    # Write the stock prices (with the earnings flag) to a csv file
    file_name = "stock_price.csv"
    output_file = Path(f"../Resources/{file_name}")
    closing_prices_df.to_csv(str(output_file))

# We show the code for the Alpaca API for reference, but we do not use it
## Alpaca prices are not adjusted for stock splits

In [7]:
# Load environment variables from the .env file; the Alpaca key and secret
# are read from the environment in the next cell
load_dotenv()

True

In [8]:
# Read the Alpaca credentials from the environment (None when a variable is unset)
alpaca_api_key = os.environ.get("ALPACA_API_KEY")
alpaca_secret_key = os.environ.get("ALPACA_SECRET_KEY")

In [9]:
# Sanity check: print only the types, never the credential values themselves.
# <class 'str'> means the variable was found; NoneType means it is missing.
print(
    f"Alpaca Key type: {type(alpaca_api_key)}",
    f"Alpaca Secret Key type: {type(alpaca_secret_key)}",
    sep="\n",
)

Alpaca Key type: <class 'str'>
Alpaca Secret Key type: <class 'str'>


In [10]:
# Build the Alpaca REST client (API version v2) from the credentials loaded above
alpaca = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version="v2")