In [5]:
from bs4 import BeautifulSoup
import urllib2

In [6]:
import datetime
import datetime as dt

import numpy as np
import pandas as pd

import helpers  # binds the module name itself; code in this notebook calls helpers.progress(...)
from helpers import *

In [4]:
def earnings_yahoo(dateToCheck, printURL = False):
    """Scrape the Yahoo Finance earnings calendar for a single date.

    The calendar page lists every row shown for that day; it is not filtered
    by symbol.

    Parameters
    ----------
    dateToCheck : datetime.date or datetime.datetime
        Day whose earnings table should be retrieved (must support
        ``strftime`` and addition with ``timedelta``).
    printURL : bool, optional
        When True, print the URL that is about to be requested.

    Returns
    -------
    pandas.DataFrame or None
        One row per ``<tr>`` of the table body, labelled with the page's own
        header texts, plus a ``date`` column set to ``dateToCheck``.
        ``None`` when the expected table (or its head/body) is not present in
        the page.
    """

    # The earnings page needs a date range in order to load, so request the
    # seven days ending on the day of interest and select that day.
    startDate = dateToCheck + dt.timedelta(days=-6)
    endDate = dateToCheck
    startDateStr = startDate.strftime("%Y-%m-%d")
    endDateStr = endDate.strftime("%Y-%m-%d")
    dateToCheckStr = dateToCheck.strftime("%Y-%m-%d")

    # Prepare the URL for scraping.
    quote_page = ('https://finance.yahoo.com/calendar/earnings?from=' + startDateStr
                  + '&to=' + endDateStr + '&day=' + dateToCheckStr)
    if printURL:
        print(quote_page)

    # Fetch and parse the page; always release the HTTP response afterwards.
    page = urllib2.urlopen(quote_page)
    try:
        soup = BeautifulSoup(page, 'html.parser')
    finally:
        page.close()

    # Locate the earnings table by its exact class string.
    name_box = soup.find('table', attrs={'class': 'data-table W(100%) Bdcl(c) Pos(r) BdB Bdc($c-fuji-grey-c)'})
    if name_box is None:
        return None
    table_body = name_box.find('tbody')
    table_head = name_box.find('thead')
    if table_body is None or table_head is None:
        # An incomplete table is treated like a missing one.
        return None
    header_rows = table_head.find_all('tr')
    if not header_rows:
        return None

    # Build the data rows from the table body.
    # NOTE(review): empty cells are dropped (as in the original code), which
    # would shift the remaining values of that row to the left -- confirm the
    # page never renders a truly empty cell.
    data = []
    for row in table_body.find_all('tr'):
        cells = [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cells if ele])

    # Column names come from the non-empty <span> texts of the first header row.
    header = [ele.text.strip() for ele in header_rows[0].find_all('span')]
    header = [ele for ele in header if ele]

    # Convert to a dataframe and tag every row with the requested date.
    returnedData = pd.DataFrame(data, columns=header)
    returnedData['date'] = dateToCheck

    return returnedData

In [92]:
def earnings_fixTypes(earnings):
    '''Coerce the earnings dataset columns to their proper data types.

    The three EPS-related columns become numeric (values that cannot be
    parsed turn into NaN) and ``Symbol`` becomes a string column.  The frame
    passed in is modified in place and also returned.
    '''
    for numeric_column in ('EPS Estimate', 'Reported EPS', 'Surprise(%)'):
        earnings[numeric_column] = pd.to_numeric(earnings[numeric_column], errors='coerce')

    earnings['Symbol'] = earnings['Symbol'].astype(str)
    return earnings

In [207]:
def earnings_yahoo_range(start, end, printURL = False, fixDataTypes=True):
    """Scrape Yahoo Finance for earnings data between two dates (inclusive).

    Calls ``earnings_yahoo`` once per day; results are not filtered by symbol.

    Parameters
    ----------
    start, end : str
        First and last day to retrieve, formatted ``YYYY-MM-DD``.
    printURL : bool, optional
        Forwarded to ``earnings_yahoo``; prints each requested URL.
    fixDataTypes : bool, optional
        When True, pass the result through ``earnings_fixTypes``.

    Returns
    -------
    pandas.DataFrame
        The scraped rows whose ``Earnings Call Time`` is "Before Market Open"
        or "After Market Close", with a ``dt`` column holding the announcement
        timestamp; the ``date``, ``Earnings Call Time`` and ``Company``
        columns are dropped.  An empty DataFrame is returned when no day in
        the range produced a table (including when ``end`` precedes ``start``).
    """

    # Prepare dates.
    startDate = dt.datetime.strptime(start, '%Y-%m-%d')
    endDate = dt.datetime.strptime(end, '%Y-%m-%d')

    # Collect one frame per day and concatenate once at the end; appending to
    # a DataFrame inside the loop copies all accumulated rows every iteration.
    dailyFrames = []
    totalDays = (endDate - startDate).days + 1
    for i in range(0, totalDays):
        dateToRetrieve = startDate + dt.timedelta(days=i)
        helpers.progress('Processing earnings: ' + str(dateToRetrieve) + ' | Day: ' + str(i+1) + ' of ' + str(totalDays))
        retrievedDF = earnings_yahoo(dateToRetrieve, printURL)
        if retrievedDF is not None:
            dailyFrames.append(retrievedDF)

    # Nothing scraped: return early instead of failing on missing columns.
    if not dailyFrames:
        return pd.DataFrame()

    # Each day's own row index is kept, which is what the original did with
    # the default printURL=False; printURL no longer leaks into ignore_index.
    earningsData = pd.concat(dailyFrames)

    # Map the call-time label to a clock time. An overnight (pre-market)
    # announcement is assumed to happen at midnight.
    earningsData['announce'] = ""
    earningsData.loc[earningsData['Earnings Call Time'] == "Before Market Open", 'announce'] = "0:00"
    earningsData.loc[earningsData['Earnings Call Time'] == "After Market Close", 'announce'] = "23:59"

    # Remove rows that matched neither label -- e.g. a row saying "12PM" is
    # ignored. This could be improved later by parsing the time string.
    # .copy() makes the filtered frame independent so the column assignments
    # below do not act on a view of the unfiltered data.
    earningsData = earningsData[earningsData['announce'] != ""].copy()

    # Combine the announcement date and time into a single "dt" timestamp.
    # The date is formatted explicitly so it always matches the parse format.
    earningsData['dtString'] = (pd.to_datetime(earningsData['date']).dt.strftime('%Y-%m-%d')
                                + ' ' + earningsData['announce'].astype(str))
    earningsData['dt'] = pd.to_datetime(earningsData['dtString'], format='%Y-%m-%d %H:%M')

    # Drop the helper columns and the columns not needed downstream.
    earningsData = earningsData.drop(
        ['dtString', 'date', 'announce', 'Earnings Call Time', 'Company'], axis=1)

    if fixDataTypes:
        earningsData = earnings_fixTypes(earningsData)
    return earningsData

In [8]:
#prices_google(intervalMins=30, periodDays=90, symbol='AAPL', printURL=True)

In [2]:
def prices_google(intervalMins=30, periodDays=90, symbol=None, printURL = False):
    """Scrape Google Finance for price data for one symbol.

    Example URL:
    https://www.google.com/finance/getprices?q=AAPL&i=86400&p=14d&f=d,o,h,l,c,v

    Parameters
    ----------
    intervalMins : int, optional
        Bar interval in minutes.
    periodDays : int, optional
        Number of days of history to request.
    symbol : str
        Ticker to request. Required; it only has a ``None`` default so that
        the interval/period defaults can precede it without changing the
        positional order of the first three parameters.
    printURL : bool, optional
        When True, print the URL that is about to be requested.

    Returns
    -------
    pandas.DataFrame
        Columns ``Open``, ``High``, ``Low``, ``Close``, ``Vol`` and ``Symbol``,
        indexed by the timestamp of each bar.

    Raises
    ------
    ValueError
        If ``symbol`` is not given, or if an offset row is encountered before
        any absolute-timestamp (``a``-prefixed) row.
    """
    if symbol is None:
        raise ValueError('symbol is required')

    # Prepare parameters: the endpoint takes the interval in seconds.
    interval = intervalMins*60
    period = str(periodDays)+'d'
    url = "http://www.google.com/finance/getprices?q="+symbol+"&i="+str(interval)+"&p="+period+"&f=d,o,h,l,c,v"

    # Print the URL if requested.
    if printURL:
        print(url)

    # Scrape data using read_csv; the first 7 lines are skipped.
    data = list(np.array(pd.read_csv(url, skiprows=7, header=None)))

    # stamps collects the timestamp of every kept row, kept the rows themselves.
    stamps = []
    kept = []

    # The first field of a row is either 'a<unix timestamp>' (an absolute
    # time) or a count of intervals since the most recent 'a' row.
    anchor = None
    for row in data:
        marker = str(row[0])
        if marker.startswith('a'):
            anchor = datetime.datetime.fromtimestamp(int(marker.replace('a', '')))
            stamps.append(anchor)
        elif marker.startswith('T'):
            # Rows starting with 'T' carry no prices and are skipped
            # (presumably TIMEZONE_OFFSET lines -- TODO confirm).
            continue
        else:
            if anchor is None:
                raise ValueError('offset row found before any absolute timestamp row: ' + marker)
            # offset * interval seconds == offset * intervalMins minutes
            stamps.append(anchor + datetime.timedelta(minutes=int(marker)*intervalMins))
        kept.append(row)

    # Prepare dataframe for returning. Naming the columns in the constructor
    # also works when no rows were kept.
    final = pd.DataFrame(kept, index=stamps, columns=['a', 'Open', 'High', 'Low', 'Close', 'Vol'])
    final['Symbol'] = symbol
    final = final.drop(columns=["a"])
    return final


In [210]:
def prices_google_symbols(symbols, intervalMins=30, periodDays=90):
    '''Scrape google for price data with a list of symbols.

    Parameters
    ----------
    symbols : pandas.DataFrame
        Frame with a ``Symbol`` column; one request is made per row.
    intervalMins : int, optional
        Bar interval in minutes, forwarded to ``prices_google``.
    periodDays : int, optional
        Number of days of history, forwarded to ``prices_google``.

    Returns
    -------
    pandas.DataFrame
        The per-symbol frames returned by ``prices_google`` stacked in input
        order, or an empty DataFrame when ``symbols`` has no rows.
    '''
    # Gather the per-symbol frames and concatenate once; growing a DataFrame
    # inside the loop re-copies every accumulated row on each iteration.
    frames = []

    # total is only used for the progress statement.
    total = len(symbols)

    for i, (index, row) in enumerate(symbols.iterrows(), 1):
        symbol = row['Symbol']
        helpers.progress('Processing prices: ' + symbol + ' | Progress: ' + str(i) + ' of ' + str(total))
        frames.append(prices_google(intervalMins=intervalMins, periodDays=periodDays, symbol=symbol))

    # pd.concat rejects an empty list, so keep the original empty result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [212]:
def stocksummary_yahoo(symbol):
    '''Scrape yahoo finance for the stock data summary of a single symbol.

    The summary is the label/value table inside the page's ``quote-summary``
    section (e.g. PE ratio, average volume, market cap).

    Parameters
    ----------
    symbol : str
        Ticker whose quote page is requested.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns are ``Symbol`` followed by the labels found in
        the table, holding the scraped values as text.

    Raises
    ------
    ValueError
        If the page has no ``quote-summary`` section or that section contains
        no table rows.
    '''

    # Prepare URL.
    quote_page = 'https://finance.yahoo.com/quote/'+symbol+'/'

    # Fetch and parse the page; always release the HTTP response afterwards.
    page = urllib2.urlopen(quote_page)
    try:
        soup = BeautifulSoup(page, 'html.parser')
    finally:
        page.close()

    # Find the quote-summary div that holds the table(s).
    name_box = soup.find('div', attrs={'id': 'quote-summary'})
    if name_box is None:
        raise ValueError('quote-summary section not found for symbol: ' + str(symbol))

    # Each <tr> becomes a list of its non-empty cell texts.
    data = []
    for row in name_box.find_all('tr'):
        cells = [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cells if ele]) # Get rid of empty values
    if not data:
        raise ValueError('quote-summary table is empty for symbol: ' + str(symbol))

    # Rows are (label, value) pairs: transpose so the labels form a row,
    # promote that row to the column names, then drop it from the data.
    x = pd.DataFrame(data)
    x = x.transpose()
    x.columns = x.iloc[0]
    x = x.reindex(x.index.drop(0))
    x.insert(0, "Symbol", value=symbol)
    return x

In [470]:
def stocksummary_fixTypes(stocksummary):
    '''Split the combined text columns of the stock summary and make them numeric.

    New columns are derived from ``Forward Dividend & Yield``, ``Bid``,
    ``Ask``, ``Day's Range`` and ``52 Week Range``; the numeric-looking
    columns are then passed through ``text_to_num`` and ``Forward Yield`` is
    divided by 100.  The frame passed in is modified in place and returned.
    '''

    def nth_piece(column, separator, position):
        # Text of `column` split on `separator`, keeping the piece at `position`.
        return stocksummary[column].apply(lambda text: text.split(separator)[position])

    # "<dividend> (<yield>%)": the last two characters of the second piece are cut off.
    stocksummary['Forward Dividend'] = nth_piece('Forward Dividend & Yield', ' (', 0)
    stocksummary['Forward Yield'] = nth_piece('Forward Dividend & Yield', ' (', 1).apply(lambda text: text[:-2])

    # Columns holding two values joined by a separator: (source, separator, first, second).
    paired_columns = [
        ('Bid', ' x ', 'Bid Price', 'Bid Lot'),
        ('Ask', ' x ', 'Ask Price', 'Ask Lot'),
        ("Day's Range", ' - ', 'Day Range Min', 'Day Range Max'),
        ('52 Week Range', ' - ', '52 Week Range Min', '52 Week Range Max'),
    ]
    for source, separator, first_name, second_name in paired_columns:
        stocksummary[first_name] = nth_piece(source, separator, 0)
        stocksummary[second_name] = nth_piece(source, separator, 1)

    # Convert every column that is actually numerical, cell by cell.
    columnsToFloat = [
        'Previous Close', 'Open', 'Volume', 'Avg. Volume', 'Market Cap', 'Beta',
        'PE Ratio (TTM)', 'EPS (TTM)', '1y Target Est', 'Forward Dividend', 'Forward Yield',
        'Bid Price', 'Bid Lot', 'Ask Price', 'Ask Lot', 'Day Range Min', 'Day Range Max',
        '52 Week Range Min', '52 Week Range Max',
    ]
    converted = stocksummary[columnsToFloat].applymap(text_to_num)
    stocksummary[columnsToFloat] = converted

    # The yield was scraped as a percentage figure; store it as a fraction.
    stocksummary['Forward Yield'] = stocksummary['Forward Yield'] / 100

    return stocksummary

In [471]:
def stocksummary_yahoo_symbols(symbols, fixDataTypes=True):
    '''Scrape yahoo for stock summary with a list of symbols.

    Parameters
    ----------
    symbols : pandas.DataFrame
        Frame with a ``Symbol`` column; one page is scraped per row.
    fixDataTypes : bool, optional
        When True (and at least one symbol was scraped), pass the combined
        result through ``stocksummary_fixTypes``.

    Returns
    -------
    tuple
        ``(stocksummary, success, fail)``: the combined summary frame (empty
        when nothing could be scraped), the list of symbols scraped
        successfully and the list of symbols whose scrape raised an error.
    '''

    # Per-symbol frames are collected and concatenated once after the loop.
    frames = []
    success = []
    fail = []

    # total is only used for the progress statement.
    total = len(symbols)

    # Loop through the symbols and scrape each one, best effort.
    for i, (index, row) in enumerate(symbols.iterrows(), 1):
        symbol = row['Symbol']
        helpers.progress('Processing stock summary: ' + str(symbol) + ' | Progress: ' + str(i) + ' of ' + str(total))
        try:
            scrape = stocksummary_yahoo(symbol=symbol)
        except Exception:
            # Record the failure and carry on. Exception (not a bare except)
            # so that Ctrl-C / SystemExit still stop a long run.
            fail.append(symbol)
        else:
            frames.append(scrape)
            success.append(symbol)

    stocksummary = pd.concat(frames) if frames else pd.DataFrame()

    # With nothing scraped there are no columns to convert; skipping the
    # conversion keeps the success/fail lists available to the caller.
    if fixDataTypes and not stocksummary.empty:
        stocksummary = stocksummary_fixTypes(stocksummary)

    print('\nsuccess: ' + str(len(success)) + ' | ' + 'fail: ' + str(len(fail)))

    return stocksummary, success, fail