In [118]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import chromedriver_binary
import lxml
from lxml import html

# Scanning Earnings

This notebook contains a script that scans https://finance.yahoo.com/calendar/earnings/ for companies that are releasing their earnings today. It returns a DataFrame containing the tickers of those companies.

This list of tickers is then used to check whether any of the stocks are currently consolidating, using our function from the "Signals" notebook.

### Upcoming Earnings for all US Companies

In [196]:
# Collect the earnings table from each calendar page, then concatenate once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.)
# NOTE(review): `headers` is not defined in the visible part of this notebook;
# it must already exist in the kernel (e.g. a dict with a User-Agent) - confirm.
pages = []

for i in range(1, 11):

    # the URL of calendar page i
    url = "https://seekingalpha.com/earnings/earnings-calendar/{}".format(i)

    # Make a GET request to fetch the raw HTML content
    html_content = requests.get(url, headers=headers).text

    # parse the HTML content
    soup = BeautifulSoup(html_content, "lxml")

    # obtaining all <table> elements on the page
    table = soup.find_all('table')

    # first table on the page, indexed by release date, last column dropped
    data = pd.read_html(str(table))[0].set_index("Release Date").iloc[:,:-1]

    pages.append(data)

# dataframe holding the rows of all pages
df = pd.concat(pages)

df.head(5)

Unnamed: 0_level_0,Symbol,Name,Release Time
Release Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
09/30/2021,PAYX,"Paychex, Inc.",Pre-Market
09/30/2021,KMX,"CarMax, Inc.",Pre-Market
09/30/2021,ANGO,"AngioDynamics, Inc.",Pre-Market
09/30/2021,JEF,Jefferies Financial Group Inc.,Post-Market
09/30/2021,GHGUF,The Go-Ahead Group plc,Estimated


## Obtaining Similar Data but from MarketWatch

In [159]:
url = "https://www.marketwatch.com/tools/earningscalendar"

# Make a GET request to fetch the raw HTML content
# NOTE(review): `headers` is not defined in the visible part of this notebook;
# it must already exist in the kernel - confirm.
html_content = requests.get(url, headers=headers).text

# parse the HTML content
soup = BeautifulSoup(html_content, "lxml")

# obtaining all <table> elements on the page
table = soup.find_all('table')

# one DataFrame per table found on the page
data = pd.read_html(str(table))

# table 9 contains today's earnings
# NOTE(review): this position depends on the page layout at the time of
# writing - verify it still points at today's table.
df = data[9].set_index("Symbol")

# list of tickers releasing earnings today
today_earnings = df.index.tolist()

## Retrieving Fundamental Data
To-do:
- For the balance sheet and the cash flow statement, the data is spread over several tables, so multiple DataFrames need to be returned rather than just one. This is still to be done.

In [259]:
# Example ticker symbol; assigned again in the usage cell further down.
stock = "tsla"

def create_url(stock, statement):
    """Build the MarketWatch quarterly-financials URL for one statement type.

    Parameters
    ----------
    stock : str
        Ticker symbol inserted into the URL as given.
    statement : str
        "inc" (income statement), "bs" (balance sheet) or "cf" (cash flow
        statement).

    Returns
    -------
    str or None
        The URL, or None (after printing a message) when `statement` is not
        one of the three recognised codes.
    """
    # (statement code, URL template) pairs, checked in order.
    templates = (
        ("inc", "https://www.marketwatch.com/investing/stock/{}/financials/income/quarter"),
        ("bs", "https://www.marketwatch.com/investing/stock/{}/financials/balance-sheet/quarter"),
        ("cf", "https://www.marketwatch.com/investing/stock/{}/financials/cash-flow/quarter"),
    )

    for code, template in templates:
        if statement == code:
            return template.format(stock)

    # No code matched: report it and signal failure with None.
    print("Invalid statement type. Please specify 'inc', 'bs', or 'cf' for income statement, balance sheet, or cash flow statement.")
    return None
    
def obtain_table_as_df(url, table_index=3, headers=None, timeout=30):
    """Download a page and return one of its HTML tables as a DataFrame.

    Parameters
    ----------
    url : str
        Page to fetch.
    table_index : int, default 3
        Position of the wanted table among all <table> elements on the page.
        Notes carried over from the original code: income statement uses
        table 3; balance sheet uses 3 and 4; cash flow uses 3, 4 and 5.
        Call the function once per index to get each part.
    headers : dict, optional
        HTTP headers passed to ``requests.get`` (e.g. a User-Agent).
    timeout : float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The selected table, indexed by its first column and with its last
        column removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        Raised by ``pd.read_html`` when the page contains no tables.
    IndexError
        If the page has no table at `table_index`.
    """
    # Make a GET request to fetch the raw HTML content
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of later while parsing an error page.
    response.raise_for_status()

    # parse the HTML content
    soup = BeautifulSoup(response.text, "lxml")

    # obtaining all <table> elements on the page
    tables = soup.find_all('table')

    # one DataFrame per table found
    frames = pd.read_html(str(tables))

    if not -len(frames) <= table_index < len(frames):
        raise IndexError(
            "table_index {} is out of range: the page has {} table(s)".format(
                table_index, len(frames)
            )
        )

    # indexing the relevant table to obtain data of interest
    df = frames[table_index]

    # setting index to the first column (the row labels)
    df = df.set_index(df.columns[0])

    # removing last column (useless data)
    return df.iloc[:, :-1]

### Cleaning the Data
#### Functions to clean data 

1) Cleaning the row labels so that each label is no longer repeated

2) Converting number strings to integers

In [251]:
# 1
def repeated_word(word):
    """Collapse a string that consists of the same text written twice.

    Parameters
    ----------
    word : str
        Text to inspect, e.g. "NetIncomeNetIncome".

    Returns
    -------
    str
        The first half of `word` when both halves are identical; otherwise
        `word` unchanged, so a label that is not doubled is not truncated.
    """
    n = len(word) // 2
    first_half = word[:n]

    # Odd-length strings can never match here because the halves differ in length.
    if first_half == word[n:]:
        return first_half

    return word

def clean_duplicate_rows(df):
    """Rewrite the index labels of `df` with spaces removed and de-duplicated.

    Each label has its spaces stripped out and is then passed through
    `repeated_word`. The new labels are assigned to `df.index`, so the frame
    passed in is modified in place; the same frame is also returned.
    """
    df.index = [repeated_word(label.replace(" ", "")) for label in df.index]
    return df

# 2
# Multiplier for each magnitude suffix used in the statement tables.
_SUFFIX_MULTIPLIERS = {"B": 10**9, "M": 10**6, "K": 10**3}


def _convert_cell(cell):
    """Return the integer for one abbreviated-number string, else `cell` itself."""
    # Non-strings (e.g. already converted ints) and strings too short to hold
    # a number plus a suffix are left alone.
    if not isinstance(cell, str) or len(cell) < 2:
        return cell

    if cell[-1] in _SUFFIX_MULTIPLIERS:
        # Plain form, e.g. "1.18B"
        number_text, sign, scale = cell[:-1], 1, _SUFFIX_MULTIPLIERS[cell[-1]]
    elif cell[0] == "(" and cell[-1] == ")" and cell[-2] in _SUFFIX_MULTIPLIERS:
        # Parenthesised form means a negative amount, e.g. "(2.5M)"
        number_text, sign, scale = cell[1:-2], -1, _SUFFIX_MULTIPLIERS[cell[-2]]
    else:
        # "-" placeholders, percentages and anything else stay as they are.
        return cell

    try:
        # round() rather than int(): the float product can land just below
        # the intended whole number, and truncation would then lose 1.
        return int(round(sign * float(number_text) * scale))
    except (ValueError, OverflowError):
        # Looked like a number with a suffix but is not one (e.g. "IBM").
        return cell


def convert_values_to_int(df):
    """Convert abbreviated number strings in `df` to integers, in place.

    Strings such as "1.18B", "369M" or "5K" become integers, and the
    parenthesised form "(2.5M)" becomes a negative integer. Every other cell
    ("-" placeholders, percentages, values that are already numbers) is left
    untouched, so calling the function again on a converted frame changes
    nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells hold the strings to convert. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    n_rows, n_cols = df.shape

    for i in range(n_rows):
        for j in range(n_cols):
            cell = df.iat[i, j]
            converted = _convert_cell(cell)

            # Write through .iat so the change always reaches the frame;
            # assigning into df.values only works when that happens to be a view.
            if converted is not cell:
                df.iat[i, j] = converted

    return df

#### Using the functions

In [264]:
stock = "tsla" # ticker symbol of the stock of interest
statement = "cf" # statement type: inc, bs, or cf

# Build the MarketWatch URL for the chosen statement and download its table.
url = create_url(stock, statement)
raw_df = obtain_table_as_df(url)

# clean_duplicate_rows assigns new labels to the index of the frame it is
# given and returns that same frame, so `df` and `raw_df` are one object.
df = clean_duplicate_rows(raw_df)        
clean_statement = convert_values_to_int(df)

# Display the cleaned statement as the cell output.
clean_statement

Unnamed: 0,30-Jun-2020,30-Sep-2020,31-Dec-2020,31-Mar-2021,30-Jun-2021
NetIncomebeforeExtraordinaries,129000000,369000000,296000000,464000000,1180000000
NetIncomeGrowth,-,186.05%,-19.78%,56.76%,153.88%
"Depreciation,Depletion&Amortization",567000000,584000000,618000000,621000000,681000000
DepreciationandDepletion,-,-,-,424000000,461000000
AmortizationofIntangibleAssets,-,-,-,197000000,220000000
DeferredTaxes&InvestmentTaxCredit,-,-,-,-,-
DeferredTaxes,-,-,-,-,-
InvestmentTaxCredit,-,-,-,-,-
OtherFunds,514000000,812000000,863000000,568000000,589000000
FundsfromOperations,1210000000,1770000000,1780000000,1650000000,2450000000
