In [6]:
#SCRAPE MULTIPLE FINANCIAL PARAMETERS FROM SINGLE PAGE / V2 / Faster scrolling using drag&drop approach
#https://stackoverflow.com/questions/62119348/how-to-scroll-horizontally-using-selenium-chromedriver-in-python

import pandas as pd
import matplotlib.pyplot as plt
import time
import requests
import json
from html.parser import HTMLParser
import sqlite3


In [92]:
#scrape the data

#HTML parser that helps to extract the visible text from html-strings
#https://www.educative.io/answers/what-is-the-html-parser-in-python
class Parser(HTMLParser):
    """Minimal HTML parser that remembers the most recent text chunk it saw.

    After ``feed()``, ``output`` holds the last piece of text found in the
    markup, e.g. ``'Cost Of Goods Sold'`` for
    ``"<a href='/x'>Cost Of Goods Sold</a>"``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        #defined up front so that reading ``output`` before any text has been
        #parsed returns '' instead of raising AttributeError
        self.output = ''

    def handle_data(self, data):
        #called by HTMLParser for every text chunk; only the latest one is kept
        self.output = data


#shared parser instance used by the scraper below
parser = Parser()


class ScrapeMacrotrend():
    """Financial tables of one company from macrotrends.net, cached in SQLite.

    On construction eight tables are loaded and exposed as DataFrame
    attributes (rows are the data-names, columns are the data-dates):

    * ``financial_statements_quarter`` / ``financial_statements_annual``
    * ``balance_sheet_quarter`` / ``balance_sheet_annual``
    * ``cash_flow_quarter`` / ``cash_flow_annual``
    * ``financial_ratios_quarter`` / ``financial_ratios_annual``

    Each table is read from the SQLite file named by ``DATABASE`` when a table
    called ``<ticker_search>_<attribute>`` exists there; otherwise (or when
    ``scrape_again`` is true) it is scraped from the website and stored.
    """

    #SQLite file that caches the scraped tables
    DATABASE = "company_database.db"
    #seconds to wait for the website before a request is abandoned
    REQUEST_TIMEOUT = 30
    #(attribute prefix, page name on macrotrends.net)
    _STATEMENT_PAGES = (
        ('financial_statements', 'income-statement'),
        ('balance_sheet', 'balance-sheet'),
        ('cash_flow', 'cash-flow-statement'),
        ('financial_ratios', 'financial-ratios'),
    )
    #(attribute suffix, value of the ``freq`` query parameter)
    _FREQUENCIES = (
        ('quarter', 'Q'),
        ('annual', 'A'),
    )
    _CHARTS_BASE_URL = 'https://www.macrotrends.net/stocks/charts/'

    def __init__(self, ticker_search, scrape_again=False):
        """Load all quarterly and annual tables for ``ticker_search``.

        Args:
            ticker_search: ticker symbol of the company, e.g. ``'AAPL'``.
            scrape_again: when true, ignore cached tables, scrape everything
                again and overwrite the cache.

        Raises:
            ValueError: if the company or the data cannot be found on the site.
        """
        #resolved lazily: the ticker list is only downloaded if at least one
        #table really has to be scraped
        url = None

        for period, freq in self._FREQUENCIES:
            for prefix, page in self._STATEMENT_PAGES:
                attribute = f'{prefix}_{period}'
                table_name = f'{ticker_search}_{attribute}'

                if scrape_again or not self.data_already_in_database(name=table_name):
                    if url is None:
                        # e.g. 'https://www.macrotrends.net/stocks/charts/TSLA/tesla'
                        url = self.get_company_page_url(ticker_search)
                    table = self.scrape_the_data(f'{url}/{page}?freq={freq}')
                    self.store_data_in_database(data=table, name=table_name)
                else:
                    table = self.get_from_database(name=table_name)

                setattr(self, attribute, table)

    def get_company_page_url(self, ticker_search):
        """Return the macrotrends chart base URL of the company.

        The site's ticker list is downloaded and searched. An entry whose
        ticker part (the text before the first ``/``, e.g. ``AAPL`` in
        ``AAPL/apple``) equals ``ticker_search`` wins; if there is none, the
        last entry that merely contains ``ticker_search`` is used, which is
        what the lookup did before exact matching was introduced.

        Raises:
            ValueError: if no entry matches.
        """
        #get ticker data
        tickers_url = 'https://www.macrotrends.net/assets/php/ticker_search_list.php?_=1664947632720'
        response = requests.get(tickers_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        data_tickers_json = json.loads(response.text)

        #search for the company-url
        wanted = ticker_search.upper()
        substring_match = None
        for item in data_tickers_json:
            slug = item['s'] #example: AAPL/apple
            if slug.split('/')[0].upper() == wanted:
                return self._CHARTS_BASE_URL + slug
            if ticker_search in slug:
                substring_match = slug

        if substring_match is None:
            raise ValueError("No macrotrends page found for ticker '{}'".format(ticker_search))
        return self._CHARTS_BASE_URL + substring_match

    def scrape_the_data(self, url):
        """Download one statement page and return its table as a DataFrame.

        Raises:
            requests.HTTPError: if the site answers with an error status.
            ValueError: if the page does not contain the data line.
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page.raise_for_status()
        return self._parse_statement_page(page.text)

    def _parse_statement_page(self, page_text):
        """Build the DataFrame from the ``var originalData = [...]`` line of a page."""
        marker = 'var originalData =' #the line where all the data is stored
        payload = None
        for line in page_text.splitlines():
            position = line.find(marker)
            if position != -1:
                #keep only the JSON literal: drop the assignment and the closing ';'
                payload = line[position + len(marker):].strip().rstrip(';')

        if payload is None:
            raise ValueError("Page does not contain a '{}' line".format(marker))

        data_json = json.loads(payload) #convert the line to a json-object

        labels = [] #collect the data-names (row labels)
        rows = []   #collect one {date: value} mapping per row
        for item in data_json:
            labels.append(self._field_label(item.get('field_name') or ''))
            #'popup_icon' is the data-graph link (not needed)
            rows.append({key: value for key, value in item.items()
                         if key not in ('field_name', 'popup_icon')})

        company_data = pd.DataFrame(rows, index=labels)
        if company_data.empty:
            return company_data

        #apply returns a new frame, so the converted result has to be kept
        return company_data.apply(self.fixData)

    @staticmethod
    def _field_label(html_fragment):
        """Return the visible text of a ``field_name`` snippet.

        Example input:
        ``"<a href='/stocks/charts/TSLA/tesla/cost-goods-sold'>Cost Of Goods Sold</a>"``
        """
        parser.reset()      #drop anything still buffered from the previous snippet
        parser.output = ''  #a snippet without text must not inherit the previous label
        parser.feed(html_fragment)
        parser.close()      #flush text the parser was still holding back
        return parser.output

    #apply-function that goes over all elements of one column and converts them to numeric value if possible
    def fixData(self, input_data):
        """Return a copy of ``input_data`` with every convertible element made numeric.

        Elements that ``pd.to_numeric`` rejects are reported on stdout and kept
        unchanged. The index and the name of the Series are preserved.
        """
        converted = []
        for value in input_data:
            try:
                converted.append(pd.to_numeric(value))
            except (ValueError, TypeError):
                print(value, "Can't convert to numeric")
                converted.append(value)

        return pd.Series(converted, index=input_data.index, name=input_data.name)

    def store_data_in_database(self, data, name):
        """Write ``data`` to table ``name`` of the cache, replacing an existing table."""
        conn = sqlite3.connect(self.DATABASE)
        try:
            data.to_sql(name=name, con=conn, if_exists='replace')
            conn.commit()
        finally:
            #close the connection even if writing failed
            conn.close()

    def data_already_in_database(self, name):
        """Return True if the cache contains a table called ``name``."""
        conn = sqlite3.connect(self.DATABASE)
        try:
            #get the count of tables with the name; the name is passed as a
            #parameter so quotes in it cannot change the statement
            cursor = conn.execute(
                "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
                (name,),
            )
            #if the count is 1, then table exists
            return cursor.fetchone()[0] == 1
        finally:
            conn.close()

    def get_from_database(self, name):
        """Read table ``name`` from the cache and return it as a DataFrame.

        The stored ``index`` column becomes the DataFrame index again, because
        otherwise the index would be integers instead of the parameter names.
        """
        #table names cannot be bound as parameters, so the identifier is quoted
        #(needed for names containing characters such as '.')
        quoted_name = '"' + name.replace('"', '""') + '"'
        conn = sqlite3.connect(self.DATABASE)
        try:
            output = pd.read_sql_query(f"SELECT * FROM {quoted_name}", conn)
        finally:
            conn.close()
        return output.set_index('index')

    # company_data.dtypes
    # company_data.columns
    # company_data.index

    # company_data.loc['Revenue']
    # company_data.loc['EPS - Earnings Per Share']

    # company_data.columns = pd.to_datetime(company_data.columns) #change columns data-type to datetime.
    # company_data.columns




In [95]:
# One ScrapeMacrotrend object per company; every call passes the ticker by keyword.
apple_data = ScrapeMacrotrend(ticker_search='AAPL')
tesla_data = ScrapeMacrotrend(ticker_search='TSLA')
microsoft_data = ScrapeMacrotrend(ticker_search='MSFT')
amazon_data = ScrapeMacrotrend(ticker_search='AMZN')

# apple_data1 = ScrapeMacrotrend('AAPL')
# tesla_data1 = ScrapeMacrotrend('TSLA')
# microsoft_data1 = ScrapeMacrotrend('MSFT')
# amazon_data1 = ScrapeMacrotrend(ticker_search='AMZN')

#tesla_data.financial_statements_annual
# print(amazon_data.financial_statements_quarter.index.to_list())
# print(amazon_data1.financial_statements_quarter.index.to_list())

# amazon_data.financial_statements_quarter.columns

# amazon_data2 = amazon_data1.financial_statements_quarter.copy()
# print(amazon_data2.columns)
# amazon_data2.drop('index', axis=1, inplace=True) 
# amazon_data2.set_index('index', inplace=True)
# print(amazon_data2.columns)

# amazon_data.financial_statements_quarter
# amazon_data1.financial_statements_quarter
# amazon_data2

# amazon_data3 = ScrapeMacrotrend(ticker_search='AMZN')
# amazon_data3.financial_statements_quarter


Unnamed: 0_level_0,2022-06-30,2022-03-31,2021-12-31,2021-09-30,2021-06-30,2021-03-31,2020-12-31,2020-09-30,2020-06-30,2020-03-31,...,2011-06-30,2011-03-31,2010-12-31,2010-09-30,2010-06-30,2010-03-31,2009-12-31,2009-09-30,2009-06-30,2009-03-31
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Revenue,121234.0,116444.0,137412.0,110812.0,113080.0,108518.0,125555.0,96145.0,88912.0,75452.0,...,9913.0,9857.0,12947.0,7560.0,6566.0,7131.0,9520.0,5449.0,4651.0,4889.0
Cost Of Goods Sold,66424.0,66499.0,82835.0,62930.0,64176.0,62403.0,79284.0,57106.0,52660.0,44257.0,...,7525.0,7608.0,10317.0,5786.0,4957.0,5501.0,7543.0,4176.0,3518.0,3741.0
Gross Profit,54810.0,49945.0,54577.0,47882.0,48904.0,46115.0,46271.0,39039.0,36252.0,31195.0,...,2388.0,2249.0,2630.0,1774.0,1609.0,1630.0,1977.0,1273.0,1133.0,1148.0
Research And Development Expenses,18072.0,14842.0,15313.0,14380.0,13871.0,12488.0,12051.0,10976.0,10388.0,9325.0,...,698.0,579.0,518.0,442.0,408.0,366.0,351.0,315.0,299.0,275.0
SG&A Expenses,33331.0,31185.0,35780.0,28661.0,27320.0,24724.0,27844.0,21807.0,19731.0,17811.0,...,1448.0,1315.0,1609.0,1038.0,906.0,844.0,1129.0,698.0,615.0,618.0
Other Operating Income Or Expenses,-90.0,-249.0,-24.0,11.0,-11.0,-38.0,497.0,-62.0,-290.0,-70.0,...,-41.0,-33.0,-29.0,-26.0,-25.0,-26.0,-22.0,-9.0,-60.0,-11.0
Operating Expenses,117917.0,112775.0,133952.0,105960.0,105378.0,99653.0,118682.0,89951.0,83069.0,71463.0,...,9712.0,9535.0,12473.0,7292.0,6296.0,6737.0,9045.0,5198.0,4492.0,4645.0
Operating Income,3317.0,3669.0,3460.0,4852.0,7702.0,8865.0,6873.0,6194.0,5843.0,3989.0,...,201.0,322.0,474.0,268.0,270.0,394.0,475.0,251.0,159.0,244.0
Total Non-Operating Income/Expense,-5970.0,-8934.0,11474.0,-537.0,932.0,1403.0,892.0,615.0,378.0,-606.0,...,24.0,-15.0,33.0,24.0,27.0,7.0,-3.0,11.0,20.0,4.0
Pre-Tax Income,-2653.0,-5265.0,14934.0,4315.0,8634.0,10268.0,7765.0,6809.0,6221.0,3383.0,...,225.0,307.0,507.0,292.0,297.0,401.0,472.0,262.0,179.0,248.0


In [None]:
#plotting the data
fig, ax = plt.subplots(figsize=(15, 5)) #set figure size

#companies to compare; add e.g. ('Tesla', tesla_data) or ('Apple', apple_data) here.
#Other rows work the same way, e.g. financial_ratios_quarter.loc['Current Ratio'].
companies_to_plot = (
    ('Microsoft', microsoft_data),
    ('Amazon', amazon_data),
)

for company_name, company in companies_to_plot:
    revenue_row = company.financial_statements_annual.loc['Revenue']
    #use real dates on the x-axis: with the date strings as labels every series
    #is drawn at positions 0..n-1, so companies whose reporting dates differ
    #would be misaligned. Sorting puts the oldest date on the left.
    revenue = pd.Series(
        pd.to_numeric(revenue_row, errors='coerce').to_numpy(), #values may still be text
        index=pd.to_datetime(revenue_row.index),
    ).sort_index()
    ax.plot(revenue.index, revenue.to_numpy(), label=company_name)

ax.set(title='Annual revenue', xlabel='Date', ylabel='Revenue')
ax.legend() #needed to tell the lines apart

plt.show()