## Scraping kauppalehti.fi for OMXH tickers and retrieving the FY data from Yahoo Finance
Import packages

In [1]:
import datetime as dt
import pandas as pd

import concurrent.futures as cf
from yahoofinancials import YahooFinancials

import re
import ast
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
omx_h = 'https://www.kauppalehti.fi/porssi/kurssit/XHEL'

Get list of stocks

In [3]:
# header = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
# }

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}

## Scraping
Scrape the webpage to find stock tickers. We must use Selenium to open a Chrome browser and scroll down since kauppalehti.fi doesn't load the full html otherwise.

In [10]:
OMXH = []
#res = requests.get(omx_h, headers=header)

# Use Selenium to automate browser actions
driver = webdriver.Chrome()  # Replace if using other than Chrome
driver.get(omx_h)  

# Define the amount of scrolling and the number of iterations
scroll_pixels = 2000  # Adjust if needed
scroll_iterations = 5  # Adjust if needed

# Scroll down multiple times to fetch more content
for i in range(scroll_iterations):
    driver.execute_script(f"window.scrollBy(0, {scroll_pixels});")
    driver.implicitly_wait(1)  # Wait for 1 second between each scroll iteration

# Extract the complete page source
page_source = driver.page_source

# Parse the page source with Beautiful Soup
soup = BeautifulSoup(page_source, "html.parser")

# Close the browser
driver.quit()

divs = soup.findAll('a', {"class": ["tmcl-__sc-18ro7st-3 suYgN", "tmcl-__sc-18ro7st-3 gFUoDG"]})

# print(divs)
#print(soup.prettify())


Find tickers from the html code and construct a list out of them.

In [5]:
OMXH = []

for i, val in enumerate(divs):
    ticker = re.findall(r'/osake/.*["]\s', str(val))[0]   # Find the part of the url after '/osake/' since this contains the ticker.
    ticker = str(ticker).replace('/osake/', '').replace('"', '').strip()    # Delete all the leftovers around the ticker
    name = re.findall(r'>[^><]+[öäå]*<', str(val))[0]   # Same for the company names
    name = str(name).replace('>', '').replace('<', '').strip()
    OMXH.append(ticker)
    

# print(OMXH)
# print(len(OMXH))


Add the .HE suffix and check for duplicates.

In [6]:
stocks = [stock + '.HE' for stock in OMXH]
stocks_set = set(stocks)

contains_duplicates = len(stocks) != len(stocks_set)
if contains_duplicates:
    print('Stock list contains duplicates!')

print(stocks)

['AFAGR.HE', 'AKTIA.HE', 'ALISA.HE', 'ALMA.HE', 'ANORA.HE', 'APETIT.HE', 'ASPO.HE', 'ACG1V.HE', 'ATRAV.HE', 'BIOBV.HE', 'BITTI.HE', 'BOREO.HE', 'CAPMAN.HE', 'CGCBV.HE', 'CAV1V.HE', 'CTY1S.HE', 'CTH1V.HE', 'CONSTI.HE', 'DIGIA.HE', 'DIGIGR.HE', 'DOV1V.HE', 'EEZY.HE', 'ELEAV.HE', 'ELISA.HE', 'PAMPALO.HE', 'ENEDO.HE', 'ENENTO.HE', 'ESENSE.HE', 'EQV1V.HE', 'ERIBR.HE', 'ETTE.HE', 'EVLI.HE', 'EXL1V.HE', 'FSECURE.HE', 'FIA1S.HE', 'FSKRS.HE', 'FORTUM.HE', 'GLA1V.HE', 'GOFORE.HE', 'HARVIA.HE', 'HKSAV.HE', 'HONBS.HE', 'HUH1V.HE', 'ILKKA1.HE', 'ILKKA2.HE', 'ICP1V.HE', 'IFA1V.HE', 'INVEST.HE', 'KAMUX.HE', 'KEMIRA.HE', 'KSLAV.HE', 'KESKOA.HE', 'KESKOB.HE', 'KELAS.HE', 'KHG.HE', 'KOJAMO.HE', 'KNEBV.HE', 'KCR.HE', 'KOSKI.HE', 'KREATE.HE', 'LAT1V.HE', 'LEHTO.HE', 'LL1SPAC.HE', 'MEKKO.HE', 'MARAS.HE', 'METSO.HE', 'METSA.HE', 'METSB.HE', 'MUSTI.HE', 'NESTE.HE', 'NIXU.HE', 'NOHO.HE', 'NOKIA.HE', 'TYRES.HE', 'NDA FI.HE', 'NLG1V.HE', 'OLVAS.HE', 'OMASP.HE', 'OPTOMED.HE', 'OKDAV.HE', 'OKDBV.HE', 'ORNAV.HE', 

## Retrieving the data
Use Yahoo Finance to retrieve the companies' FY (Fiscal Year) data.

In [7]:
balanceSheet = {}
incomeStatement = {}
cashStatement = {}

def retrieve_stock_data(stock):

    try:
        print(stock)
        yahoo_financials = YahooFinancials(stock)
        balance_sheet_data = yahoo_financials.get_financial_stmts('annual', 'balance')
        income_sheet_data = yahoo_financials.get_financial_stmts('annual', 'income')
        cash_sheet_data = yahoo_financials.get_financial_stmts('annual', 'cash')
        # print(balance_sheet_data)

        balanceSheet[stock] = balance_sheet_data['balanceSheetHistory'][stock]
        incomeStatement[stock] = income_sheet_data['incomeStatementHistory'][stock]
        cashStatement[stock] = cash_sheet_data['cashflowStatementHistory'][stock]
    except:
        print('Error retrieving stock data for {stock}')

    return      

## Multithreading

In [8]:
start = time.time()

executor = cf.ThreadPoolExecutor(16)

futures = [executor.submit(retrieve_stock_data, stock) for stock in stocks]
cf.wait(futures)

end = time.time()
print(f'Time taken: {end-start}s')

AFAGR.HE
AKTIA.HE
ALISA.HE
ALMA.HE
ANORA.HE
APETIT.HE
ASPO.HE
ACG1V.HE
ATRAV.HE
BIOBV.HE
BITTI.HE
BOREO.HE
CAPMAN.HE
CGCBV.HE
CAV1V.HE
CTY1S.HE
CTH1V.HE
CONSTI.HE
DIGIA.HE
DIGIGR.HE
DOV1V.HE
EEZY.HE
ELEAV.HE
ELISA.HE
PAMPALO.HE
ENEDO.HE
ENENTO.HE
ESENSE.HE
EQV1V.HE
ERIBR.HE
ETTE.HE
EVLI.HE
EXL1V.HE
FSECURE.HE
FIA1S.HE
FSKRS.HE
FORTUM.HE
GLA1V.HE
GOFORE.HE
HARVIA.HE
HKSAV.HE
HONBS.HE
HUH1V.HE
ILKKA1.HE
ILKKA2.HE
ICP1V.HE
IFA1V.HE
INVEST.HE
KAMUX.HE
KEMIRA.HE
KSLAV.HE
KESKOA.HE
KESKOB.HE
KELAS.HE
KHG.HE
KOJAMO.HE
KNEBV.HE
KCR.HE
KOSKI.HE
KREATE.HE
LAT1V.HE
LEHTO.HE
LL1SPAC.HE
MEKKO.HE
MARAS.HE
METSO.HE
METSA.HE
METSB.HE
MUSTI.HE
NESTE.HE
NIXU.HE
NOHO.HE
NOKIA.HE
TYRES.HE
NDA FI.HE
NLG1V.HE
OLVAS.HE
OMASP.HE
OPTOMED.HE
OKDAV.HE
OKDBV.HE
ORNAV.HE
ORNBV.HE
ORTHEX.HE
OUT1V.HE
OVARO.HE
PNA1V.HE
PIHLIS.HE
PON1V.HE
PUMU.HE
PURMO.HE
PUUILO.HE
QPR1V.HE
QTCOM.HE
RAIKV.HE
RAIVV.HE
RAP1V.HE
RAUTE.HE
REKA.HE
RELAIS.HE
REMEDY.HE
REG1V.HE
ROBIT.HE
ROVIO.HE
SAGCV.HE
SAMPO.HE
SANOMA.HE
SCANFL.HE
SIILI.HE

Save FY data to a file

In [9]:
with open('balanceSheet_OMXH.txt', 'w') as output:
    output.write(str(balanceSheet))
with open('incomeStatement_OMXH.txt', 'w') as output:
    output.write(str(incomeStatement))
with open('cashStatement_OMXH.txt', 'w') as output:
    output.write(str(cashStatement))