# ESPI Collect

This notebook collects ESPI reports from the GPW (Giełda Papierów Wartościowych, the Warsaw Stock Exchange) website and saves the relevant information in HTML and CSV format.

# 1. Define functions for data parsing

In [17]:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import re
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import os

In [18]:
def parse_date(soup):
    """Return the report creation date with the dashes removed.

    Finds the ``td`` cell whose text matches ``Data sporz``, reads the text
    of the node two siblings further on, strips surrounding whitespace and
    deletes every ``-`` (so a ``yyyy-mm-dd`` value comes back as ``yyyymmdd``).

    Args:
        soup: parsed report document exposing a BeautifulSoup-style ``find``.

    Returns:
        The date text as a string without dashes.
    """
    label_cell = soup.find('td', text=re.compile(r'Data sporz'))
    # The value sits two siblings after the label cell.
    value_cell = label_cell.next_sibling.next_sibling
    return value_cell.text.strip().replace("-", "")

In [19]:
def parse_name(soup):
    """Return the normalised name of the reporting company.

    Reads the text of the node two siblings after the ``td`` cell labelled
    ``Nazwa emitenta``, strips and upper-cases it, then applies a fixed list
    of literal substitutions: double quotes and restructuring/bankruptcy
    suffixes are removed, and the spelling variants of "SPÓŁKA AKCYJNA"
    (including misspellings) and "S.A." are collapsed to ``SA``.

    Args:
        soup: parsed report document exposing a BeautifulSoup-style ``find``.

    Returns:
        The cleaned, upper-cased company name.
    """
    raw_name = soup.find('td', text="Nazwa emitenta").next_sibling.next_sibling.text
    cleaned = raw_name.strip().upper()

    # Applied in order: the longer " W UPADŁOŚCI ..." suffixes must be removed
    # before the bare " W UPADŁOŚCI", otherwise their tails would be left behind.
    substitutions = (
        ('"', ''),
        (" W RESTRUKTURYZACJI", ""),
        (" W UPADŁOŚCI LIKWIDACYJNEJ", ""),
        (" W UPADŁOŚCI UKŁADOWEJ", ""),
        (" W UPADŁOŚCI", ""),
        ("SPÓŁKA AKCYJA", "SA"),
        ("SPÓŁKA ACYJNA", "SA"),
        ("SPÓŁKA AKCJNA", "SA"),
        ("SPÓLKA AKCYJNA", "SA"),
        ("SPÓŁKA AKCYJNA", "SA"),
        ("S. A.", "SA"),
        ("S.A.", "SA"),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [20]:
def parse_url(soup):
    """Return the text of the main part of an ESPI report.

    The first ``table.nDokument`` inside ``div.dane`` is the report index.
    Its first link points (via the ``#fragment`` of its ``href``) at a named
    anchor; the table two siblings after that anchor's parent holds the report
    body, whose ``tr.nTekst`` rows are concatenated.

    Unlike the earlier version, the footer table (the second ``nDokument``
    table) is no longer looked up: it was never used, and indexing it raised
    ``IndexError`` for documents that only contain the index table.

    Args:
        soup: parsed report document exposing a BeautifulSoup-style API.

    Returns:
        The concatenated text of all ``nTekst`` rows of the report body.

    Raises:
        IndexError: if ``div.dane`` contains no ``nDokument`` table.
        AttributeError, TypeError: if any expected element is missing and the
            corresponding lookup yields ``None``.
    """
    contents = soup.find("div", {'class': 'dane'})
    # Index of the report
    section_index = contents.find_all("table", {'class': 'nDokument'})[0]
    href = section_index.find("a")['href']
    # Keep what follows the first '#'; a href without '#' is used unchanged.
    anchor_name = href.split('#', 1)[-1]
    start = contents.find('a', {'name': anchor_name})
    # The report body is the table two siblings after the anchor's parent.
    content_table = start.parent.next_sibling.next_sibling
    text_rows = content_table.find_all("tr", {'class': 'nTekst'})
    return ''.join(row.text for row in text_rows)

In [21]:
def parse_url_and_save(soup):
    """Return the ``div`` with class ``dane`` from a parsed report page.

    Despite its name, this function does not save anything; it only hands
    back whatever ``soup.find`` returns so that the caller can persist it.

    Args:
        soup: parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The result of ``soup.find("div", {'class': 'dane'})``.
    """
    return soup.find("div", {'class': 'dane'})

# 2. Collect the ESPI reports from the GPW website and save the relevant information to HTML files

In [29]:
# Range of report ids to scrape from the GPW website (the upper bound is exclusive).
FIRST_REPORT_ID = 308451
LAST_REPORT_ID_EXCLUSIVE = 310000
# Directory that receives one "<id>.html" file per scraped report.
HTML_DIR = "htmls"

start_time = datetime.now()

# Without the target directory every write below would fail, so create it up front.
os.makedirs(HTML_DIR, exist_ok=True)

driver = webdriver.Chrome()
driver.set_page_load_timeout(30)

espi = []  # not used in this cell; kept in case other cells refer to it
failed_ids = []  # (report id, reason) for every id that could not be saved

try:
    for el in range(FIRST_REPORT_ID, LAST_REPORT_ID_EXCLUSIVE):
        html_path = os.path.join(HTML_DIR, str(el) + ".html")
        # Reports already on disk are not fetched again, so the cell can be re-run.
        if os.path.exists(html_path):
            continue

        url = 'https://www.gpw.pl/komunikat?geru_id=' + str(el)
        try:
            driver.get(url)
            html = driver.page_source
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(html, "html.parser")
            contents = parse_url_and_save(soup)
            if contents is None:
                # Do not persist the literal text "None": the file would then
                # exist and this id would never be retried.
                raise ValueError("report container (div.dane) not found")
            # Serialise before opening the file so that a failure here cannot
            # leave an empty file behind.
            report_html = str(contents)
            # NOTE(review): UTF-8 is written explicitly instead of the locale
            # default; whatever reads these files must use the same encoding.
            with open(html_path, "w", encoding="utf-8") as file:
                file.write(report_html)
        except Exception as exc:
            # Best effort: remember the failure and carry on with the next id.
            # KeyboardInterrupt is deliberately not caught, so the run can be stopped.
            failed_ids.append((el, repr(exc)))
finally:
    # Always release the browser, even when the loop is interrupted. The
    # browser may already be gone (e.g. "chrome not reachable"), in which case
    # shutting it down raises as well.
    try:
        driver.quit()
    except Exception as exc:
        print(f"Could not shut down the browser cleanly: {exc!r}")

end_time = datetime.now()
print(end_time - start_time)
print(f"Number of report ids that could not be saved: {len(failed_ids)}")

WebDriverException: Message: chrome not reachable
  (Session info: chrome=80.0.3987.100)
  (Driver info: chromedriver=2.41.578700 (2f1ed5f9343c13f73144538f15c00b370eda6706),platform=Linux 5.3.0-40-generic x86_64)


# 3. Parse data from the scraped HTML files and save it to a CSV file for further processing and modelling

In [22]:
start_time = datetime.now()

# Collect every saved report below ./htmls, in a stable (sorted) order.
path = './htmls'
files = []
for root, _dirs, filenames in os.walk(path):
    for filename in filenames:
        # Match on the extension only, so e.g. "x.html.bak" is not picked up.
        if filename.endswith('.html'):
            files.append(os.path.join(root, filename))
files.sort()

dates = []
names = []
espi_filename = []
espi_info = []
espi_errors = []
errors = 0

for f in files:
    try:
        # NOTE(review): assumes the saved reports are UTF-8 encoded - confirm
        # against the way the files in ./htmls were written.
        with open(f, encoding="utf-8") as report_file:
            soup = BeautifulSoup(report_file, "html.parser")
        soup_text = parse_url(soup).lower()
        report_date = parse_date(soup)
        company_name = parse_name(soup)
    except Exception:
        # Any report that cannot be parsed completely is only listed as an error.
        espi_errors.append(f)
        errors += 1
        continue

    # Append to all four lists only after every field was parsed, so the
    # lists always stay the same length and row i describes one single file.
    espi_filename.append(f)
    dates.append(report_date)
    names.append(company_name)
    espi_info.append(soup_text)

output = pd.DataFrame({
    'filename': espi_filename,
    'date': dates,
    'name': names,
    'file_content': espi_info,
})
output.to_csv('espi_data.csv', index=False)

output_errors = pd.DataFrame({'filename': espi_errors})
output_errors.to_csv('espi_errors.csv', index=False)

end_time = datetime.now()
time_taken = end_time - start_time
print(f'To process ESPI files it took: {time_taken}')
print(f'Total number of files processed: {len(files)}')
print(f'Total number of ESPI information processed: {len(espi_info)}')
print(f'Total number of errors: {errors}')

To process ESPI files it took: 0:15:35.733726
Total number of files processed: 23447
Total number of ESPI information processed: 22333
Total number of errors: 1114
