In [226]:
import fitz
import re
import pandas as pd

def read_pdf_file(file_path, start_page=1, end_page=100):
    """Return the concatenated text of a range of pages in a PDF.

    Parameters
    ----------
    file_path : str or path-like
        Location of the PDF, handed to ``fitz.open``.
    start_page : int, default 1
        1-based number of the first page to read (inclusive). Values below 1
        are treated as 1.
    end_page : int, default 100
        1-based number of the last page to read (inclusive). Clamped to the
        number of pages in the document, so the default works for PDFs with
        fewer than 100 pages.

    Returns
    -------
    str
        Text of the selected pages joined in page order; an empty string when
        the range selects no pages.
    """
    pdf_document = fitz.open(file_path)
    try:
        # Page indices are 0-based; keep the range inside the document so
        # load_page is never asked for a page that does not exist.
        first_index = max(start_page - 1, 0)
        stop_index = min(end_page, pdf_document.page_count)

        return ''.join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(first_index, stop_index)
        )
    finally:
        # Release the file handle even if text extraction fails.
        pdf_document.close()



def format_data(extracted_data):
    """Parse extracted PDF text into income-statement and balance-sheet dicts.

    The text is split on newlines. For every line containing a number written
    with whitespace-separated thousands groups (e.g. ``-262 045 000``), the
    first such number on the line is converted to ``int`` and stored under a
    label taken from the previous line if that line has no digits, otherwise
    from the next line if that one has no digits. Lines for which neither
    neighbour qualifies are ignored. A label that occurs more than once keeps
    its first position and its last value.

    The resulting mapping is cut after the key
    ``'Årsresultat etter minoritetsinteresser'`` if present, otherwise after
    ``'Årsresultat'``.

    Parameters
    ----------
    extracted_data : str
        Newline-separated text, e.g. the return value of ``read_pdf_file``.

    Returns
    -------
    tuple[dict, dict]
        ``(income_statement, balance_sheet)``: the entries up to and including
        the split key, and the entries after it, both in order of appearance.

    Raises
    ------
    ValueError
        If neither split key was found among the parsed labels.
    """
    number_pattern = re.compile(r'-?\d{1,3}(?:\s\d{3})*\s\d{3}')
    lines = extracted_data.split('\n')
    data = {}

    for i, line in enumerate(lines):
        # Only the first number on a line is used: gluing several matches
        # together yields a meaningless value and breaks int() as soon as a
        # later match carries a minus sign.
        match = number_pattern.search(line)
        if match is None:
            continue

        # The pattern's \s also accepts non-breaking/thin spaces, so every
        # whitespace character has to go, not just ' '.
        value = int(re.sub(r'\s', '', match.group()))
        if not value:
            # Zero amounts are not recorded.
            continue

        if i > 0 and not re.search(r'\d', lines[i - 1]):
            variable_name = lines[i - 1]
        elif i < len(lines) - 1 and not re.search(r'\d', lines[i + 1]):
            variable_name = lines[i + 1]
        else:
            continue

        data[variable_name] = value

    split_key_first = 'Årsresultat etter minoritetsinteresser'
    split_key_second = 'Årsresultat'

    # Prefer the more specific key; fall back to the plain one.
    if split_key_first in data:
        split_key = split_key_first
    elif split_key_second in data:
        split_key = split_key_second
    else:
        raise ValueError(
            f"Cannot split statements: neither '{split_key_first}' nor "
            f"'{split_key_second}' was found among the parsed labels"
        )

    keys = list(data)
    split_index = keys.index(split_key) + 1

    income_statement = {k: data[k] for k in keys[:split_index]}
    balance_sheet = {k: data[k] for k in keys[split_index:]}

    return income_statement, balance_sheet


def get_data(file_path, start_page=1, end_page=100):
    """Read a page range from a PDF and parse it into two statement dicts.

    Passes ``file_path``, ``start_page`` and ``end_page`` to
    ``read_pdf_file`` and feeds the resulting text to ``format_data``.

    Returns the ``(income_statement, balance_sheet)`` pair produced by
    ``format_data``.
    """
    pdf_text = read_pdf_file(file_path, start_page, end_page)
    statements = format_data(pdf_text)
    first_part, second_part = statements
    return first_part, second_part

In [227]:
# Read pages 6-11 of one year's PDF and parse the extracted text.
year = 2014
file_path = f'elkem_data/pdf/{year}.pdf'

# 1-based, inclusive page range handed to read_pdf_file.
start_page = 6
end_page = 11

pdf_data = read_pdf_file(file_path, start_page=start_page, end_page=end_page)

# format_data returns an (income_statement, balance_sheet) tuple.
data = format_data(pdf_data)

# Bare last expression so the notebook displays the parsed result.
data

({'Salgsinntekt': 8561833000,
  'Annen driftsinntekt': 90519000,
  'Sum inntekter': 8652352000,
  'Kostnader': 7788724000,
  'Varekostnad': 4251156000,
  'Lønnskostnad': 1358684000,
  'Avskrivning på varige driftsmidler og immaterielle eiendeler': 394347000,
  'Nedskrivning av varige driftsmidler og immaterielle eiendeler': 730000,
  'Other operating expences': 1982864000,
  'Other gains and losses': 53155000,
  'Sum kostnader': 8040936000,
  'Driftsresultat': 611416000,
  'Finansinntekter og finanskostnader': 417621000,
  'Inntekt på investering i datterselskap og tilknyttet selskap': -262045000,
  'Sum finansinntekter': -262045000,
  'Finance expences net': -42953000,
  'Sum finanskostnader': -42953000,
  'Netto finans': -219092000,
  'Ordinært resultat før skattekostnad': 392324000,
  'Skattekostnad på ordinært resultat': 169802000,
  'Ordinært resultat etter skattekostnad': 222522000,
  'Profit for the year from discontinued operations': 80181000,
  'Årsresultat': 302703000,
  'Min

In [236]:
import json

# Parse each year's PDF, write the result to a per-year JSON file, and keep
# every year's statements in memory keyed by year.
years = range(2014, 2022)
all_years = {}

for year in years:
    input_path = f'elkem_data/pdf/{year}.pdf'
    output_path = f'elkem_data/json/{year}.json'

    # Same page range (6-11, 1-based inclusive) for every year.
    income_statement, balance_sheet = get_data(input_path, 6, 11)

    with open(output_path, 'w') as f:
        json.dump(
            dict(income_statement=income_statement, balance_sheet=balance_sheet),
            f,
            indent=4,
        )

    all_years[year] = dict(
        income_statement=income_statement,
        balance_sheet=balance_sheet,
    )
    
    