pip3 install requests

pip3 install pandas

pip3 install -U pypdfium2

pip3 install google-generativeai

In [1]:
import os
import re

import requests
import pandas as pd
import pypdfium2 as pdfium
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Company to analyse. get_company_number() compares this name, upper-cased,
# against the titles returned by the Companies House search.
company_name = 'Barclays Plc'
# Reporting year of the accounts to fetch. Kept as a string; download_accounts()
# converts it with int() before comparing it with the filing's made-up date.
fin_year = '2023'

In [4]:
# Working folder for downloaded filings and rendered page images. Derived from
# the current user's home directory instead of a hard-coded user-specific path,
# so the notebook runs unchanged on another machine.
temp_file_loc = os.path.join(os.path.expanduser('~'), 'Downloads')
# API credentials are read from the environment so they never appear in the
# notebook; a missing variable raises KeyError here, before any request is made.
CH_API_KEY = os.environ['CH_API_KEY']
GEMINI_AI_API_KEY = os.environ['GEMINI_AI_API_KEY']

In [5]:
# Register the Gemini API key with the google.generativeai client; the
# extraction functions below call genai.upload_file / GenerativeModel.
genai.configure(api_key = GEMINI_AI_API_KEY)


def get_company_number(company_name):
    '''
    Retrieve the company number for a company name, using the Companies House API.

    Runs a company search (up to 100 hits) and returns the number of the first
    hit whose title equals the requested name, ignoring case and surrounding
    whitespace.

    Parameters:
        company_name: name to search for, e.g. 'Barclays Plc'.

    Returns:
        The 'company_number' of the first matching search hit, or None (after
        printing 'No match found') when no hit matches.

    Raises:
        requests.HTTPError: if the search request returns an error status
            (for example an invalid API key).
    '''
    search_url = 'https://api.company-information.service.gov.uk/search/companies'
    # Pass the name through `params` so requests URL-encodes it; interpolating
    # it into the URL breaks names containing '&', '#' or '+'.
    response = requests.get(
        search_url,
        params={'q': company_name, 'items_per_page': 100},
        auth=(CH_API_KEY, ''),
        timeout=30,  # without a timeout a stalled connection blocks the notebook forever
    )
    # Fail with the real HTTP error instead of a confusing KeyError on 'items'.
    response.raise_for_status()

    target = company_name.strip().upper()
    for item in response.json().get('items', []):
        if (item.get('title') or '').strip().upper() == target:
            return item['company_number']
    print('No match found')
    return None
    
        

def validate_company_number(company_name, company_number):
    '''
    Utility function to check if the company number matches with the company name on CH.
    Not used if get_company_number() is used for the retrieval.

    Parameters:
        company_name: expected company name; compared case-insensitively.
        company_number: Companies House company number to look up.

    Returns:
        True when the profile fetched for `company_number` carries a
        'company_name' equal to `company_name`; False when the names differ,
        the profile does not exist (HTTP 404) or the profile has no name.

    Raises:
        requests.HTTPError: for error statuses other than 404.
    '''
    company_url = f'https://api.company-information.service.gov.uk/company/{company_number}'
    response = requests.get(company_url, auth=(CH_API_KEY, ''), timeout=30)
    # An unknown number is a legitimate "does not match", not a crash.
    if response.status_code == 404:
        return False
    response.raise_for_status()

    registered_name = response.json().get('company_name')
    if not registered_name:
        return False
    return registered_name.strip().upper() == company_name.strip().upper()
        


def download_accounts(company_number, company_name, fin_year):
    '''
    Download the financial accounts for the company as of a given reporting year.

    Fetches the company's 'accounts' filing history (up to 100 records), picks
    the first filing whose made-up date falls in `fin_year` and whose document
    is served as PDF or XHTML, and saves it under
    <temp_file_loc>/<company_name>/.

    Parameters:
        company_number: Companies House company number; a falsy value returns None.
        company_name: used for the output folder and the file name.
        fin_year: reporting year, as a string or int.

    Returns:
        The made-up date string of the downloaded filing, or None when no
        suitable filing was found.

    Raises:
        requests.HTTPError: if the filing-history request returns an error status.
    '''
    if not company_number:
        # Nothing sensible can be requested without a company number.
        print(f'No company number available for {company_name}; skipping download.')
        return None

    print(f'Retriving filing account for {company_name} (Company Number: {company_number}) for Year {fin_year}...')
    # get accounts history (up to 100 records)
    filing_history_url = f'https://api.company-information.service.gov.uk/company/{company_number}/filing-history'
    response = requests.get(filing_history_url,
                            params={'category': 'accounts', 'items_per_page': 100},
                            auth=(CH_API_KEY, ''),
                            timeout=30)
    response.raise_for_status()
    filing_history = response.json()

    output_loc = os.path.join(temp_file_loc, company_name)
    os.makedirs(output_loc, exist_ok=True)

    # Supported document media types and the file extension each is saved with.
    extension_by_type = {
        'application/pdf': 'pdf',
        'application/xhtml+xml': 'xhtml',
    }
    target_year = int(fin_year)

    # download financial statements
    for item in filing_history.get('items', []):
        # Not every filing carries a made-up date or a document link; skip
        # those instead of failing with KeyError.
        document_date = (item.get('description_values') or {}).get('made_up_date')
        if not document_date:
            continue
        if pd.to_datetime(document_date).year != target_year:
            continue

        metadata_link = (item.get('links') or {}).get('document_metadata')
        if not metadata_link:
            continue

        document_id = metadata_link.split('/')[-1]
        document_url = f'https://document-api.company-information.service.gov.uk/document/{document_id}/content'
        # Download the document
        document_response = requests.get(document_url, auth=(CH_API_KEY, ''), timeout=60)

        # Compare only the media type: the header may carry parameters such
        # as '; charset=utf-8', or be absent on an error response.
        content_type = document_response.headers.get('Content-Type', '').split(';')[0].strip().lower()
        extension = extension_by_type.get(content_type)
        if extension is None:
            continue

        file_name = f'{company_name}_financial_statement_{document_date}.{extension}'
        with open(os.path.join(output_loc, file_name), 'wb') as f:
            f.write(document_response.content)
        print(f"Downloaded {file_name}")
        return document_date
    return None


In [6]:
def convert_pdf_to_img(pdf_file_path, pages: int|list|str = 10):
    '''
    Convert pdf pages to PNG images for gen AI processing.

    Each selected page is rendered and saved as image_<page number>.png in the
    same folder as the PDF.

    Parameters:
        pdf_file_path: path of the PDF to render.
        pages: which pages to render, using 1-based page numbers:
            - int: the first `pages` pages (capped at the document length);
            - 'All' (any casing): every page;
            - iterable of ints: exactly those pages; numbers outside
              1..len(pdf) are dropped.

    Returns:
        List of the 1-based page numbers that were actually rendered.

    Raises:
        ValueError: if `pages` is a string other than 'All'.
    '''
    pdf = pdfium.PdfDocument(pdf_file_path)
    try:
        n_page = len(pdf)
        if isinstance(pages, str):
            if pages.lower() != 'all':
                raise ValueError(f"pages must be an int, a list of page numbers or 'All', got {pages!r}")
            page_numbers = list(range(1, n_page + 1))
        elif isinstance(pages, int):
            page_numbers = list(range(1, min(n_page, pages) + 1))
        else:
            # Callers pass windows such as start..start+30 that can run past
            # the end of the document; keep only pages that exist.
            page_numbers = [p for p in pages if 1 <= p <= n_page]

        # Write next to the PDF rather than to a folder built from the
        # module-level company_name, so the result does not depend on globals.
        output_dir = os.path.dirname(os.fspath(pdf_file_path))

        for page_number in page_numbers:
            # get_page consumes zero-indexed page number
            page = pdf.get_page(page_number - 1)
            try:
                # 300/72 scales PDF points (72 per inch) to 300 pixels per inch.
                pil_image = page.render(scale=300/72).to_pil()
                pil_image.save(os.path.join(output_dir, f"image_{page_number}.png"))
            finally:
                # Release the page handle once its image is on disk.
                page.close()
    finally:
        pdf.close()
    return page_numbers


def extract_fin_statement_page_from_toc(doc_date, pages: int|list|str = 10):
    '''
    Ask Gemini for the page number of the financial statements, as listed in
    the document's table of contents.

    Renders the requested pages of the downloaded PDF (located via the
    module-level temp_file_loc and company_name) and sends them one by one to
    the model, stopping at the first page for which it answers with a number.

    Parameters:
        doc_date: made-up date string used in the downloaded PDF's file name.
        pages: page selection forwarded to convert_pdf_to_img().

    Returns:
        The page number as a string of digits, or None when no inspected page
        yielded one.
    '''
    pages = convert_pdf_to_img(f"{temp_file_loc}/{company_name}/{company_name}_financial_statement_{doc_date}.pdf", pages)

    # The model handle and the prompt do not depend on the page, so build them once.
    model = genai.GenerativeModel(model_name="gemini-1.5-pro")
    prompt = "Analyze the given image and carefully inspect if this page contains a table of content. If so, answer only the page number to the financial statement. Otherwise, answer 'No' only. "

    for page_number in pages:
        file_path = f"{temp_file_loc}/{company_name}/image_{page_number}.png"
        sample_file = genai.upload_file(path = file_path)
        # Prompt the model with text and the previously uploaded image.
        response = model.generate_content([sample_file, prompt])
        try:
            answer = response.text.strip()
        except ValueError:
            # NOTE(review): response.text is expected to raise ValueError when
            # the response carries no text part (e.g. blocked) - confirm.
            continue

        # Model answers often come with trailing newlines or punctuation, so
        # an exact comparison with 'No' would treat "No\n" as a hit.
        if not answer or answer.lower().startswith('no'):
            continue
        # Keep only the first number so the caller can safely pass it to int().
        page_match = re.search(r'\d+', answer)
        if page_match is None:
            continue

        print(f'Information found on page {page_number}')
        print(f"Financial Statement Page: {page_match.group()}")
        return page_match.group()
    print(f"Failed to extract table of content information from the pages {', '.join(map(str, pages))}.")
    return None


def extract_financials_from_statement(financials, fin_statement_page, doc_date, n_pages_to_search: int = 30):
    '''
    Ask Gemini to read a financial figure from the downloaded accounts.

    Renders a window of pages of the PDF (located via the module-level
    temp_file_loc and company_name) and sends them one by one to the model,
    asking for `financials` as of the module-level fin_year. Stops at the
    first page whose answer does not contain 'Not found'.

    Parameters:
        financials: name of the figure to extract, e.g. 'Total Assets'.
        fin_statement_page: page number where the financial statements start
            (int or string containing a number). When falsy or without a
            usable number, every page of the document is searched.
        doc_date: made-up date string used in the downloaded PDF's file name.
        n_pages_to_search: how many pages to inspect from `fin_statement_page`.

    Returns:
        The model's answer (stripped of surrounding whitespace), or None when
        no inspected page yielded the figure.
    '''
    # The page number comes from a model answer, so tolerate surrounding text
    # instead of letting int() raise on it.
    start_page = None
    if fin_statement_page:
        page_match = re.search(r'\d+', str(fin_statement_page))
        if page_match:
            start_page = int(page_match.group())

    if start_page is not None and start_page >= 1:
        pages = list(range(start_page, start_page + n_pages_to_search))
    else:
        pages = 'All'
    pages = convert_pdf_to_img(f"{temp_file_loc}/{company_name}/{company_name}_financial_statement_{doc_date}.pdf", pages)

    # The model handle and the prompt do not depend on the page, so build them once.
    model = genai.GenerativeModel(model_name="gemini-1.5-pro")
    prompt = f"Analyze the given image and carefully extract {financials} information as of {fin_year}. If found, answer this number only. Otherwise, answer only 'Not found'. "

    for page_number in pages:
        file_path = f"{temp_file_loc}/{company_name}/image_{page_number}.png"
        sample_file = genai.upload_file(path = file_path)
        # Prompt the model with text and the previously uploaded image.
        response = model.generate_content([sample_file, prompt])
        try:
            answer = response.text.strip()
        except ValueError:
            # NOTE(review): response.text is expected to raise ValueError when
            # the response carries no text part (e.g. blocked) - confirm.
            continue

        # Case-insensitive so that "not found" / "Not Found" also count as a miss.
        if answer and 'not found' not in answer.lower():
            print(f'Information found on page {page_number}')
            print(f"{financials}: {answer}")
            return answer
    print(f"Failed to extract financials from the pages {', '.join(map(str, pages))}.")
    return None
    

In [8]:
def get_company_financials(company_name, fin_year, financials):
    '''
    End-to-end lookup of one financial figure for a company and reporting year.

    Steps: resolve the company number, download the accounts for `fin_year`,
    locate the financial statements through the table of contents, then
    extract `financials` from those pages.

    Note: the extraction helpers build file paths and prompts from the
    module-level `company_name` and `fin_year`, so the values passed here
    should match those globals.

    Parameters:
        company_name: company name as registered at Companies House.
        fin_year: reporting year of the accounts.
        financials: name of the figure to extract, e.g. 'Total Assets'.

    Returns:
        The extracted value as returned by extract_financials_from_statement(),
        or None when the company, the filing or the figure could not be found.
    '''
    company_number = get_company_number(company_name)
    if company_number is None:
        # Without a company number there is no filing history to query.
        print(f'Unable to find a company number for {company_name}.')
        return None

    doc_date = download_accounts(company_number, company_name, fin_year)
    if not doc_date:
        print(f'Unable to retrieve account filed for {company_name} as of Year {fin_year}.')
        return None

    fin_statement_page = extract_fin_statement_page_from_toc(doc_date)
    # Return the result; it used to be assigned to a local and dropped, so the
    # function always returned None.
    return extract_financials_from_statement(financials, fin_statement_page, doc_date)

In [9]:
# Example run: extract 'Total Assets' for the company and year configured above.
get_company_financials(company_name, fin_year, 'Total Assets')

Retriving filing account for Barclays Plc (Company Number: 00048839) for Year 2023...
Downloaded Barclays Plc_financial_statement_2023-12-31.pdf
Information found on page 4
Financial Statement Page: 394
Information found on page 394
Total Assets: The provided text is a table of contents from Barclays' 2023 Annual Report, which directs to the pages containing different financial statements and disclosures. It doesn't directly show the total assets value.  Therefore, the answer is "Not found".
