In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException

import undetected_chromedriver as uc
from webdriver_manager.chrome import ChromeDriverManager

# from bs4 import BeautifulSoup

import os
import time
import requests
import warnings
import unicodedata
import pandas as pd
from tqdm import tqdm
from random import randint

warnings.filterwarnings('ignore')

chrome_options = uc.ChromeOptions()

chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-infobars');
chrome_options.add_argument("--safebrowsing-disable-download-protection")
chrome_options.add_argument("safebrowsing-disable-extension-blacklist")
chrome_options.add_argument("--disable-javascript")
chrome_options.add_argument("--start-maximized")
# Adding argument to disable the AutomationControlled flag 
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
 
# Exclude the collection of enable-automation switches 
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 

# Turn-off userAutomationExtension 
chrome_options.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

actions = ActionChains(driver)

In [2]:
def crawlPDF(company_number, cursor, number_of_companies):

    data = []

    driver.get(f"https://find-and-update.company-information.service.gov.uk/company/{company_number}/filing-history")

    time.sleep(3)

    driver.execute_script("window.scrollBy(0, 500);")  # Scrolls down 500px

    # Get company name and company number
    try:
        company_name = driver.find_element(By.CLASS_NAME, "heading-xlarge").text.strip().replace('\\', '').replace('/', '-')
        company_number = driver.find_element(By.ID, "company-number").text.strip()

        folder_path = f"RESULT/{company_number}_{company_name}"

        print(f"Processing company number: {cursor}/{number_of_companies} - {company_number} - {company_name}\n")

        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            tqdm.write(f"Folder created: {folder_path}")
        else:
            tqdm.write(f"Folder already exists: {folder_path}")

        # Filter for accounts
        checkbox = driver.find_element(By.ID, "filter-category-accounts")
        if not checkbox.is_selected():
            # Click the checkbox to select it
            checkbox.click()
    except NoSuchElementException:
        tqdm.write(f"Company {company_number} not found or does not have a valid filing history page.")
        return []

    # Loop through pages until there are no more pages to scrape
    page_number = 1 
    while True:
        try:

            time.sleep(2)

            table = driver.find_element(By.ID, "fhTable")

            rows = table.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
            for row in tqdm(rows[1:], desc=f"Scraping page {page_number}"):  # Skip the header row
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) > 0:
                    date = columns[0].text.strip()
                    description = columns[2].text.strip()
                    if ("Full accounts".lower() in description.lower()) and "Request Document" not in columns[3].find_element(By.TAG_NAME, "a").text.strip() :
                        link = columns[3].find_element(By.TAG_NAME, "a").get_attribute("href")
                    else:
                        link = 'None'

                    if link != 'None':
                        while True:
                            response = requests.get(link)
                            if response.status_code == 200:
                                with open(f"{folder_path}/{date}_{description}.pdf", "wb") as f:
                                    f.write(response.content)
                                break
                            else:
                                tqdm.write(f"Failed to download. Status code: {response.status_code}")
                                tqdm.write("Retrying in 5 seconds...")
                                time.sleep(5)
                    
                    data.append({
                        'Company Name': company_name,
                        'Company Number': company_number,
                        'Date': date,
                        'Description': description,
                        'Link': link
                    })

            
            time.sleep(2)
            next_page = driver.find_element(By.CLASS_NAME, "govuk-pagination__next")
            next_page.click()
            tqdm.write("Next page clicked.")
            page_number += 1

        except:
            tqdm.write("No more pages to scrape.\n\n")
            break

    # Save the data to a CSV file
    pd.DataFrame(data).to_csv(f"{folder_path}/Filling_History_{company_number}_{company_name}.csv", index=False)

    return data


FULL_DATA = []
cursor = 1
company_list = pd.read_excel('JPG-to-excel_vpM.xlsx')
company_number_list = company_list['company_number']
for company in company_number_list:
    try:
        FULL_DATA.extend(crawlPDF(company, cursor, len(company_number_list)))
        cursor += 1
    except Exception as e:
        tqdm.write(f"Error processing company {company}: {e}")
        continue

pd.DataFrame(FULL_DATA).to_csv('RESULT/Filling_History.csv', index=False)

Processing company number: 1/25 - Company number 02719242 - SONY PICTURES TELEVISION UK RIGHTS LIMITED

Folder created: RESULT/Company number 02719242_SONY PICTURES TELEVISION UK RIGHTS LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:04<00:00,  2.58s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 12/12 [00:22<00:00,  1.91s/it]


No more pages to scrape.


Processing company number: 2/25 - Company number 02829355 - SONY INTERACTIVE ENTERTAINMENT UK LIMITED

Folder created: RESULT/Company number 02829355_SONY INTERACTIVE ENTERTAINMENT UK LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:04<00:00,  2.58s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 14/14 [00:22<00:00,  1.62s/it]


No more pages to scrape.


Processing company number: 3/25 - Company number 03277793 - SONY INTERACTIVE ENTERTAINMENT EUROPE LIMITED

Folder created: RESULT/Company number 03277793_SONY INTERACTIVE ENTERTAINMENT EUROPE LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:03<00:00,  2.55s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 11/11 [00:00<00:00, 33.22it/s]


No more pages to scrape.


Processing company number: 4/25 - Company number 03754597 - SONY INTERACTIVE ENTERTAINMENT DIRECT EUROPE LIMITED

Folder created: RESULT/Company number 03754597_SONY INTERACTIVE ENTERTAINMENT DIRECT EUROPE LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:14<00:00,  2.97s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


No more pages to scrape.


Processing company number: 5/25 - Company number 03602378 - SONY ENTERTAINMENT HOLDINGS EUROPE LIMITED

Folder created: RESULT/Company number 03602378_SONY ENTERTAINMENT HOLDINGS EUROPE LIMITED


Scraping page 1: 100%|██████████| 25/25 [00:47<00:00,  1.90s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 9/9 [00:02<00:00,  3.64it/s]


No more pages to scrape.


Processing company number: 6/25 - Company number 02351702 - SONY CORPORATE SERVICES EUROPE LIMITED

Folder created: RESULT/Company number 02351702_SONY CORPORATE SERVICES EUROPE LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:11<00:00,  2.86s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 15/15 [00:03<00:00,  4.80it/s]


No more pages to scrape.


Processing company number: 7/25 - Company number 03236308 - SONY PICTURES TELEVISION PRODUCTION UK LIMITED

Folder created: RESULT/Company number 03236308_SONY PICTURES TELEVISION PRODUCTION UK LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:30<00:00,  3.62s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 7/7 [00:05<00:00,  1.33it/s]


No more pages to scrape.


Processing company number: 8/25 - Company number FC035527 - SONY EUROPE B.V.

Folder created: RESULT/Company number FC035527_SONY EUROPE B.V.


Scraping page 1: 100%|██████████| 6/6 [00:26<00:00,  4.38s/it]


No more pages to scrape.


Processing company number: 9/25 - Company number 15990239 - SONY UK TECHNOLOGY CENTRE LIMITED

Folder created: RESULT/Company number 15990239_SONY UK TECHNOLOGY CENTRE LIMITED


Scraping page 1: 100%|██████████| 1/1 [00:00<00:00, 37.03it/s]


No more pages to scrape.


Processing company number: 10/25 - Company number 01326236 - SONY MUSIC ENTERTAINMENT EURODISC LIMITED

Folder created: RESULT/Company number 01326236_SONY MUSIC ENTERTAINMENT EURODISC LIMITED


Scraping page 1: 100%|██████████| 25/25 [00:31<00:00,  1.28s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 23/23 [00:26<00:00,  1.15s/it]


No more pages to scrape.


Processing company number: 11/25 - Company number 04965804 - SONY MUSIC PUBLISHING (UK) LIMITED

Folder created: RESULT/Company number 04965804_SONY MUSIC PUBLISHING (UK) LIMITED


Scraping page 1: 100%|██████████| 22/22 [01:06<00:00,  3.02s/it]


No more pages to scrape.


Processing company number: 12/25 - Company number 15698470 - SONY EUROPE LIMITED

Folder created: RESULT/Company number 15698470_SONY EUROPE LIMITED


Scraping page 1: 100%|██████████| 1/1 [00:00<00:00, 25.26it/s]


No more pages to scrape.


Processing company number: 13/25 - Company number 06020283 - SONY INTERACTIVE ENTERTAINMENT NETWORK EUROPE LIMITED

Folder created: RESULT/Company number 06020283_SONY INTERACTIVE ENTERTAINMENT NETWORK EUROPE LIMITED


Scraping page 1: 100%|██████████| 18/18 [00:46<00:00,  2.60s/it]


No more pages to scrape.


Processing company number: 14/25 - Company number 05640889 - SONY DADC UK LIMITED

Folder created: RESULT/Company number 05640889_SONY DADC UK LIMITED


Scraping page 1: 100%|██████████| 19/19 [00:55<00:00,  2.93s/it]


No more pages to scrape.


Processing company number: 15/25 - Company number 06583810 - SONY MUSIC PUBLISHING EUROPE LIMITED

Folder created: RESULT/Company number 06583810_SONY MUSIC PUBLISHING EUROPE LIMITED


Scraping page 1: 100%|██████████| 17/17 [00:43<00:00,  2.56s/it]


No more pages to scrape.


Processing company number: 16/25 - Company number 03431011 - SONY-ATV MUSIC PUBLISHING (UK) LIMITED

Folder created: RESULT/Company number 03431011_SONY-ATV MUSIC PUBLISHING (UK) LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:10<00:00,  2.82s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 5/5 [00:09<00:00,  1.99s/it]


No more pages to scrape.


Processing company number: 17/25 - UK establishment number BR020612 - SONY EUROPE B.V.

Folder created: RESULT/UK establishment number BR020612_SONY EUROPE B.V.
Company UK establishment number BR020612 not found or does not have a valid filing history page.
Processing company number: 18/25 - Company number 02019657 - SONY MUSIC ENTERTAINMENT INTERNATIONAL LIMITED

Folder created: RESULT/Company number 02019657_SONY MUSIC ENTERTAINMENT INTERNATIONAL LIMITED


Scraping page 1:  28%|██▊       | 7/25 [00:23<00:58,  3.25s/it]

Failed to download. Status code: 502
Retrying in 5 seconds...


Scraping page 1: 100%|██████████| 25/25 [01:09<00:00,  2.77s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 23/23 [00:21<00:00,  1.06it/s]


No more pages to scrape.


Processing company number: 19/25 - Company number 04120046 - SONY GLOBAL TREASURY SERVICES PLC

Folder created: RESULT/Company number 04120046_SONY GLOBAL TREASURY SERVICES PLC


Scraping page 1: 100%|██████████| 24/24 [00:12<00:00,  1.95it/s]


No more pages to scrape.


Processing company number: 20/25 - Company number 01481009 - SONY MUSIC ENTERTAINMENT ARIOLA RECORDS LTD.

Folder created: RESULT/Company number 01481009_SONY MUSIC ENTERTAINMENT ARIOLA RECORDS LTD.


Scraping page 1: 100%|██████████| 25/25 [00:05<00:00,  4.79it/s]


Next page clicked.


Scraping page 2: 100%|██████████| 20/20 [00:00<00:00, 35.30it/s]


No more pages to scrape.


Processing company number: 21/25 - Company number 01442233 - SONY MUSIC ENTERTAINMENT ARISTA RECORDS LTD.

Folder created: RESULT/Company number 01442233_SONY MUSIC ENTERTAINMENT ARISTA RECORDS LTD.


Scraping page 1: 100%|██████████| 25/25 [00:05<00:00,  4.75it/s]


Next page clicked.


Scraping page 2: 100%|██████████| 20/20 [00:01<00:00, 16.78it/s]


No more pages to scrape.


Processing company number: 22/25 - Company number 03196274 - SONY MUSIC INTERACTIVE & VIDEO LTD.

Folder created: RESULT/Company number 03196274_SONY MUSIC INTERACTIVE & VIDEO LTD.


Scraping page 1: 100%|██████████| 25/25 [00:05<00:00,  4.30it/s]


Next page clicked.


Scraping page 2: 100%|██████████| 7/7 [00:09<00:00,  1.41s/it]


No more pages to scrape.


Processing company number: 23/25 - Company number 03185450 - SONY MUSIC ENTERTAINMENT UK HOLDINGS LIMITED

Folder created: RESULT/Company number 03185450_SONY MUSIC ENTERTAINMENT UK HOLDINGS LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:10<00:00,  2.84s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 7/7 [00:05<00:00,  1.40it/s]


No more pages to scrape.


Processing company number: 24/25 - Company number 01471066 - SONY MUSIC ENTERTAINMENT UK LIMITED

Folder created: RESULT/Company number 01471066_SONY MUSIC ENTERTAINMENT UK LIMITED


Scraping page 1: 100%|██████████| 25/25 [01:20<00:00,  3.24s/it]


Next page clicked.


Scraping page 2: 100%|██████████| 25/25 [00:39<00:00,  1.57s/it]


No more pages to scrape.


Processing company number: 25/25 - Company number OE029601 - SONY EUROPE B.V.

Folder created: RESULT/Company number OE029601_SONY EUROPE B.V.
No more pages to scrape.


