In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [16]:
def scrape_sdi_links(url, output_dir, target_links, timeout=30):
    """Collect the hrefs of the anchors on ``url`` whose text is in ``target_links``.

    Args:
        url: Address of the page to download.
        output_dir: Unused by this function; kept so existing callers that
            pass it positionally keep working.
        target_links: Collection of anchor texts to look for. Anchor text is
            compared after stripping surrounding whitespace.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict mapping each matched anchor text to its raw ``href`` value.
        Anchors without an ``href`` are ignored; when several usable anchors
        share the same text, the last one wins. An empty dict is returned
        (and the error printed) if the download or the parsing fails.
    """
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        content = {}
        for link in soup.select('a'):
            text = link.text.strip()
            if text not in target_links:
                continue
            href = link.get('href')
            # An anchor without an href carries no usable link, and a None
            # value would break callers that prepend a base URL to it.
            if href:
                content[text] = href

        return content
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error scraping {url}: {str(e)}")
        return {}

def scrape_sdi(input_path, output_dir):
    """Look up the report links of every SDI page and save them as a CSV.

    Reads ``input_path`` (a CSV that must contain the columns ``RIC`` and
    ``SDI``), calls ``scrape_sdi_links`` for every row whose ``SDI`` value is
    present, and writes ``sdi_scraped_links.csv`` into ``output_dir``. The
    output holds ``RIC``, ``SDI`` and one column per target link text; a cell
    stays empty when the corresponding link was not found.

    Args:
        input_path: Path of the input CSV.
        output_dir: Existing directory that receives ``sdi_scraped_links.csv``.
    """
    df = pd.read_csv(input_path)
    # Work on an explicit copy: adding columns to a bare column selection
    # triggers pandas' SettingWithCopyWarning.
    sdi = df[['RIC', 'SDI']].copy()

    target_links = [
        "Complete list of substantial shareholders",
        "Consolidated list of substantial shareholders",
        "List of notices filed by substantial shareholders",
        "Complete list of directors",
        "List of notices filed by directors",
        "List of all notices"
    ]

    # One (initially empty) column per link text we are looking for.
    for link_text in target_links:
        sdi[link_text] = None

    for index, row in sdi.iterrows():
        url = row['SDI']
        if pd.isna(url):
            continue

        print(f"Scraping {url}")
        scraped_links = scrape_sdi_links(url, output_dir, target_links)

        for link_text, href in scraped_links.items():
            # A missing href cannot be concatenated with the base URL; leave
            # the cell empty instead of aborting the whole run.
            if not href:
                continue
            sdi.at[index, link_text] = 'https://di.hkex.com.hk/di/' + href

    sdi.to_csv(output_dir+'/sdi_scraped_links.csv', index=False)



In [53]:
def scrape_report(output_dir, timeout=30):
    """Download the substantial-shareholder table of every RIC and save it as CSV.

    Reads ``sdi_scraped_links.csv`` from ``output_dir``. For every row that
    has a value in the ``Complete list of substantial shareholders`` column,
    the page behind that URL is fetched and its ``grdPaging`` table is written
    to ``<output_dir>/<first four characters of RIC>/<RIC>_substantial_shareholders.csv``.
    Rows whose page cannot be scraped, or whose table has no data rows,
    produce no file.

    Args:
        output_dir: Directory containing ``sdi_scraped_links.csv``; the
            per-RIC folders are created beneath it.
        timeout: Seconds to wait for each page before giving up.
    """
    df = pd.read_csv(output_dir+'/sdi_scraped_links.csv')

    def scrape_shareholders_table(url):
        """Return the shareholder table at ``url`` as a DataFrame (empty on failure)."""
        try:
            # Without a timeout a stalled server would block the whole run.
            response = requests.get(url, timeout=timeout)
            # Treat HTTP errors as failures instead of parsing the error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table', {'id': 'grdPaging'})
            if table is None:
                return pd.DataFrame()

            header_row = table.find('tr', class_='boldtxtw')
            if header_row is None:
                # Report the real problem rather than an incidental AttributeError.
                print(f"Error scraping {url}: header row not found")
                return pd.DataFrame()
            headers = [cell.text.strip() for cell in header_row.find_all('td')]

            rows = []
            for tr in table.find_all('tr')[1:]:  # Skip header row
                tds = tr.find_all('td', class_='tbCell')
                if len(tds) != 5:  # Only keep rows that have all 5 columns
                    continue
                # The form serial number is normally wrapped in a link.
                anchor = tds[0].find('a')
                serial_cell = anchor if anchor is not None else tds[0]
                rows.append([
                    serial_cell.text.strip(),  # Form Serial Number
                    tds[1].text.strip(),       # Name of substantial shareholder
                    tds[2].text.strip(),       # Number of shares interested
                    tds[3].text.strip(),       # % of issued voting shares
                    tds[4].text.strip(),       # Date of last notice filed
                ])

            return pd.DataFrame(rows, columns=headers)
        except Exception as e:
            # Best effort: report the failure and continue with the next RIC.
            print(f"Error scraping {url}: {str(e)}")
            return pd.DataFrame()

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        ric = row['RIC']
        url = row['Complete list of substantial shareholders']

        if pd.isna(url):
            continue

        print(f"Scraping data for RIC: {ric}")
        table_data = scrape_shareholders_table(url)
        if table_data.empty:
            continue

        # Create folder if it doesn't exist
        folder_path = os.path.join(output_dir, ric[:4])
        os.makedirs(folder_path, exist_ok=True)

        # Save the CSV file in the corresponding folder
        file_path = os.path.join(folder_path, f"{ric}_substantial_shareholders.csv")
        table_data.to_csv(file_path, index=False)
        print(f"Data saved to {file_path}")

    print("Scraping and saving completed.")

In [54]:
# Driver cell: runs the shareholder-report scrape against the files under `output`.
if __name__ == "__main__":
    # Input CSV for scrape_sdi; it is read for its 'RIC' and 'SDI' columns.
    path = '../data_explorer/faf_documents.csv'
    # Directory that holds sdi_scraped_links.csv and receives the per-RIC CSV folders.
    output = '../src'
    # Step 1 (currently disabled): writes sdi_scraped_links.csv into `output`.
    # Re-enable it to rebuild that file before running step 2.
    # scrape_sdi(path,output)
    # Step 2: reads sdi_scraped_links.csv from `output` and saves one CSV per RIC.
    scrape_report(output)

Scraping data for RIC: 1477.HK
Data saved to ../src\1477\1477.HK_substantial_shareholders.csv
Scraping data for RIC: 0011.HK
Data saved to ../src\0011\0011.HK_substantial_shareholders.csv
Scraping data for RIC: 0151.HK
Data saved to ../src\0151\0151.HK_substantial_shareholders.csv
Scraping data for RIC: 0806.HK
Data saved to ../src\0806\0806.HK_substantial_shareholders.csv
Scraping data for RIC: 0909.HK
Data saved to ../src\0909\0909.HK_substantial_shareholders.csv
Scraping data for RIC: 0975.HK
Data saved to ../src\0975\0975.HK_substantial_shareholders.csv
Scraping data for RIC: 1398.HK
Data saved to ../src\1398\1398.HK_substantial_shareholders.csv
Scraping data for RIC: 1658.HK
Data saved to ../src\1658\1658.HK_substantial_shareholders.csv
Scraping data for RIC: 1787.HK
Data saved to ../src\1787\1787.HK_substantial_shareholders.csv
Scraping data for RIC: 3759.HK
Data saved to ../src\3759\3759.HK_substantial_shareholders.csv
Scraping data for RIC: 3969.HK
Data saved to ../src\3969\396