This function will live in a separate class that is responsible for obtaining the download links for each set of GLEIF data. That way each script can process its own data individually, while the logic for grabbing a link does not have to be repeated in each script.

In [2]:
from bs4 import BeautifulSoup
import os
import requests
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sqlite3
import zipfile
from pyvis.network import Network
import networkx as nx
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import urllib.parse

class GLEIF_Data_Helpers:
    """
    Shared helpers for the GLEIF pipeline scripts: scraping a Golden Copy
    download link, opening the SQLite store, and downloading/unpacking a zip.

    The three boolean flags select which download button(s) on the Golden Copy
    page are clicked (button 0: Level 1, button 1: Level 2 trees, button 2:
    Level 2 reporting exceptions).
    """

    def __init__(self, bool_Level_1 = False, bool_Level_2_Trees = False, bool_Level_2_Reporting_Exceptions = False):
        self.bool_Level_1 = bool_Level_1
        self.bool_Level_2_Trees = bool_Level_2_Trees
        self.bool_Level_2_Reporting_Exceptions = bool_Level_2_Reporting_Exceptions

    def get_level_download_links(self, str_driver_path = r"C:\Drivers\Google\chromedriver-win64\chromedriver-win64\chromedriver.exe"):
        """
        Uses selenium to scrape the JSON download link for the data set(s)
        selected by the flags given to the constructor.

        @param str_driver_path: path of the chromedriver executable to launch
        @return: str_download_link - href of the first JSON download anchor on the page
        @raise IndexError: if the page exposes no JSON download anchor
        """
        service = Service(str_driver_path)
        driver = webdriver.Chrome(service=service)

        # Always shut the browser down, even when a wait times out or a click
        # fails; quit() also ends the chromedriver process, close() would not.
        try:
            driver.get(url = "https://www.gleif.org/en/lei-data/gleif-golden-copy/download-the-golden-copy#/")

            # The cookie banner has to be dismissed before the buttons are usable
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'CybotCookiebotDialogBodyButton'))
            )
            cookie_button.click()

            download_buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gc-download-button'))
            )

            if self.bool_Level_1:
                download_buttons[0].click()
            if self.bool_Level_2_Trees:
                download_buttons[1].click()
            if self.bool_Level_2_Reporting_Exceptions:
                download_buttons[2].click()

            page_source = driver.page_source
        finally:
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        list_json_links = soup.find_all("a" , class_ = "gc-icon gc-icon--json")
        if not list_json_links:
            raise IndexError("No JSON download link was found on the GLEIF Golden Copy page")

        str_download_link = list_json_links[0]["href"]

        return str_download_link

    def create_sql_instance(self, str_db_name, str_table_name):
        """
        Opens (or creates) the SQLite database and makes sure the target table exists.

        @param str_db_name: database file name without the '.db' suffix
        @param str_table_name: table to create if missing; it is interpolated
                               into the SQL text, so it must be a trusted identifier
        @return: (conn, cursor) - the open connection and a cursor on it; the
                 caller is responsible for closing the connection
        """
        # Connect with a timeout so concurrent writers wait on locks instead of failing at once
        conn = sqlite3.connect(f'{str_db_name}.db', timeout=10)
        try:
            conn.execute('PRAGMA journal_mode=WAL;')  # Enable WAL mode for concurrency
            cursor = conn.cursor()

            # Create the table with an id and JSON field (storing JSON as TEXT)
            cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {str_table_name} (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            data TEXT
            )
            ''')
        except Exception:
            # Do not leak the connection when setup fails
            conn.close()
            raise

        return conn, cursor

    def unpacking_GLEIF_zip_files(self , str_download_link , str_zip_file_path , str_unpacked_zip_file_path):
        """
        Downloads the zip behind str_download_link, extracts it and returns the
        path of the first extracted file.

        @param str_download_link: URL of the zip archive
        @param str_zip_file_path: where the downloaded archive is written
        @param str_unpacked_zip_file_path: directory the archive is extracted into
        @return: str_json_file_path - path of the first file member of the archive
        @raise IndexError: if the archive contains no file members
        """
        with requests.Session() as session:
            # Stream to disk so the archive is never held in memory as a whole
            with session.get(url = str_download_link, stream = True) as response:
                # Fail here on an HTTP error instead of saving an error page as a "zip"
                response.raise_for_status()
                with open(str_zip_file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size = 1024 * 1024):
                        f.write(chunk)

        with zipfile.ZipFile(str_zip_file_path, 'r') as zip_ref:
            os.makedirs(str_unpacked_zip_file_path, exist_ok=True)
            zip_ref.extractall(str_unpacked_zip_file_path)
            # Take the name from the archive itself: listing the directory could
            # return a stale file left behind by an earlier run.
            list_member_names = [name for name in zip_ref.namelist() if not name.endswith("/")]

        if not list_member_names:
            raise IndexError(f"Archive {str_zip_file_path} contains no files")

        str_json_file_path = os.path.normpath(os.path.join(str_unpacked_zip_file_path, list_member_names[0]))

        return str_json_file_path

ModuleNotFoundError: No module named 'pyvis'

This is the backend logic which produces a list of graphs, where each element represents a company ecosystem

In [None]:
import json
import bigjson
import sqlite3
import networkx as nx


class GLEIFLevel2Data:
    """
    Downloads the GLEIF Level 2 relationship data and stores every relationship
    record as one JSON text row in the SQLite table "Level_2_Tree_Data".
    """

    def __init__(self):
        self.str_level_2_unpacked_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_2_Data\RR_CDF_Data\Unpacked_Zip"
        self.str_level_2_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_2_Data\RR_CDF_Data\.Level_2_RR_CDF.zip"
        self.obj_data_helpers = GLEIF_Data_Helpers(bool_Level_2_Trees = True)

    def insert_json_data(self, json_data , conn , cursor , str_table_name, bool_commit = True):
        """
        Inserts one record, serialised as JSON text, into str_table_name.

        @param json_data: JSON-serialisable object to store
        @param conn: open sqlite3 connection
        @param cursor: cursor belonging to conn
        @param str_table_name: target table (interpolated into the SQL, must be trusted)
        @param bool_commit: commit right after the insert (default); pass False
                            when the caller batches many inserts into one commit
        """
        cursor.execute(f'''
        INSERT INTO {str_table_name}  (data)
        VALUES (?)
        ''', (json.dumps(json_data),))
        if bool_commit:
            conn.commit()

    def storing_GLEIF_data_in_database(self):
        """
        Scrapes the Level 2 download link, downloads and unpacks the archive,
        then streams every entry of the file's "relations" array into the database.
        """
        str_table_name = "Level_2_Tree_Data"
        # Committing per row forces a disk sync for every record; commit in batches instead
        int_commit_interval = 10000

        str_level_2_download_link = self.obj_data_helpers.get_level_download_links()
        str_json_file_path = self.obj_data_helpers.unpacking_GLEIF_zip_files(str_download_link = str_level_2_download_link , str_zip_file_path = self.str_level_2_zip_file_path , str_unpacked_zip_file_path = self.str_level_2_unpacked_zip_file_path)
        conn, cursor = self.obj_data_helpers.create_sql_instance(str_table_name = str_table_name , str_db_name = "GLEIF_Data")

        # Close the connection even if parsing or inserting fails part-way through
        try:
            with open(str_json_file_path, 'r' , encoding='utf-8') as file:
                # bigjson reads lazily, so the whole file is never loaded at once
                obj_json = bigjson.load(file)
                for int_row_number, dict_lei in enumerate(obj_json["relations"], start = 1):
                    self.insert_json_data(json_data = dict_lei.to_python() , conn = conn , cursor = cursor , str_table_name = str_table_name , bool_commit = False)
                    if int_row_number % int_commit_interval == 0:
                        conn.commit()
            # Persist the final, possibly partial, batch
            conn.commit()
        finally:
            conn.close()
        

Graph Algorithm

In [3]:
from pyvis.network import Network
import networkx as nx
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import urllib.parse

def visualize_graph_interactive_html(G: nx.DiGraph, title: str = 'Interactive_Graph') -> str:
    """
    Renders the graph with PyVis and hands back the resulting page as text.

    :param G: The NetworkX directed graph to visualize.
    :param title: Name used only in the success message printed by this function.
    :return: HTML string of the interactive graph, or "" if generation failed.
    """
    # Remote CDN resources keep the generated page self-sufficient
    network = Network(notebook=False, directed=True, height='750px', width='100%', cdn_resources='remote')

    # One PyVis node per graph node; the hover text shows how many distinct
    # parents and children are recorded in the node's attributes.
    for lei, attributes in G.nodes(data=True):
        parent_count = len(attributes.get('parent_relationships', {}))
        child_count = len(attributes.get('child_relationships', {}))
        hover_text = f"Node: {lei}<br>Parents: {parent_count}<br>Children: {child_count}"
        network.add_node(lei, label=lei, title=hover_text)

    # Mirror every directed edge
    for edge_start, edge_end in G.edges():
        network.add_edge(edge_start, edge_end)

    # Expose the physics controls and keep the simulation switched on
    network.show_buttons(filter_=['physics'])
    network.toggle_physics(True)

    try:
        rendered_page = network.generate_html()
        print(f"Interactive graph '{title}' generated successfully.")
    except Exception as error:
        print(f"Error generating the interactive graph: {error}")
        rendered_page = ""
    return rendered_page

def display_html_in_browser(html_content: str):
    """
    Opens the given HTML in a Chrome window driven by Selenium and leaves the
    window open after the call returns.

    :param html_content: The HTML content to display; nothing is opened when it is empty.
    """
    if html_content:
        # The page is passed to the browser as a percent-encoded data URL
        data_url = "data:text/html;charset=utf-8," + urllib.parse.quote(html_content)

        try:
            # webdriver_manager resolves a matching chromedriver
            chrome_service = ChromeService(ChromeDriverManager().install())

            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument("--start-maximized")
            # "detach" keeps the browser alive once the script has finished
            chrome_options.add_experimental_option("detach", True)

            browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
            browser.get(data_url)
        except Exception as error:
            print(f"Error opening the interactive graph in the browser: {error}")
    else:
        print("No HTML content to display.")


ModuleNotFoundError: No module named 'pyvis'

In [4]:
import sqlite3
import json
import networkx as nx
from typing import List, Dict, Optional, Set
from datetime import datetime

# Define relationship types and categorize them
class RelationshipType:
    """Relationship type codes found in the Level 2 records, with groupings used for filtering."""

    # Consolidation relationships
    DIRECT = 'IS_DIRECTLY_CONSOLIDATED_BY'
    DIRECT_RELATIONSHIPS = {DIRECT}

    ULTIMATE = 'IS_ULTIMATELY_CONSOLIDATED_BY'
    ULTIMATE_RELATIONSHIPS = {ULTIMATE}

    # Branch and fund relationships
    BRANCH = 'IS_INTERNATIONAL_BRANCH_OF'
    SUBFUND = 'IS_SUBFUND_OF'
    FEEDER = 'IS_FEEDER_TO'
    FUND_MANAGED = 'IS_FUND-MANAGED_BY'
    OTHER_RELATIONSHIPS = {
        BRANCH,
        SUBFUND,
        FEEDER,
        FUND_MANAGED,
    }

def relationship_list_generator(str_db_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\GLEIF_Data.db"):
    """
    Reads every row of the Level_2_Tree_Data table and flattens each stored
    relationship record into a list of eight strings.

    :param str_db_path: Path of the SQLite database to read; defaults to the
                        pipeline's GLEIF_Data.db.
    :return: List of rows, each being
             [start node id, end node id, relationship type, relationship status,
              registration status, initial registration date, last update date,
              next renewal date].
    :raises sqlite3.OperationalError: if the table does not exist in the database.
    """
    list_rows = []

    conn = sqlite3.connect(str_db_path, check_same_thread=False)
    # Release the connection even when the query or JSON parsing fails
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT data FROM Level_2_Tree_Data')

        # Iterate the cursor directly so the raw JSON text of all rows is not
        # held in memory at the same time as the parsed result.
        for (json_text,) in cursor:
            dict_record = json.loads(json_text)["RelationshipRecord"]
            dict_relationship = dict_record["Relationship"]
            dict_registration = dict_record["Registration"]

            list_rows.append([
                dict_relationship["StartNode"]["NodeID"]["$"],
                dict_relationship["EndNode"]["NodeID"]["$"],
                dict_relationship["RelationshipType"]["$"],
                dict_relationship["RelationshipStatus"]["$"],
                dict_registration["RegistrationStatus"]["$"],
                dict_registration["InitialRegistrationDate"]["$"],
                dict_registration["LastUpdateDate"]["$"],
                dict_registration["NextRenewalDate"]["$"],
            ])
    finally:
        conn.close()

    return list_rows

def create_relationship_dict(relationship_type, relationship_status, registration_status, initial_registration_date, last_update_date, next_renewal_date):
    """
    Bundles the six attributes of one relationship into a dictionary keyed by
    the attribute names.
    """
    field_names = (
        'relationship_type',
        'relationship_status',
        'registration_status',
        'initial_registration_date',
        'last_update_date',
        'next_renewal_date',
    )
    field_values = (
        relationship_type,
        relationship_status,
        registration_status,
        initial_registration_date,
        last_update_date,
        next_renewal_date,
    )
    return dict(zip(field_names, field_values))

def _record_relationship(G: nx.DiGraph, rel: List[str]) -> None:
    """Adds the child -> parent edge for rel to G and stores its attributes on both end nodes."""
    child_lei, parent_lei, relationship_type, relationship_status, registration_status, \
    initial_registration_date, last_update_date, next_renewal_date = rel

    G.add_edge(child_lei, parent_lei)

    relationship_dict = create_relationship_dict(
        relationship_type, relationship_status, registration_status,
        initial_registration_date, last_update_date, next_renewal_date
    )

    # The same dictionary is visible from both sides of the relationship
    G.nodes[child_lei]['parent_relationships'].setdefault(parent_lei, []).append(relationship_dict)
    G.nodes[parent_lei]['child_relationships'].setdefault(child_lei, []).append(relationship_dict)


def build_graph(relationships: List[List[str]], relationship_types_to_include: Optional[Set[str]] = None) -> nx.DiGraph:
    """
    Builds a directed graph (edges point from child to parent) from the list of
    relationships, adding relationship attributes directly to the nodes.

    Only relationships whose type is in relationship_types_to_include contribute
    nodes and edges. An ultimate-parent relationship is dropped as redundant
    when the child already reaches that parent through a chain of two or more
    direct relationships; it is kept when the pair is also directly linked or
    when no direct chain connects them.

    :param relationships: List of relationships, each an 8-item list:
        [child LEI, parent LEI, relationship type, relationship status,
         registration status, initial registration date, last update date,
         next renewal date]
    :param relationship_types_to_include: Optional set of relationship types to
        include. None means every type defined on RelationshipType.
    :return: A networkx DiGraph whose nodes carry 'child_relationships' and
        'parent_relationships' dictionaries (other LEI -> list of attribute dicts)
    """
    import logging

    G = nx.DiGraph()
    G_direct = nx.DiGraph()  # direct relationships only, used for the redundancy check

    # If no specific relationship types are provided, include all
    if relationship_types_to_include is None:
        relationship_types_to_include = {
            RelationshipType.DIRECT,
            RelationshipType.ULTIMATE,
            *RelationshipType.OTHER_RELATIONSHIPS
        }

    # Logged rather than display()-ed so the function also works outside IPython
    logging.getLogger(__name__).debug("Building graph for relationship types: %s", relationship_types_to_include)

    # Apply the type filter once, then split into the three processing groups
    included_relationships = [rel for rel in relationships if rel[2] in relationship_types_to_include]
    direct_relationships = [rel for rel in included_relationships if rel[2] == RelationshipType.DIRECT]
    ultimate_relationships = [rel for rel in included_relationships if rel[2] == RelationshipType.ULTIMATE]
    other_relationships = [
        rel for rel in included_relationships
        if rel[2] not in (RelationshipType.DIRECT, RelationshipType.ULTIMATE)
    ]

    # Initialize node attributes for the included relationships only, so that
    # excluded relationship types do not leak into the graph as isolated nodes.
    for rel in included_relationships:
        for lei in (rel[0], rel[1]):
            if lei not in G.nodes:
                G.add_node(lei, child_relationships={}, parent_relationships={})

    # Direct relationships go into both G and G_direct
    for rel in direct_relationships:
        G_direct.add_edge(rel[0], rel[1])
        _record_relationship(G, rel)

    # Other relationships (non-direct, non-ultimate) only go into G
    for rel in other_relationships:
        _record_relationship(G, rel)

    # Ultimate relationships are processed last: the redundancy check needs
    # every direct edge to be present in G_direct already.
    for rel in ultimate_relationships:
        child_lei, ultimate_parent_lei = rel[0], rel[1]

        # nx.has_path needs both end points to exist in the graph it searches
        G_direct.add_node(child_lei)
        G_direct.add_node(ultimate_parent_lei)

        direct_edge_exists = G_direct.has_edge(child_lei, ultimate_parent_lei)
        if not direct_edge_exists and nx.has_path(G_direct, child_lei, ultimate_parent_lei):
            # Reachable through intermediate direct parents; the ultimate edge adds nothing
            continue

        _record_relationship(G, rel)

    return G

def extract_company_ecosystems(G: nx.DiGraph) -> List[nx.DiGraph]:
    """
    Splits the graph into its ecosystems, i.e. the groups of nodes that are
    connected to each other when edge direction is ignored.

    :param G: The graph to extract ecosystems from
    :return: A list of independent subgraph copies, one per company ecosystem
    """
    # Direction is irrelevant for membership, so components are computed on an undirected copy
    undirected_graph = G.to_undirected()
    node_groups = list(nx.connected_components(undirected_graph))

    # Each ecosystem keeps the original directed edges and node attributes
    return [G.subgraph(node_group).copy() for node_group in node_groups]

def create_leis_to_graph(ecosystems: List[nx.DiGraph]) -> Dict[str, nx.DiGraph]:
    """
    Builds a lookup from every LEI (node) to the ecosystem graph that contains it.

    :param ecosystems: List of NetworkX DiGraph objects, each representing a company ecosystem.
    :return: Dictionary with LEI as keys and their corresponding DiGraph as values.
    """
    # If a LEI occurs in several graphs, the last graph in the list wins
    return {
        lei: ecosystem
        for ecosystem in ecosystems
        for lei in ecosystem.nodes()
    }

def graph_processor(include_company_relationships: bool = True, include_fund_relationships: bool = True):
    """
    Entry point: loads the stored relationships, builds the graph for the
    requested relationship families and splits it into company ecosystems.

    :param include_company_relationships: Boolean flag to include company relationships (direct and ultimate).
    :param include_fund_relationships: Boolean flag to include fund relationships.
    :return: List of company ecosystem subgraphs and mapping from LEI to graphs.
    """
    list_relationships = relationship_list_generator()

    company_types = {RelationshipType.DIRECT, RelationshipType.ULTIMATE}
    fund_types = set(RelationshipType.OTHER_RELATIONSHIPS)

    # Collect the relationship types selected by the two flags
    selected_types = set()
    if include_company_relationships:
        selected_types |= company_types
    if include_fund_relationships:
        selected_types |= fund_types

    # With both flags off nothing would be left, so fall back to every type
    if not selected_types:
        selected_types = company_types | fund_types

    graph = build_graph(list_relationships, selected_types)
    ecosystems = extract_company_ecosystems(graph)
    leis_to_graph = create_leis_to_graph(ecosystems=ecosystems)

    return ecosystems, leis_to_graph


In [7]:
# Load every stored relationship row and show the type of the returned container.
list_relationships = relationship_list_generator()
display(type(list_relationships))

list

In [5]:
# Build the ecosystems with both flags left at their defaults (company and fund relationships).
ecosystems, dict_leis_graph = graph_processor()

OperationalError: no such table: Level_2_Tree_Data

In [None]:
# Number of ecosystems found
display(len(ecosystems))

In [None]:
# Size of the ecosystem stored at index 7
display(len(ecosystems[7]))

In [None]:
# Rebuild the ecosystems with fund relationships switched off.
ecosystems_company_only, dict_leis_to_graph_company_only = graph_processor(
    include_company_relationships=True,
    include_fund_relationships=False
)

In [None]:
# Number of ecosystems in the company-only run
display(len(ecosystems_company_only))

In [None]:
# Index of the ecosystem to inspect. The printed labels are derived from it so
# the heading always matches the graph that is actually shown.
eco_index = 7

# Guard on the length, not just on non-emptiness: a run that produces fewer
# ecosystems than eco_index + 1 would otherwise raise IndexError.
if len(ecosystems) > eco_index:
    eco = ecosystems[eco_index]

    print(f"\nEcosystem {eco_index}:")
    print(f"Nodes: {list(eco.nodes())}")

    for node, attrs in eco.nodes(data=True):
        print(f"\nNode: {node}")

        # Child and parent relationships are printed in the same layout
        for role, attr_key in (("Child", "child_relationships"), ("Parent", "parent_relationships")):
            print(f"  {role} Relationships:")
            relationships_by_node = attrs.get(attr_key, {})
            for related_node, relationships_list in relationships_by_node.items():
                for rel in relationships_list:
                    print(f"    {role} Node: {related_node}")
                    for key, value in rel.items():
                        print(f"      {key.replace('_', ' ').title()}: {value}")
            if not relationships_by_node:
                print(f"    (No {role.lower()} relationships)")

    # Render the selected ecosystem and open it in the browser
    str_graph_eco = visualize_graph_interactive_html(eco, title=f'Ecosystem {eco_index} - Interactive Graph')
    display_html_in_browser(str_graph_eco)

elif ecosystems:
    print(f"Only {len(ecosystems)} ecosystems found; index {eco_index} is out of range.")
else:
    print("No ecosystems found.")


In [None]:
# Rebuild the ecosystems with company relationships switched off.
ecosystems_fund_only, dict_leis_to_fund_company_only = graph_processor(
    include_company_relationships=False,
    include_fund_relationships=True
)

In [None]:
# Number of ecosystems in the fund-only run
display(len(ecosystems_fund_only))

In [None]:
# Look up the company-only ecosystem graph that contains this LEI.
yaya = dict_leis_to_graph_company_only["Q774KI4AW80FHFW33O61"]

In [None]:
# Print every node of the graph held in `yaya` together with its child and
# parent relationships, then render the graph and open it in the browser.
# NOTE(review): the guard tests `ecosystems`, not `yaya`, and the printed
# "Ecosystem 2" labels are hard-coded rather than derived from the graph shown.
if ecosystems:
    # Inspect the graph that was looked up into `yaya`
    eco = yaya
    
    print(f"\nEcosystem 2:")
    print(f"Nodes: {list(eco.nodes())}")
    
    for node, attrs in eco.nodes(data=True):
        print(f"\nNode: {node}")
        # Relationships in which this node is the parent
        print(f"  Child Relationships:")
        for child_node, relationships_list in attrs.get('child_relationships', {}).items():
            for rel in relationships_list:
                print(f"    Child Node: {child_node}")
                # Attribute keys are shown in title case, e.g. "Relationship Type"
                for key, value in rel.items():
                    print(f"      {key.replace('_', ' ').title()}: {value}")
        if not attrs.get('child_relationships'):
            print("    (No child relationships)")
        
        # Relationships in which this node is the child
        print(f"  Parent Relationships:")
        for parent_node, relationships_list in attrs.get('parent_relationships', {}).items():
            for rel in relationships_list:
                print(f"    Parent Node: {parent_node}")
                for key, value in rel.items():
                    print(f"      {key.replace('_', ' ').title()}: {value}")
        if not attrs.get('parent_relationships'):
            print("    (No parent relationships)")
    
    # Render the graph to HTML and open it in the browser
    str_graph_eco = visualize_graph_interactive_html(eco, title='Ecosystem 2 - Interactive Graph')
    display_html_in_browser(str_graph_eco)
    
else:
    print("No ecosystems found.")


In [None]:
import pickle

# Persist the LEI -> ecosystem graph mapping of the full run.
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_graph.pickle" , "wb") as file:
    pickle.dump(dict_leis_graph, file)  # dump the object as a binary file

In [None]:
# Persist the LEI -> ecosystem graph mapping of the company-only run.
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_graph_company_only.pickle" , "wb") as file:
    pickle.dump(dict_leis_to_graph_company_only, file)  # dump the object as a binary file

In [None]:
# Persist the LEI -> ecosystem graph mapping of the fund-only run.
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_to_fund_company_only.pickle" , "wb") as file:
    pickle.dump(dict_leis_to_fund_company_only, file)  # dump the object as a binary file

In [None]:
import logging

In [None]:
# The log file below is opened relative to the working directory, so the
# "logging" folder has to exist there (not one level up) before basicConfig
# runs. exist_ok makes the cell safe to re-run.
os.makedirs("logging", exist_ok=True)

logging.basicConfig(filename = "logging/Company_Facts_Baseline.log" , level = logging.DEBUG, format = '%(levelname)s: %(message)s' , filemode = "w")

# NOTE(review): existence is checked for "file_lib" in the working directory but
# the folder is created one level up - confirm which location is intended.
# exist_ok prevents a FileExistsError when the cell is run a second time.
if not os.path.exists("file_lib"):
    os.makedirs("../file_lib", exist_ok=True)


In [None]:
import os
import sys

# Put the folder three levels above the working directory on sys.path so that
# packages located there can be imported from this notebook.
current_directory = os.getcwd()
target_directory = os.path.abspath(os.path.join(current_directory, "..", "..", ".."))
sys.path.append(target_directory)

from D_Infastructure import System_Helpers

In [21]:
# Show the working directory the notebook is running from
display(current_directory)

'C:\\Users\\mattp\\Work_Related\\Systematic_Trading\\Library\\B_Notebooks\\GLIEF_company_data_pipeline\\Work_bench'

In [22]:
# Show the directory that was appended to sys.path
display(target_directory)

'C:\\Users\\mattp\\Work_Related\\Systematic_Trading\\Library'