In [None]:
import networkx as nx
import psycopg2
import sys
import os
current_directory = os.getcwd()
target_directory = os.path.abspath(os.path.join(current_directory, "..", ".."))
sys.path.append(target_directory)
from Infrastructure import VIZ_Visualizations


class GraphBuilder():
    """
    Builds NetworkX graphs of GLEIF legal-entity relationships stored in PostgreSQL.

    Rows are read from the ``gleif_relationship_data`` table. Each row links a
    child LEI (``startnode``) to a parent LEI (``endnode``) through one
    relationship type. The class can either load the whole table and split it
    into connected "ecosystems" (``graph_processor``) or grow a subgraph around
    a single LEI with per-node queries (``build_subgraph_for_node``).
    """

    def __init__(self , str_db_name = "GLEIF_db"):
        """
        Open the database connection and define the relationship type labels.

        :param str_db_name: Name of the PostgreSQL database to connect to.

        Connection settings can be overridden with the environment variables
        GLEIF_DB_USER, GLEIF_DB_PASSWORD, GLEIF_DB_HOST and GLEIF_DB_PORT.
        """
        # NOTE(review): the literal fallbacks keep the previous behaviour; credentials
        # should not live in source - set the environment variables and drop the fallbacks.
        self._dict_connection_params = {
            "dbname": str_db_name,
            "user": os.environ.get("GLEIF_DB_USER", "Matthew_Pisinski"),
            "password": os.environ.get("GLEIF_DB_PASSWORD", "matt1"),
            "host": os.environ.get("GLEIF_DB_HOST", "localhost"),
            "port": os.environ.get("GLEIF_DB_PORT", "5432"),
        }
        self.conn = None
        self.cursor = None
        self._connect()

        # Relationship type labels as they appear in the relationshiptype column.
        self.DIRECT = 'IS_DIRECTLY_CONSOLIDATED_BY'
        self.ULTIMATE = 'IS_ULTIMATELY_CONSOLIDATED_BY'
        self.BRANCH = 'IS_INTERNATIONAL_BRANCH_OF'
        self.SUBFUND = 'IS_SUBFUND_OF'
        self.FEEDER = 'IS_FEEDER_TO'
        self.FUND_MANAGED = 'IS_FUND-MANAGED_BY'

        # Everything that is neither a direct nor an ultimate consolidation.
        self.OTHER_RELATIONSHIPS = [self.BRANCH, self.SUBFUND, self.FEEDER, self.FUND_MANAGED]

    def _connect(self):
        """Open a fresh autocommit connection and cursor from the stored settings."""
        self.conn = psycopg2.connect(**self._dict_connection_params)
        self.conn.autocommit = True
        self.cursor = self.conn.cursor()

    def _ensure_connection(self):
        """Reconnect if the connection was never opened or has since been closed."""
        # psycopg2 exposes `closed` as 0 while the connection is open, non-zero otherwise.
        if self.conn is None or self.conn.closed:
            self._connect()

    def fetch_all_relationships(self):
        """
        Load every relationship row from ``gleif_relationship_data``.

        The connection is closed once the rows are in memory (also when the
        query fails); later queries reopen it on demand.

        :return: List of tuples (startnode, endnode, relationshiptype,
                 relationshipstatus, registrationstatus, nextrenewaldate).
        """
        self._ensure_connection()
        try:
            self.cursor.execute("""
            SELECT startnode, endnode, relationshiptype, relationshipstatus, registrationstatus, nextrenewaldate
            FROM gleif_relationship_data
            """)
            list_relationships = self.cursor.fetchall()
        finally:
            self.conn.close()

        return list_relationships

    def filter_relationships_to_include(self , list_relationships , str_relationships_to_include = None):
        """
        Split relationship tuples into direct, ultimate and other (fund/branch) lists.

        :param list_relationships: Relationship tuples; index 2 holds the relationship type.
        :param str_relationships_to_include: "Only_Company" keeps direct and ultimate rows,
               "Only_Fund" keeps only the other relationship types, any other value keeps all.
        :return: Tuple (direct, ultimate, other) of lists. Rows whose type is not one of
                 the known labels are dropped.
        """
        bool_keep_company = str_relationships_to_include != "Only_Fund"
        bool_keep_other = str_relationships_to_include != "Only_Company"

        list_direct_relationships = []
        list_ultimate_relationships = []
        list_other_relationships = []

        for tuple_relationship in list_relationships:
            str_type = tuple_relationship[2]
            if str_type == self.DIRECT:
                if bool_keep_company:
                    list_direct_relationships.append(tuple_relationship)
            elif str_type == self.ULTIMATE:
                if bool_keep_company:
                    list_ultimate_relationships.append(tuple_relationship)
            elif str_type in self.OTHER_RELATIONSHIPS:
                if bool_keep_other:
                    list_other_relationships.append(tuple_relationship)

        return list_direct_relationships , list_ultimate_relationships , list_other_relationships

    def initialize_node_attributes(self , list_relationships , G_overall):
        """
        Ensure both endpoints of every relationship exist in ``G_overall`` with empty
        ``child_relationships`` / ``parent_relationships`` dictionaries.

        Nodes that are already present are left untouched.

        :return: The same graph object, for chaining.
        """
        for tuple_relationship in list_relationships:
            # Index 0 is the child LEI, index 1 the parent LEI.
            for str_node in (tuple_relationship[0], tuple_relationship[1]):
                if str_node not in G_overall.nodes:
                    G_overall.add_node(str_node, child_relationships={}, parent_relationships={})

        return G_overall

    def create_relationship_attribute_dict(self , str_relationship_type, str_relationship_status, str_registration_status, str_next_renewal_date):
        """
        Creates a dictionary containing all attributes of a relationship.
        """
        return {'relationship_type': str_relationship_type, 'relationship_status': str_relationship_status,
            'registration_status': str_registration_status, 'next_renewal_date': str_next_renewal_date
        }

    def _record_relationship(self , G_overall , tuple_rel):
        """
        Add the child->parent edge to ``G_overall`` and append the relationship's
        attribute dictionary to both endpoints' bookkeeping dictionaries.

        Both nodes must already carry the dictionaries created by
        ``initialize_node_attributes``.
        """
        str_child_lei, str_parent_lei, str_relationship_type, str_relationship_status, str_registration_status, str_next_renewal_date = tuple_rel

        G_overall.add_edge(str_child_lei, str_parent_lei)

        relationship_dict = self.create_relationship_attribute_dict(
            str_relationship_type = str_relationship_type, str_relationship_status = str_relationship_status,
            str_registration_status = str_registration_status, str_next_renewal_date = str_next_renewal_date)

        # The same dictionary object is stored on both sides of the relationship.
        G_overall.nodes[str_child_lei]['parent_relationships'].setdefault(str_parent_lei, []).append(relationship_dict)
        G_overall.nodes[str_parent_lei]['child_relationships'].setdefault(str_child_lei, []).append(relationship_dict)

    def process_direct_relationships(self , list_direct_relationships , G_overall , G_direct):
        """
        Add direct-consolidation relationships to both the overall graph and the
        direct-only graph (the latter is used later to judge ultimate relationships).

        :return: Tuple (G_overall, G_direct).
        """
        for tuple_rel in list_direct_relationships:
            str_child_lei, str_parent_lei = tuple_rel[0], tuple_rel[1]
            # add_edge creates missing nodes in G_direct on its own.
            G_direct.add_edge(str_child_lei, str_parent_lei)
            self._record_relationship(G_overall = G_overall , tuple_rel = tuple_rel)

        return G_overall , G_direct

    def process_other_relationships(self , list_other_relationships , G_overall):
        """
        Add the non-direct, non-ultimate relationships to the overall graph only.

        :return: The updated overall graph.
        """
        for tuple_rel in list_other_relationships:
            self._record_relationship(G_overall = G_overall , tuple_rel = tuple_rel)

        return G_overall

    def process_ultimate_relationships(self , list_ultimate_relationships , G_overall , G_direct):
        """
        Add ultimate-consolidation relationships that are not already implied by a
        chain of direct relationships.

        An ultimate relationship is recorded when the identical child->parent edge
        exists in ``G_direct`` (the direct and ultimate parent coincide) or when
        ``G_direct`` contains no path at all from the child to the ultimate parent.
        It is skipped when the parent is reachable only through a multi-hop direct chain.

        :return: The updated overall graph.
        """
        for tuple_rel in list_ultimate_relationships:
            str_child_lei, str_ultimate_parent_lei = tuple_rel[0], tuple_rel[1]

            # has_path raises if either node is missing, so register both first.
            G_direct.add_node(str_child_lei)
            G_direct.add_node(str_ultimate_parent_lei)

            if (G_direct.has_edge(str_child_lei, str_ultimate_parent_lei)
                    or not nx.has_path(G_direct, str_child_lei, str_ultimate_parent_lei)):
                self._record_relationship(G_overall = G_overall , tuple_rel = tuple_rel)

        return G_overall

    def build_graph(self, list_relationships , str_relationships_to_include = None):
        """
        Build the overall directed graph (edges point child -> parent).

        :param list_relationships: Relationship tuples as returned by ``fetch_all_relationships``.
        :param str_relationships_to_include: Filter mode, see ``filter_relationships_to_include``.
        :return: The overall ``nx.DiGraph`` with relationship bookkeeping on its nodes.
        """
        G_overall = nx.DiGraph()
        G_direct = nx.DiGraph()

        list_direct_relationships, list_ultimate_relationships, list_other_relationships = self.filter_relationships_to_include(
            list_relationships = list_relationships , str_relationships_to_include = str_relationships_to_include)

        # NOTE(review): nodes are initialised from the *unfiltered* list, so entities that only
        # appear in filtered-out relationships still show up as isolated nodes - confirm intended.
        G_overall = self.initialize_node_attributes(list_relationships = list_relationships , G_overall = G_overall)

        if list_direct_relationships:
            G_overall , G_direct = self.process_direct_relationships(G_direct = G_direct , G_overall = G_overall , list_direct_relationships = list_direct_relationships)

        if list_other_relationships:
            G_overall = self.process_other_relationships(list_other_relationships = list_other_relationships , G_overall = G_overall)

        # Ultimate relationships go last: they are judged against the complete direct graph.
        if list_ultimate_relationships:
            G_overall = self.process_ultimate_relationships(list_ultimate_relationships = list_ultimate_relationships , G_overall = G_overall , G_direct = G_direct)

        return G_overall

    def extract_company_ecosystems(self , G_overall):
        """
        Extracts ecosystems (connected components) from the graph.

        :param G_overall: The directed graph to extract ecosystems from
        :return: A list of subgraph copies, each representing a company ecosystem
        """
        # Weakly connected components of a directed graph are exactly the connected
        # components of its undirected version, without copying the whole graph first.
        return [
            G_overall.subgraph(set_component).copy()
            for set_component in nx.weakly_connected_components(G_overall)
        ]

    def create_leis_to_graph(self , list_ecosystems):
        """
        Creates a dictionary mapping each LEI (node) to its corresponding ecosystem graph.

        :param list_ecosystems: List of NetworkX DiGraph objects, each representing a company ecosystem.
        :return: Dictionary with LEI as keys and their corresponding DiGraph as values.
        """
        return {
            node: graph
            for graph in list_ecosystems
            for node in graph.nodes()
        }

    def graph_processor(self , include_company_relationships = True, include_fund_relationships = True):
        """
        Main function to build the graph and extract company ecosystems.

        :param include_company_relationships: Boolean flag to include company relationships (direct and ultimate).
        :param include_fund_relationships: Boolean flag to include fund relationships.
        :return: List of company ecosystem subgraphs and mapping from LEI to graphs.
                 Both are empty when neither relationship family is requested.
        """
        if not include_company_relationships and not include_fund_relationships:
            # Nothing was asked for, so there is nothing to load or build.
            return [], {}

        # Exactly one mode is chosen; the graph is built once.
        if not include_company_relationships:
            str_mode = "Only_Fund"
        elif not include_fund_relationships:
            str_mode = "Only_Company"
        else:
            str_mode = "All"

        list_relationships = self.fetch_all_relationships()
        G_overall = self.build_graph(list_relationships = list_relationships , str_relationships_to_include = str_mode)

        # Extract company ecosystems
        list_ecosystems = self.extract_company_ecosystems(G_overall = G_overall)

        dict_leis_to_graph = self.create_leis_to_graph(list_ecosystems = list_ecosystems)

        return list_ecosystems, dict_leis_to_graph

    def build_subgraph_for_node(self, lei):
        """
        Build a subgraph for a given LEI, handling direct and ultimate relationships appropriately.

        Direct relationships are followed upwards (child -> parent) breadth-first with one
        query per visited node. Afterwards, for every node found, an ultimate relationship
        is added only if the ultimate parent is not already reachable in the subgraph.

        :param lei: LEI to start from.
        :return: ``nx.DiGraph`` whose edges carry a ``relationship_type`` attribute.
        """
        from collections import deque

        # The connection may have been closed by fetch_all_relationships().
        self._ensure_connection()

        str_query = """
            SELECT startnode, endnode, relationshiptype
            FROM gleif_relationship_data
            WHERE startnode = %s AND relationshiptype = %s
        """

        G_sub = nx.DiGraph()
        visited = set()
        queue = deque([lei])

        # First process direct relationships
        while queue:
            current_lei = queue.popleft()
            if current_lei in visited:
                continue
            visited.add(current_lei)
            G_sub.add_node(current_lei)

            self.cursor.execute(str_query, (current_lei, self.DIRECT))
            for startnode, endnode, rel_type in self.cursor.fetchall():
                G_sub.add_edge(startnode, endnode, relationship_type=rel_type)
                if endnode not in visited:
                    queue.append(endnode)

        # Now process ultimate relationships for the nodes found so far
        # (snapshot the node list: the loop may add new ultimate-parent nodes).
        for node in list(G_sub.nodes):
            self.cursor.execute(str_query, (node, self.ULTIMATE))
            for startnode, endnode, rel_type in self.cursor.fetchall():
                # nx.has_path raises NodeNotFound for a target that is not in the graph,
                # so an unseen ultimate parent is treated as "no path yet".
                if endnode not in G_sub or not nx.has_path(G_sub, startnode, endnode):
                    G_sub.add_edge(startnode, endnode, relationship_type=rel_type)

        return G_sub


In [None]:
# Instantiating GraphBuilder opens a PostgreSQL connection to the default "GLEIF_db" database.
obj_graph_builder = GraphBuilder()
# graph_processor() loads the relationship table and returns
# (list of ecosystem subgraphs, dict mapping each LEI to the ecosystem subgraph containing it).
list_ecosystems, dict_leis_to_graph = obj_graph_builder.graph_processor()

In [5]:
# Stand-alone connection for the exploratory queries in the following cells.
# Settings can be overridden through environment variables so credentials do not
# have to live in the notebook.
# TODO: drop the literal fallbacks once the environment variables are set.
conn = psycopg2.connect(
    dbname=os.environ.get("GLEIF_DB_NAME", "GLEIF_db"),
    user=os.environ.get("GLEIF_DB_USER", "Matthew_Pisinski"),
    password=os.environ.get("GLEIF_DB_PASSWORD", "matt1"),
    host=os.environ.get("GLEIF_DB_HOST", "localhost"),
    port=os.environ.get("GLEIF_DB_PORT", "5432"),
)
# Autocommit: every statement is committed immediately, no explicit transaction handling.
conn.autocommit = True
cursor = conn.cursor()

Trying to figure out how to build a graph dynamically from the GLEIF relationship data.

In [None]:
def fetch_node_data(cursor , str_start_node):
    """
    Fetch every relationship row whose start node is the given LEI.

    :param cursor: Open DB-API cursor on the database holding gleif_relationship_data.
    :param str_start_node: LEI to match against the startnode column.
    :return: List of row tuples (all columns of the table), empty if nothing matches.
    """
    # Plain string on purpose: the value is bound by the driver through %s,
    # never interpolated into the SQL text.
    query = "SELECT * FROM gleif_relationship_data WHERE startnode = %s"
    cursor.execute(query, (str_start_node,))

    return cursor.fetchall()

In [6]:
# Look up all relationships that start at one sample LEI.
rows = fetch_node_data(cursor = cursor , str_start_node = "010PWNH4K3BLIC3I7R03")
# Show the raw row tuples returned by the query.
display(rows)

[(12,
  '010PWNH4K3BLIC3I7R03',
  '549300COKYB5EGSU1838',
  'IS_DIRECTLY_CONSOLIDATED_BY',
  'ACTIVE',
  'LAPSED',
  '2024-10-07T17:03:24+02:00'),
 (13,
  '010PWNH4K3BLIC3I7R03',
  '549300B2Q47IR0CR5B54',
  'IS_ULTIMATELY_CONSOLIDATED_BY',
  'ACTIVE',
  'LAPSED',
  '2024-10-07T17:03:24+02:00')]

Experimenting with Neo4j to get it running and confirm the connection works.

In [2]:
import os

from neo4j import GraphDatabase

# Connection details. Each can be overridden through an environment variable so the
# password does not have to live in the notebook.
# TODO: drop the literal fallbacks once the environment variables are set.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Adjust if Neo4j is on a different host/port
username = os.environ.get("NEO4J_USER", "neo4j")  # Your Neo4j username
password = os.environ.get("NEO4J_PASSWORD", "gradient")  # Your Neo4j password

# Create the driver. It is left open for later cells; call driver.close() when finished.
driver = GraphDatabase.driver(uri, auth=(username, password))

# Connect to the test database
with driver.session(database="gleifcompanygraphstestdb") as session:
    # Smoke-test query. MERGE (rather than CREATE) reuses the existing test node,
    # so re-running this cell does not pile up duplicate TestNode entries.
    session.run("MERGE (n:TestNode {name: 'Test'})")
    print("Connected to the test database and created a test node!")

Connected to the test database and created a test node!
