This Jupyter notebook contains the front-end code for the GLEIF pipeline. It will allow people to easily access the data programmatically and will most likely be used 

In [None]:
import pandas as pd
import sqlite3
import json
import pickle
import flatdict
import networkx as nx
import re
from rapidfuzz import process, fuzz
from collections import defaultdict
import string
import time
import sys
import os

# Make the directory three levels above the current working directory
# importable, then import the project-local visualisation module from it.
# NOTE(review): this relies on the notebook being launched from its own
# folder; presumably D_Infastructure lives three levels up - confirm.
current_directory = os.getcwd()
target_directory = os.path.abspath(os.path.join(current_directory, "..", "..", ".."))
sys.path.append(target_directory)
from D_Infastructure import VIZ_Visualizations

In [2]:
# Temporary data loading: these objects are meant to come from a database
# later, but for now they are read from pickle files on disk.
# NOTE(review): the paths are absolute paths on one developer machine, so this
# cell only runs there. pickle.load executes arbitrary code embedded in the
# file, so only load pickles produced by a trusted pipeline.

# Graph lookup table (indexed by LEI strings in the example cells further down).
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_graph.pickle" , "rb") as file:
    dict_leis_graph = pickle.load(file)
    
# Level 1 data; passed to GLEIFAnalysis.search_node, which reads its 'data' column.
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\pickled_objs\df_level_1_data.pickle" , "rb") as file:
    df_level_1_data = pickle.load(file)
    
# Mapping whose keys are used as the list of searchable company names.
with open(r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\pickled_objs\dict_company_names_leis.pickle" , "rb") as file:
    dict_company_names_leis = pickle.load(file)

In [3]:
class GLEIFCompanySearch:
    """
    Fuzzy company-name search backed by a prefix index.

    Names are normalised with `preprocess_name`, bucketed by their first
    `index_length` characters in `create_company_index`, and a query is then
    scored with RapidFuzz only against the names in the matching bucket(s).
    """

    # Whole-word abbreviations (lowercase) and their expansions.
    # NOTE: 'ag' is also listed in _SUFFIXES, so a standalone "ag" is stripped
    # by preprocess_name before this mapping is ever consulted.
    _ABBREVIATIONS = {
        'int': 'international',
        'intl': 'international',
        'tech': 'technology',
        'svc': 'services',
        'mfg': 'manufacturing',
        'mktg': 'marketing',
        'hvac': 'heating ventilation and air conditioning',
        'ag': 'agriculture',
        'ai': 'artificial intelligence',
        # Add more as needed
    }

    # Legal-form words removed from names (whole words, any position, any case).
    _SUFFIXES = (
        'inc', 'llc', 'ltd', 'corp', 'corporation', 'incorporated',
        'co', 'company', 'plc', 'limited', 'gmbh', 'sa', 'bv', 'ag', 'oy', 'llp', 'lp',
    )

    # Built once at class creation: preprocess_name runs for every indexed
    # name, so rebuilding these per call is wasted work.
    _SUFFIX_PATTERN = re.compile(r'\b(?:' + '|'.join(_SUFFIXES) + r')\b\.?', flags=re.IGNORECASE)
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def expand_abbreviations(self, name):
        """
        Expand common abbreviations in the company name.

        The name is split on whitespace and every word that exactly matches a
        (lowercase) key of the abbreviation table is replaced by its expansion.
        Words are re-joined with single spaces, so runs of whitespace collapse.

        Args:
            name (str): Company name, expected to be lowercase already.

        Returns:
            str: The name with known abbreviations expanded.
        """
        return ' '.join(self._ABBREVIATIONS.get(word, word) for word in name.split())

    def preprocess_name(self, name):
        """
        Normalise a company name for indexing and matching.

        Steps, in order:
        - Remove legal-form words (inc, ltd, gmbh, ...) wherever they appear
          as whole words, together with one directly following dot.
        - Remove all ASCII punctuation.
        - Convert to lowercase and strip surrounding whitespace.
        - Expand abbreviations.

        Args:
            name (str): Raw company name.

        Returns:
            str: The normalised name; may be empty (e.g. for "Inc.").
        """
        name = self._SUFFIX_PATTERN.sub('', name)
        name = name.translate(self._PUNCTUATION_TABLE)
        name = name.lower().strip()
        return self.expand_abbreviations(name)

    def create_company_index(self, list_company_names, index_length=3):
        """
        Preprocess the company names and create an index based on the first `index_length` characters.

        Names that are not strings, or that are empty after preprocessing, are
        skipped and therefore appear in none of the returned structures.

        Args:
            list_company_names (list): List of official company names.
            index_length (int): Number of characters to use for indexing (default is 3).

        Returns:
            original_names (list): Original names that were kept.
            preprocessed_names (list): Preprocessed names, aligned with original_names.
            index (dict): Maps each prefix to the positions (in the two lists
                above) of the names starting with it. Names shorter than
                `index_length` are keyed by the whole preprocessed name.
        """
        index = defaultdict(list)
        original_names = []
        preprocessed_names = []

        for idx, name in enumerate(list_company_names):
            # Progress tracking for large datasets (counts every input row,
            # including the ones skipped below).
            if (idx + 1) % 100000 == 0:
                print(f"Processed {idx + 1} company names.")

            if not isinstance(name, str):
                continue  # e.g. None / NaN keys cannot be preprocessed
            preprocessed = self.preprocess_name(name)
            if not preprocessed:
                continue  # Skip empty strings after preprocessing

            # Position of this name in the output lists, not in the input list.
            index[preprocessed[:index_length]].append(len(preprocessed_names))
            original_names.append(name)
            preprocessed_names.append(preprocessed)

        return original_names, preprocessed_names, index

    def _candidate_indices(self, processed_query, index, index_length):
        """
        Return the positions of the indexed names worth scoring for a query.

        The bucket for the query's full prefix is used when it exists.
        Otherwise the prefix is shortened one character at a time and every
        bucket whose key starts with the shortened prefix is merged. The index
        keys are full-length prefixes, so an exact lookup of a shorter prefix
        would only ever hit names that are themselves that short.
        """
        prefix = processed_query[:index_length]
        exact_bucket = index.get(prefix)
        if exact_bucket:
            return exact_bucket

        if len(prefix) == index_length:
            # A full-length prefix has just been tried; start one shorter.
            prefix = prefix[:-1]

        while prefix:
            merged = sorted(
                position
                for key, bucket in index.items()
                if key.startswith(prefix)
                for position in bucket
            )
            if merged:
                return merged
            prefix = prefix[:-1]

        return []

    def search_companies(self, user_input, original_names, preprocessed_names, index, limit=10, index_length=3):
        """
        Search for companies matching the user_input.

        Args:
            user_input (str): The input company name to search for.
            original_names (list): List of official company names.
            preprocessed_names (list): List of preprocessed company names.
            index (dict): The company index created by create_company_index.
            limit (int): The number of top matches to return.
            index_length (int): The length of the prefix used for indexing;
                must match the value given to create_company_index.

        Returns:
            list of tuples: Each tuple contains (original_company_name, score).
            Empty when the input is blank or no indexed name shares even the
            first character of the preprocessed input.
        """
        if not user_input or not user_input.strip():
            return []

        user_input_processed = self.preprocess_name(user_input)
        possible_indices = self._candidate_indices(user_input_processed, index, index_length)

        if not possible_indices:
            # The whole dataset is deliberately not searched as a last resort:
            # with millions of names that would be too slow.
            print("No matches found within the indexed prefixes. Consider refining your search or indexing strategy.")
            return []

        # Score only the candidate names.
        choices = [preprocessed_names[i] for i in possible_indices]
        matches = process.extract(
            user_input_processed,
            choices,
            scorer=fuzz.token_set_ratio,
            limit=limit
        )

        # process.extract reports positions within `choices`; translate them
        # back to the original (un-preprocessed) names.
        return [
            (original_names[possible_indices[match_idx]], score)
            for _match, score, match_idx in matches
        ]

    

In [4]:
# All searchable company names: the keys of the name mapping loaded above.
list_company_names = list(dict_company_names_leis)

In [None]:
#This is what the class will look like
class GLEIFAnalysis:
    """
    Front end for querying the pickled GLEIF relationship graphs.

    An instance loads one lookup table (`dict_leis_graph`) that maps a key to
    the graph containing that node, and offers name search, node lookup and
    path checks on top of it.
    """

    # NOTE(review): absolute paths on one developer machine; to be replaced by
    # a database / configurable location.
    _GRAPH_PICKLE_PATHS = {
        "every_relationship_type": r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_graph.pickle",
        "only_companies": r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_graph_company_only.pickle",
        "only_funds": r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\dict_leis_to_fund_company_only.pickle",
    }

    def __init__(self, bool_every_relationship_type=False, bool_only_companies=False, bool_only_funds=False) -> None:
        """
        Load the graph lookup table selected by the flags.

        @param: bool_every_relationship_type - load the graphs with every relationship type.
        @param: bool_only_companies - load the company-only graphs.
        @param: bool_only_funds - load the fund graphs.

        If several flags are set, the precedence is funds, then companies,
        then every-relationship. This gives the same final state as loading
        them one after another, but reads only one file. If no flag is set,
        no graph data is loaded: `company_search` still works, while the
        graph-based methods raise RuntimeError.
        """
        self.bool_every_relationship_type = bool_every_relationship_type
        self.bool_only_companies = bool_only_companies
        self.bool_only_funds = bool_only_funds

        self.dict_leis_graph = None
        self._company_index_cache = None

        if bool_only_funds:
            str_graph_key = "only_funds"
        elif bool_only_companies:
            str_graph_key = "only_companies"
        elif bool_every_relationship_type:
            str_graph_key = "every_relationship_type"
        else:
            str_graph_key = None

        if str_graph_key is not None:
            self.dict_leis_graph = self.load_graph_data(self._GRAPH_PICKLE_PATHS[str_graph_key])

    def load_graph_data(self, str_file_path):
        """
        Unpickle and return the object stored at `str_file_path`.

        SECURITY: pickle.load runs arbitrary code embedded in the file; only
        use it on files produced by a trusted pipeline.
        """
        with open(str_file_path, "rb") as file:
            return pickle.load(file)

    def _require_graph_data(self):
        """Return the loaded graph lookup table or raise if none was loaded."""
        dict_graphs = getattr(self, "dict_leis_graph", None)
        if dict_graphs is None:
            raise RuntimeError(
                "No graph data is loaded; construct GLEIFAnalysis with one of the "
                "bool_* graph flags set to True."
            )
        return dict_graphs

    def company_search(self, str_company_name):
        """
        Fuzzy-search the module-level `list_company_names` for a company name.

        The name index is built on first use and reused afterwards. Building
        it preprocesses every company name, so doing that per query is far too
        slow on a large list. It is rebuilt when `list_company_names` is
        rebound or changes length.

        @param: str_company_name - free-text company name to look for.
        @return: list of (official_company_name, score) tuples, at most 100.
        """
        list_names = list_company_names
        cache = getattr(self, "_company_index_cache", None)
        if cache is None or cache[0] is not list_names or cache[1] != len(list_names):
            obj_gleif_company_search = GLEIFCompanySearch()
            tuple_index_data = obj_gleif_company_search.create_company_index(list_names, index_length=3)
            cache = (list_names, len(list_names), obj_gleif_company_search, tuple_index_data)
            self._company_index_cache = cache

        obj_gleif_company_search = cache[2]
        list_original_names, list_preprocessed_names, index = cache[3]
        return obj_gleif_company_search.search_companies(
            str_company_name, list_original_names, list_preprocessed_names, index, limit=100, index_length=3
        )

    def helper_interactive_graph_generator(self, graph):
        """
        Function used to automatically generate a graph for a company ecosystem when queried.

        Renders `graph` to HTML via the project's Visualizations helper and
        opens it in the browser.
        """
        obj_Visualizations = VIZ_Visualizations.Visualizations()
        str_html_content = obj_Visualizations.visualize_graph_interactive_html(G=graph)
        obj_Visualizations.display_html_in_browser(html_content=str_html_content)

    def helper_get_node_attributes(self, graph, str_node):
        """Return the attribute mapping stored on node `str_node` of `graph`."""
        return graph.nodes[str_node]

    def _lookup_level_1_record(self, df_level_1_data, str_company_name):
        """
        Return the parsed level 1 record for an identifier, or None if absent.

        Rows whose 'data' JSON text mentions the identifier are candidates.
        The identifier can also occur inside another entity's record, so the
        candidate whose own top-level LEI field equals it is preferred, and
        the first candidate is the fallback.
        # assumes records look like {'LEI': {'$': <lei>}, ...} as in the sample output - TODO confirm
        """
        # regex=False: the identifier is a literal, not a pattern.
        mask = df_level_1_data['data'].str.contains(str_company_name, na=False, regex=False)
        dict_first_candidate = None
        for str_raw_record in df_level_1_data.loc[mask, 'data']:
            dict_record = json.loads(str_raw_record)
            if dict_first_candidate is None:
                dict_first_candidate = dict_record
            lei_field = dict_record.get('LEI') if isinstance(dict_record, dict) else None
            if isinstance(lei_field, dict) and lei_field.get('$') == str_company_name:
                return dict_record
        return dict_first_candidate

    def search_node(self, df_level_1_data, str_company_name, bool_display_results=False, bool_return_results=False, bool_all_output=False, bool_node_data=False, bool_single_lei_data=False, bool_lei_neighbor_data=False, bool_lei_graph=False):
        """
        Look up one node and display and/or return the requested information.

        @param: df_level_1_data - DataFrame whose 'data' column holds one JSON string per entity.
        @param: str_company_name - identifier of the node to query. It is used as the key into the
                graph lookup table and as the node id inside the graph; the notebook's example
                calls pass LEI codes here.
        @param: bool_display_results - display the gathered data as it is collected.
        @param: bool_return_results - return the gathered data as a dict (otherwise None is returned).
        @param: bool_all_output - shortcut that switches on node data, meta data, neighbor data and the graph.
        @param: bool_node_data - collect the attributes stored on the node itself.
        @param: bool_single_lei_data - collect the level 1 meta data record of the queried entity.
        @param: bool_lei_neighbor_data - collect the direct neighbors of the node within its graph.
        @param: bool_lei_graph - open the interactive visualisation of the node's graph.

        @return: dict with any of the keys 'company_meta_data' (None when no record was found),
                 'node_attributes', 'neighbors' if bool_return_results is set, else None.

        @raise: RuntimeError if no graph data was loaded; KeyError if the node is unknown.
        """
        dict_graphs = self._require_graph_data()

        if bool_all_output:
            bool_node_data = True
            bool_single_lei_data = True
            bool_lei_neighbor_data = True
            bool_lei_graph = True

        try:
            graph = dict_graphs[str_company_name]
        except KeyError:
            # Raise instead of sys.exit(): exiting would terminate the caller
            # (with a success status) for what is an ordinary lookup failure.
            raise KeyError(f"Company not found in database: {str_company_name!r}") from None

        dict_results = {}  # Dictionary to store results

        if bool_single_lei_data:
            dict_company_meta_data = self._lookup_level_1_record(df_level_1_data, str_company_name)
            if dict_company_meta_data is None:
                print(f"No meta data found for {str_company_name}")
            elif bool_display_results:
                print(f"Showing meta data for {str_company_name}")
                display(dict_company_meta_data)
            dict_results['company_meta_data'] = dict_company_meta_data

        if bool_node_data:
            dict_attributes = self.helper_get_node_attributes(graph=graph, str_node=str_company_name)
            if bool_display_results:
                print(f"Displaying attribute data stored at the {str_company_name} node:")
                display(dict_attributes)
            dict_results['node_attributes'] = dict_attributes

        if bool_lei_neighbor_data:
            list_neighbors = list(graph.neighbors(str_company_name))
            dict_results["neighbors"] = list_neighbors
            if bool_display_results:
                print(f"List of neighboring nodes for node {str_company_name}:")
                display(list_neighbors)
                for neighbor in list_neighbors:
                    print(f"Displaying attribute data for {neighbor} node:")
                    print(self.helper_get_node_attributes(graph=graph, str_node=neighbor))

        if bool_lei_graph:
            self.helper_interactive_graph_generator(graph=graph)

        if bool_return_results:
            return dict_results
        return None

    def check_if_related(self, list_nodes, bool_return_path=False):
        """
        Check whether consecutive nodes in `list_nodes` are connected.

        The graph of the first node is used. For every consecutive pair a
        shortest path is looked up, and the per-pair paths are chained into
        one path that visits the nodes in the given order.

        @param: list_nodes - ordered, non-empty sequence of node identifiers.
        @param: bool_return_path - return the chained path instead of None.

        @return: the chained path (list) if bool_return_path is set and every pair is connected;
                 None otherwise, including when some pair has no path.

        @raise: ValueError if list_nodes is empty; RuntimeError if no graph data was loaded;
                KeyError if a node is neither in the first node's graph nor in the lookup table.
        """
        if not list_nodes:
            raise ValueError("list_nodes must contain at least one node")

        dict_graphs = self._require_graph_data()
        str_not_found = "One or more of the nodes which were inputted do not exist in our database!"

        try:
            graph = dict_graphs[list_nodes[0]]
        except KeyError:
            raise KeyError(str_not_found) from None

        list_outside_graph = [node for node in list_nodes if node not in graph]
        if any(node not in dict_graphs for node in list_outside_graph):
            raise KeyError(str_not_found)
        if list_outside_graph:
            # Known nodes living in a different graph cannot be connected to
            # the first node. Handling this here also keeps networkx from
            # raising NodeNotFound below.
            print("There is no path between this group of nodes")
            return None

        list_full_path = []
        for source, target in zip(list_nodes, list_nodes[1:]):
            if not nx.has_path(graph, source, target):
                print("There is no path between this group of nodes")
                return None
            # Drop each segment's last node; it is the next segment's first.
            list_full_path.extend(nx.shortest_path(graph, source=source, target=target)[:-1])
        list_full_path.append(list_nodes[-1])

        print(f"There is a path between this group of nodes: {list_full_path}")
        if bool_return_path:
            return list_full_path
        return None

In [None]:
# Disabled example (kept as a string literal, so nothing runs): checks whether
# two LEIs are connected in the every-relationship graph.
"""nodes = ["2138003BXMFSMKRSAA14", "254900ZVA3CKR9KO3K08"]
obj_GLIEF_Analysis = GLEIFAnalysis(bool_every_relationship_type = True)
obj_GLIEF_Analysis.check_if_related(list_nodes = nodes)"""

In [None]:
#display(dict_leis_graph)
# Disabled exploration snippet (string literal, not executed).
"""graph = dict_leis_graph["YXV7SD2336DP03R26H81"]
neighbors = list(graph.neighbors("5299009PD6KJQQD9BQ74"))
display(neighbors)
nodes_list = list(graph.nodes())
display(nodes_list)"""
# Check whether two LEIs are connected in the every-relationship graph.
nodes = ["5299009PD6KJQQD9BQ74", "549300RUNVJBS1PI2Y96"]
obj_GLIEF_Analysis = GLEIFAnalysis(bool_every_relationship_type = True)
obj_GLIEF_Analysis.check_if_related(list_nodes = nodes)
str_node = "5299009PD6KJQQD9BQ74"

# bool_return_results must be set, otherwise search_node returns None and
# `attributes` would never hold the gathered data.
attributes = obj_GLIEF_Analysis.search_node(df_level_1_data = df_level_1_data , bool_lei_neighbor_data = True, bool_node_data = True , str_company_name = str_node , bool_display_results = True , bool_return_results = True)

# Fuzzy name search over the loaded company names.
matches = obj_GLIEF_Analysis.company_search("Amazon")
display(matches)

In [None]:
# Open the interactive graph for one LEI, using the company-only graph data.
obj_GLIEF_Analysis = GLEIFAnalysis(bool_only_companies = True)
obj_GLIEF_Analysis.search_node(df_level_1_data = df_level_1_data , bool_lei_graph = True, str_company_name = "5299009PD6KJQQD9BQ74")

In [None]:
# Negative example: "hjfhj" is a made-up identifier, so this lookup is
# expected to fail with the company-not-found error path of search_node.
obj_GLIEF_Analysis = GLEIFAnalysis(bool_only_companies = True)
obj_GLIEF_Analysis.search_node(df_level_1_data = df_level_1_data , bool_single_lei_data = True , str_company_name = "hjfhj")

In [6]:
# Full output (meta data, node attributes, neighbors, graph) for one LEI.
# bool_return_results is required for dict_results to receive the data;
# without it search_node returns None.
obj_GLIEF_Analysis = GLEIFAnalysis(bool_every_relationship_type= True)
dict_results = obj_GLIEF_Analysis.search_node(df_level_1_data = df_level_1_data , bool_display_results= True, bool_all_output = True , bool_return_results = True , str_company_name = "ZXTILKJKG63JELOEG630")

Showing meta data for ZXTILKJKG63JELOEG630


{'LEI': {'$': 'ZXTILKJKG63JELOEG630'},
 'Entity': {'LegalName': {'@xml:lang': 'en', '$': 'AMAZON.COM, INC.'},
  'LegalAddress': {'@xml:lang': 'en',
   'FirstAddressLine': {'$': 'C/O CORPORATION SERVICE COMPANY'},
   'AdditionalAddressLine': [{'$': '251 LITTLE FALLS DRIVE'}],
   'City': {'$': 'WILMINGTON'},
   'Region': {'$': 'US-DE'},
   'Country': {'$': 'US'},
   'PostalCode': {'$': '19808'}},
  'HeadquartersAddress': {'@xml:lang': 'en',
   'FirstAddressLine': {'$': '410 Terry Ave North'},
   'City': {'$': 'Seattle'},
   'Region': {'$': 'US-WA'},
   'Country': {'$': 'US'},
   'PostalCode': {'$': '98109'}},
  'RegistrationAuthority': {'RegistrationAuthorityID': {'$': 'RA000602'},
   'RegistrationAuthorityEntityID': {'$': '2620453'}},
  'LegalJurisdiction': {'$': 'US-DE'},
  'EntityCategory': {'$': 'GENERAL'},
  'LegalForm': {'EntityLegalFormCode': {'$': 'XTIQ'}},
  'EntityStatus': {'$': 'ACTIVE'},
  'EntityCreationDate': {'$': '1996-05-28T00:00:00.000Z'}},
 'Registration': {'InitialReg

Displaying attribute data stored at the ZXTILKJKG63JELOEG630 node:


{'child_relationships': {'5493006K9CUYTRQ9Z556': [{'relationship_type': 'IS_DIRECTLY_CONSOLIDATED_BY',
    'relationship_status': 'ACTIVE',
    'registration_status': 'PUBLISHED',
    'initial_registration_date': '2012-06-06T15:52:00.000Z',
    'last_update_date': '2024-10-08T17:44:06.864Z',
    'next_renewal_date': '2025-10-01T12:41:06.314Z'},
   {'relationship_type': 'IS_ULTIMATELY_CONSOLIDATED_BY',
    'relationship_status': 'ACTIVE',
    'registration_status': 'PUBLISHED',
    'initial_registration_date': '2012-06-06T15:52:00.000Z',
    'last_update_date': '2024-10-08T17:44:06.864Z',
    'next_renewal_date': '2025-10-01T12:41:06.314Z'}],
  '549300CLGXJ3ZDL9PZ27': [{'relationship_type': 'IS_DIRECTLY_CONSOLIDATED_BY',
    'relationship_status': 'ACTIVE',
    'registration_status': 'PUBLISHED',
    'initial_registration_date': '2022-11-25T00:00:00Z',
    'last_update_date': '2024-09-16T11:31:00Z',
    'next_renewal_date': '2024-11-22T09:13:00Z'},
   {'relationship_type': 'IS_ULTIMATEL

List of neighboring nodes for neighboring node ZXTILKJKG63JELOEG630:


[]

Interactive graph 'Interactive_Graph' generated successfully.


What's left:
1. Cleaning/modularizing the code as much as possible so we can use it anywhere
- Clean the graph builder
- Modularize the code in this jupyter notebook as much as possible
- For the graph loader, add checks for if the graph is too large
2. Renaming the LEI IDs to be the company names as listed on GLEIF
3. Updating the data
4. Unit Tests
5. Docs
6. Code Review
7. Get it front ended and working as a functioning pipeline in the server

Some GPT-generated code

import re
from rapidfuzz import process, fuzz
from collections import defaultdict
import string