In [None]:
from bs4 import BeautifulSoup
import os
import requests
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sqlite3
import zipfile
import pandas as pd
import json
import pickle
import bigjson
import json
import sys
current_directory = os.getcwd()
target_directory = os.path.abspath(os.path.join(current_directory, "..", "..", ".."))
sys.path.append(target_directory)
from D_Infastructure import System_Helpers

class GLEIF_Backill_Helpers:
    """
    Helpers for scraping the GLEIF golden-copy download link, downloading and
    unpacking the archive, and staging / exporting the records with SQLite.

    The three boolean flags select which download button(s) are clicked on the
    golden-copy page before the page source is scraped for the link.
    """

    # Defaults kept from the original hard-coded values so that calling the
    # export methods without arguments behaves exactly as before.
    STR_DEFAULT_DB_PATH = "GLEIF_Data.db"
    STR_DEFAULT_COMPANY_DICT_PICKLE_PATH = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\pickled_objs\dict_company_names_leis.pickle"
    STR_DEFAULT_LEVEL_1_PICKLE_PATH = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\pickled_objs\df_level_1_data.pickle"

    def __init__(self, bool_Level_1 = False, bool_Level_2_Trees = False, bool_Level_2_Reporting_Exceptions = False):
        """
        @param: bool_Level_1 - click the first download button on the page
        @param: bool_Level_2_Trees - click the second download button on the page
        @param: bool_Level_2_Reporting_Exceptions - click the third download button on the page
        """
        self.bool_Level_1 = bool_Level_1
        self.bool_Level_2_Trees = bool_Level_2_Trees
        self.bool_Level_2_Reporting_Exceptions = bool_Level_2_Reporting_Exceptions

    def get_level_download_links(self):
        """
        This function uses selenium to open the GLEIF golden-copy download page, accept the
        cookie dialog, click the download button(s) selected by the constructor flags and then
        scrape the JSON download link out of the rendered page.

        @return: str_download_link - the "href" of the first anchor with class "gc-icon gc-icon--json"

        Raises IndexError when no such anchor is present in the page source, and whatever
        selenium raises when one of the waits times out.
        """
        driver_path = r"C:\Drivers\Google\chromedriver-win64\chromedriver-win64\chromedriver.exe"
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service)

        try:
            driver.get(url = "https://www.gleif.org/en/lei-data/gleif-golden-copy/download-the-golden-copy#/")

            # The cookie banner has to be dismissed before the download buttons are used.
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'CybotCookiebotDialogBodyButton'))
            )
            cookie_button.click()

            download_buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gc-download-button'))
            )

            if self.bool_Level_1:
                download_buttons[0].click()
            if self.bool_Level_2_Trees:
                download_buttons[1].click()
            if self.bool_Level_2_Reporting_Exceptions:
                download_buttons[2].click()

            # Capture the rendered HTML while the browser session is still alive.
            page_source = driver.page_source
        finally:
            # Always end the browser session, even when a wait times out,
            # so no browser / chromedriver process is left behind.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        list_json_anchors = soup.find_all("a" , class_ = "gc-icon gc-icon--json")
        str_download_link = list_json_anchors[0]["href"]

        return str_download_link

    def create_sql_instance(self, str_db_name, str_table_name):
        """
        Opens (or creates) the SQLite file "<str_db_name>.db", switches it to WAL journaling and
        makes sure a table with an autoincrement id and a TEXT "data" column exists.

        @param: str_db_name - database file name without the ".db" extension
        @param: str_table_name - name of the table to create if missing (interpolated into the SQL as-is)

        @return: (conn, cursor) - the open connection and a cursor on it; the caller closes the connection
        """
        # Connect to the SQLite database; wait up to 10 seconds on locks.
        conn = sqlite3.connect(f'{str_db_name}.db', timeout=10)
        conn.execute('PRAGMA journal_mode=WAL;')  # Enable WAL mode for concurrency
        cursor = conn.cursor()

        # Create the table with an id and JSON field (storing JSON as TEXT)
        cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS {str_table_name} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        data TEXT
        )
        ''')

        return conn, cursor

    def unpacking_GLEIF_zip_files(self , str_download_link , str_zip_file_path , str_unpacked_zip_file_path):
        """
        Downloads the zip archive behind str_download_link, stores it at str_zip_file_path and
        extracts it into str_unpacked_zip_file_path (created if missing).

        @param: str_download_link - url of the zip archive
        @param: str_zip_file_path - where the downloaded archive is written
        @param: str_unpacked_zip_file_path - directory the archive is extracted into

        @return: str_json_file_path - path of the first entry that os.listdir reports in the extraction directory

        Raises an HTTP error from requests when the download does not succeed.
        """
        with requests.Session() as session:
            zip_file = session.get(url = str_download_link)
            # Fail on an HTTP error instead of writing an error page to disk as a ".zip".
            zip_file.raise_for_status()

        with open(str_zip_file_path, 'wb') as f:
            f.write(zip_file.content)

        os.makedirs(str_unpacked_zip_file_path, exist_ok=True)
        with zipfile.ZipFile(str_zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(str_unpacked_zip_file_path)

        str_unpacked_zip_file_name = os.listdir(str_unpacked_zip_file_path)[0]
        # os.path.join uses the platform separator instead of a hard-coded backslash.
        str_json_file_path = os.path.join(str_unpacked_zip_file_path, str_unpacked_zip_file_name)

        return str_json_file_path

    def company_id_dictionary_generator(self, db_path = None, str_pickle_path = None):
        """
        Reads every row of the "Level_1_Data" table, parses the JSON in its "data" column and
        pickles a dictionary that maps Entity -> LegalName -> "$" to LEI -> "$".

        @param: db_path - SQLite file to read; defaults to "GLEIF_Data.db"
        @param: str_pickle_path - where the dictionary is pickled; defaults to the original hard-coded location
        """
        if db_path is None:
            db_path = self.STR_DEFAULT_DB_PATH
        if str_pickle_path is None:
            str_pickle_path = self.STR_DEFAULT_COMPANY_DICT_PICKLE_PATH

        table_name = "Level_1_Data"
        query = f"SELECT * FROM {table_name};"

        conn = sqlite3.connect(db_path)
        try:
            df = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()

        dict_company_names_leis = {}

        # Only the "data" column is needed, so iterate it directly instead of building a Series per row.
        for str_company_json in df["data"]:
            dict_company_data = json.loads(str_company_json)
            dict_company_names_leis[dict_company_data["Entity"]["LegalName"]["$"]] = dict_company_data["LEI"]["$"]

        with open(str_pickle_path , "wb") as file:
            pickle.dump(dict_company_names_leis , file)

    def get_all_level_1_data(self, db_path = None, str_pickle_path = None):
        """
        Loads the whole "Level_1_Data" table into a DataFrame and pickles that DataFrame.

        @param: db_path - SQLite file to read; defaults to "GLEIF_Data.db"
        @param: str_pickle_path - where the DataFrame is pickled; defaults to the original hard-coded location
        """
        if db_path is None:
            db_path = self.STR_DEFAULT_DB_PATH
        if str_pickle_path is None:
            str_pickle_path = self.STR_DEFAULT_LEVEL_1_PICKLE_PATH

        table_name = "Level_1_Data"
        query = f"SELECT * FROM {table_name};"

        conn = sqlite3.connect(db_path)
        try:
            df_level_1_data = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()

        with open(str_pickle_path , "wb") as file:
            pickle.dump(df_level_1_data , file)
    
    

In [None]:
def insert_relationship_meta_data(self, list_relationship_data):
    """
    Inserts a batch of relationship rows into GLEIF_relationship_data with one executemany call.

    @param: list_relationship_data - sequence of rows, each supplying the twelve values for the
            columns named in the statement, in that order
    """
    # Twelve %s placeholders, one per column listed in the statement.
    str_insert_statement = """
            INSERT INTO GLEIF_relationship_data (
            StartNode, EndNode, RelationshipType, 
            RelationshipStatus, RegistrationStatus, InitialRegistrationDate, LastUpdateDate, NextRenewalDate, 
                ManagingLOU,
                ValidationSources,
                ValidationDocuments,
                ValidationRegistration
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
    obj_cursor = self.cursor
    obj_cursor.executemany(str_insert_statement, list_relationship_data)

In [None]:
def extract_event_data(self , dict_data, base_keyword, target_keys):
    """
    Extracts and organizes data for repeated keys in a dictionary based on a base keyword and target keys.

    Keys that contain base_keyword and a numeric segment of the form "_<digits>_" are grouped by
    that number. Within each group the text after the numeric segment is matched against
    target_keys by substring, first match wins.

    :param dict_data: Dictionary containing the raw data.
    :param base_keyword: Common substring to identify relevant keys (e.g., "LegalEntityEvents").
    :param target_keys: List of substrings to match keys that should be included in the tuple.
    :return: A list of tuples, one for each numeric group in ascending order, holding the values
             for target_keys in order (None where a group has no value for a target).
    """
    # Local import: the regex module is not among the imports visible at the top of this file.
    import re

    grouped_data = {}

    # Group keys by their numeric segment
    for key, value in dict_data.items():
        if base_keyword not in key:
            continue

        match = re.search(r"_(\d+)_", key)
        if not match:
            continue  # Skip keys without a numeric segment

        str_index = match.group(1)
        index = int(str_index)

        # Split on the digits exactly as they appear in the key. Splitting on the int
        # would miss zero-padded indices such as "_01_" and leave the prefix in the suffix.
        key_suffix = key.split(f"_{str_index}_")[-1]

        dict_group = grouped_data.setdefault(index, {})

        # Store the value under the first target that occurs in the suffix
        for target in target_keys:
            if target in key_suffix:
                dict_group[target] = value
                break

    # One tuple per group, using None for targets the group never saw
    return [
        tuple(grouped_data[index].get(target) for target in target_keys)
        for index in sorted(grouped_data)
    ]

In [None]:
class GLEIFLevel1Data:
    """
    Downloads the GLEIF Level 1 archive and stores each entry of its "records" collection as a
    JSON string in the "Level_1_Data" table of the "GLEIF_Data" SQLite database.
    """

    # Rows inserted between commits during the bulk load; committing every row is very slow.
    INT_COMMIT_BATCH_SIZE = 10_000

    def __init__(self):
        self.str_level_1_unpacked_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_1_Data\Unpacked_Zip"
        self.str_level_1_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_1_Data\Level_1.zip"
        self.obj_backfill_helpers = GLEIF_Backill_Helpers(bool_Level_1 = True)

    def insert_json_data(self, json_data , conn , cursor , str_table_name, bool_commit = True):
        """
        Inserts one record, serialised with json.dumps, into the "data" column of str_table_name.

        @param: json_data - JSON-serialisable object to store
        @param: conn - open database connection, used for the commit
        @param: cursor - cursor on conn used for the insert
        @param: str_table_name - target table (interpolated into the SQL as-is)
        @param: bool_commit - commit right after the insert (default, the original behaviour);
                pass False to batch several inserts into one transaction
        """
        cursor.execute(f'''
        INSERT INTO {str_table_name}  (data)
        VALUES (?)
        ''', (json.dumps(json_data),))
        if bool_commit:
            conn.commit()

    def storing_GLEIF_data_in_database(self):
        """
        Runs the full Level 1 load: scrape the download link, download and unpack the archive,
        insert every record into "Level_1_Data", then delete the unpacked directory and the zip.
        The cleanup only runs when the load finished without an exception.
        """
        # The helper is stored as obj_backfill_helpers in __init__; the previous attribute
        # name used here (obj_data_helpers) was never assigned.
        obj_helpers = self.obj_backfill_helpers

        str_level_1_download_link = obj_helpers.get_level_download_links()
        str_json_file_path = obj_helpers.unpacking_GLEIF_zip_files(str_download_link = str_level_1_download_link , str_zip_file_path = self.str_level_1_zip_file_path , str_unpacked_zip_file_path = self.str_level_1_unpacked_zip_file_path)
        conn, cursor = obj_helpers.create_sql_instance(str_table_name = "Level_1_Data" , str_db_name = "GLEIF_Data")

        try:
            # NOTE(review): bigjson's README opens its input in binary mode ('rb');
            # confirm that text mode works with the installed version.
            with open(str_json_file_path, 'r' , encoding='utf-8') as file:
                dict_leis = bigjson.load(file)
                for int_row_count, dict_lei in enumerate(dict_leis["records"], start = 1):
                    self.insert_json_data(json_data = dict_lei.to_python() , conn = conn , cursor = cursor , str_table_name = "Level_1_Data" , bool_commit = False)
                    if int_row_count % self.INT_COMMIT_BATCH_SIZE == 0:
                        conn.commit()
            # Persist the final, partially filled batch.
            conn.commit()
        finally:
            # Release the database handle even when the load fails part-way.
            conn.close()

        obj_system_helpers = System_Helpers.SystemHelpers()
        obj_system_helpers.delete_file_directory(str_file_path = self.str_level_1_unpacked_zip_file_path , bool_directory = True)
        obj_system_helpers.delete_file_directory(str_file_path = self.str_level_1_zip_file_path , bool_file = True)
                    

In [None]:
class GLEIFLevel2Data:
    """
    Downloads the GLEIF Level 2 relationship archive and stores each entry of its "relations"
    collection as a JSON string in the "Level_2_Tree_Data" table of the "GLEIF_Data" SQLite database.
    """

    # Rows inserted between commits during the bulk load; committing every row is very slow.
    INT_COMMIT_BATCH_SIZE = 10_000

    def __init__(self):
        self.str_level_2_unpacked_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_2_Data\RR_CDF_Data\Unpacked_Zip"
        self.str_level_2_zip_file_path = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\Zip_Files\GLEIF\Level_2_Data\RR_CDF_Data\Level_2_RR_CDF.zip"
        self.obj_backfill_helpers = GLEIF_Backill_Helpers(bool_Level_2_Trees = True)

    def insert_json_data(self, json_data , conn , cursor , str_table_name, bool_commit = True):
        """
        Inserts one record, serialised with json.dumps, into the "data" column of str_table_name.

        @param: json_data - JSON-serialisable object to store
        @param: conn - open database connection, used for the commit
        @param: cursor - cursor on conn used for the insert
        @param: str_table_name - target table (interpolated into the SQL as-is)
        @param: bool_commit - commit right after the insert (default, the original behaviour);
                pass False to batch several inserts into one transaction
        """
        cursor.execute(f'''
        INSERT INTO {str_table_name}  (data)
        VALUES (?)
        ''', (json.dumps(json_data),))
        if bool_commit:
            conn.commit()

    def storing_GLEIF_data_in_database(self):
        """
        Runs the full Level 2 load: scrape the download link, download and unpack the archive,
        insert every relation into "Level_2_Tree_Data", then delete the unpacked directory and the zip.
        The cleanup only runs when the load finished without an exception.
        """
        # The helper is stored as obj_backfill_helpers in __init__; the previous attribute
        # name used here (obj_data_helpers) was never assigned.
        obj_helpers = self.obj_backfill_helpers

        str_level_2_download_link = obj_helpers.get_level_download_links()
        str_json_file_path = obj_helpers.unpacking_GLEIF_zip_files(str_download_link = str_level_2_download_link , str_zip_file_path = self.str_level_2_zip_file_path , str_unpacked_zip_file_path = self.str_level_2_unpacked_zip_file_path)
        conn, cursor = obj_helpers.create_sql_instance(str_table_name = "Level_2_Tree_Data" , str_db_name = "GLEIF_Data")

        try:
            # NOTE(review): bigjson's README opens its input in binary mode ('rb');
            # confirm that text mode works with the installed version.
            with open(str_json_file_path, 'r' , encoding='utf-8') as file:
                dict_relations = bigjson.load(file)
                for int_row_count, dict_lei in enumerate(dict_relations["relations"], start = 1):
                    self.insert_json_data(json_data = dict_lei.to_python() , conn = conn , cursor = cursor , str_table_name = "Level_2_Tree_Data" , bool_commit = False)
                    if int_row_count % self.INT_COMMIT_BATCH_SIZE == 0:
                        conn.commit()
            # Persist the final, partially filled batch.
            conn.commit()
        finally:
            # Release the database handle even when the load fails part-way.
            conn.close()

        obj_system_helpers = System_Helpers.SystemHelpers()
        obj_system_helpers.delete_file_directory(str_file_path = self.str_level_2_unpacked_zip_file_path , bool_directory = True)
        obj_system_helpers.delete_file_directory(str_file_path = self.str_level_2_zip_file_path , bool_file = True)

In [None]:
def rename_table(old_table_name, new_table_name, db_name = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\GLEIF_Data.db"):
    """
    Renames a table in the SQLite database.

    SQLite errors are reported with print and are not re-raised.

    Parameters:
        old_table_name (str): The current name of the table.
        new_table_name (str): The new name for the table.
        db_name (str): Path of the SQLite database file. Defaults to the pipeline's GLEIF_Data.db.

    Both table names are interpolated into the statement unquoted, so they must be
    trusted identifiers.
    """
    # Pre-bind so the finally clause is safe when connect() itself fails.
    conn = None
    try:
        # Connect to the database
        conn = sqlite3.connect(db_name, check_same_thread=False)
        cursor = conn.cursor()

        # Rename the table
        cursor.execute(f"ALTER TABLE {old_table_name} RENAME TO {new_table_name}")
        conn.commit()

        print(f"Table '{old_table_name}' has been renamed to '{new_table_name}'.")
    except sqlite3.Error as e:
        print(f"Error renaming table: {e}")
    finally:
        if conn is not None:
            conn.close()

In [None]:
import sqlite3

In [None]:
def flatten_dict(dict_input):
    """
    Collapses a nested dictionary into a single level. Every leaf value is stored under the
    path of keys that leads to it from the root, with the path segments joined by "_".
    The traversal is a depth-first walk, so leaves appear in the order they are encountered.

    @param: dict_input - nested dictionary (e.g. a parsed json response)

    @return: a new one-level dictionary; values that are not dictionaries are copied unchanged
    """
    def iter_leaves(dict_node , str_prefix):
        # Yield (joined key path, leaf value) pairs below dict_node in depth-first order.
        for str_name , obj_child in dict_node.items():
            str_path = f"{str_prefix}_{str_name}" if str_prefix else str_name
            if isinstance(obj_child , dict):
                yield from iter_leaves(obj_child , str_path)
            else:
                yield str_path , obj_child

    return dict(iter_leaves(dict_input , ''))


In [None]:
def rename_table(old_table_name, new_table_name, db_name = r"C:\Users\mattp\Work_Related\Systematic_Trading\Library\B_Notebooks\GLIEF_company_data_pipeline\GLEIF_Data.db"):
    """
    Renames a table in the SQLite database.

    SQLite errors are reported with print and are not re-raised.

    Parameters:
        old_table_name (str): The current name of the table.
        new_table_name (str): The new name for the table.
        db_name (str): Path of the SQLite database file. Defaults to the pipeline's GLEIF_Data.db.

    Both table names are interpolated into the statement unquoted, so they must be
    trusted identifiers.
    """
    # Pre-bind so the finally clause is safe when connect() itself fails.
    conn = None
    try:
        # Connect to the database
        conn = sqlite3.connect(db_name, check_same_thread=False)
        cursor = conn.cursor()

        # Rename the table
        cursor.execute(f"ALTER TABLE {old_table_name} RENAME TO {new_table_name}")
        conn.commit()

        print(f"Table '{old_table_name}' has been renamed to '{new_table_name}'.")
    except sqlite3.Error as e:
        print(f"Error renaming table: {e}")
    finally:
        if conn is not None:
            conn.close()

In [None]:
class GLEIFLevel2Data:
    """
    Loads the GLEIF Level 2 relationship file into the PostgreSQL table GLEIF_relationship_data,
    one INSERT per relationship.

    NOTE(review): this class calls flatten_dict / get_target_values on the helper object and passes
    str_unpacked_zip_file_path_name / str_zip_file_path_name to unpacking_GLEIF_zip_files. None of
    these appear on the GLEIF_Backill_Helpers class visible in this notebook, so this presumably
    targets a newer helper version - confirm before running.
    """

    def __init__(self , bool_log = True , str_db_name = "GLEIF_test_db" , bool_downloaded = True):
        """
        @param: bool_log - when True, (re)create "../logging/GLEIF_Backfill_level_2.log" and log at DEBUG level
        @param: str_db_name - PostgreSQL database to connect to
        @param: bool_downloaded - when False, scrape the download link and download / unpack the archive first

        Raises FileExistsError when "../logging" exists but is not a directory.
        """
        self.obj_backfill_helpers = GLEIF_Backill_Helpers(bool_Level_2_Trees = True)
        if bool_log:
            # Local import: logging is not among the imports visible at the top of this file.
            import logging

            logging_folder = "../logging"  # Adjust the folder path as necessary

            if os.path.exists(logging_folder):
                if not os.path.isdir(logging_folder):
                    raise FileExistsError(f"'{logging_folder}' exists but is not a directory. Please remove or rename the file.")
            else:
                os.makedirs(logging_folder)

            logging.basicConfig(filename=f"{logging_folder}/GLEIF_Backfill_level_2.log", level=logging.DEBUG, format='%(levelname)s: %(message)s', filemode="w")

        if not bool_downloaded:
            if not os.path.exists("../file_lib"):
                os.makedirs("../file_lib")

            str_level_2_download_link = self.obj_backfill_helpers.get_level_download_links()
            self.str_json_file_path = self.obj_backfill_helpers.unpacking_GLEIF_zip_files(str_download_link = str_level_2_download_link , str_unpacked_zip_file_path_name = "Level_2_unpacked" , str_zip_file_path_name = "Level_2.zip")

        # The path is always rebuilt from the first entry of the unpacked directory,
        # whether or not a download just happened.
        str_unpacked_directory = "../file_lib/Level_2_unpacked"
        str_unpacked_zip_file_name = os.listdir(str_unpacked_directory)[0]
        self.str_json_file_path = str_unpacked_directory + "//" + str_unpacked_zip_file_name

        # NOTE(review): psycopg2 is not imported in the visible part of this notebook, and the
        # credentials below are hard-coded - confirm the import exists and consider moving the
        # credentials out of the source.
        self.conn = psycopg2.connect(dbname = str_db_name, user="Matthew_Pisinski", password="matt1", host="localhost", port="5432")
        self.conn.autocommit = True
        self.cursor = self.conn.cursor()

    def create_table(self):
        """Creates GLEIF_relationship_data (serial id plus six TEXT columns) if it does not exist yet."""
        self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS GLEIF_relationship_data (
                id SERIAL PRIMARY KEY,
                StartNode TEXT,                
                EndNode TEXT,
                RelationshipType TEXT,
                RelationshipStatus TEXT,
                RegistrationStatus TEXT,
                NextRenewalDate TEXT
                );
            """)

        self.conn.commit()

    def insert_relationship(self, list_relationship_data):
        """
        Inserts one relationship row.

        @param: list_relationship_data - sequence whose first six items are StartNode, EndNode,
                RelationshipType, RelationshipStatus, RegistrationStatus and NextRenewalDate;
                raises IndexError when fewer than six items are supplied
        """
        tuple_row = tuple(list_relationship_data[int_position] for int_position in range(6))

        self.cursor.execute("""
            INSERT INTO GLEIF_relationship_data (
            StartNode, EndNode, RelationshipType, 
            RelationshipStatus, RegistrationStatus, NextRenewalDate
            ) VALUES (%s, %s, %s, %s, %s, %s);
        """, tuple_row)

    def process_relationships(self , dict_relationship):
        """
        Flattens one relationship record, picks the six target values out of it and inserts them.

        @param: dict_relationship - one relationship record as a plain python dictionary
        """
        dict_relationship_flattened = self.obj_backfill_helpers.flatten_dict(dict_input = dict_relationship)
        list_relationship_data = self.obj_backfill_helpers.get_target_values(dict_data = dict_relationship_flattened, subset_string = True, target_keys = ["StartNode" , "EndNode" , "RelationshipType" , "RelationshipStatus" , "RegistrationStatus" , "NextRenewalDate"])
        self.insert_relationship(list_relationship_data = list_relationship_data)

    def storing_GLEIF_data_in_database(self):
        """
        Creates the target table, streams the "relations" collection of the unpacked JSON file and
        inserts every relationship. The connection is closed afterwards, also when the load fails.
        """
        try:
            self.create_table()

            with open(self.str_json_file_path, 'r' , encoding='utf-8') as file:
                dict_relationship_data = bigjson.load(file)
                for dict_relationship in dict_relationship_data["relations"]:
                    dict_record = dict_relationship.to_python()
                    self.process_relationships(dict_relationship = dict_record)
        finally:
            # Release the database connection even when reading or inserting raises.
            self.conn.close()

In [None]:
def bulk_insert_using_copy(self , table_name , columns, data):
    r"""Perform a bulk insert using PostgreSQL COPY with an in-memory buffer.

    Each row is written as one tab-separated line in COPY text format. None is written as
    \N (the text-format NULL marker); backslash, tab, newline and carriage return inside a
    value are backslash-escaped so they cannot be read as delimiters or escape sequences.

    Args:
        table_name: Name of the table to insert into (interpolated into the statement as-is).
        columns: List of column names for the table.
        data: Iterable of rows (tuples or lists) with the data to be inserted.
    """
    # Local import: io is not among the imports visible at the top of this file.
    import io

    def _to_copy_field(value):
        # Render one value as a COPY text-format field.
        if value is None:
            return "\\N"
        # The backslash must be escaped first so the escapes added afterwards stay intact.
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace("\t", "\\t")
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    buffer = io.StringIO()
    buffer.writelines("\t".join(map(_to_copy_field, row)) + "\n" for row in data)
    buffer.seek(0)  # reset buffer position to the beginning

    # The '\t' escape puts a literal tab character into the statement as the delimiter.
    copy_query = f"COPY {table_name} ({', '.join(columns)}) FROM STDIN WITH DELIMITER '\t'"
    self.cursor.copy_expert(copy_query , buffer)
    self.conn.commit()