In [1]:
import os
import re
import logging
import psycopg2
import sys
import io
import ijson
import concurrent.futures
import time
from tqdm import tqdm
from datetime import datetime, timezone
import math
from collections import defaultdict
from psycopg2 import sql
# Put the directory two levels above the current working directory on sys.path
# (presumably the repository root) before the project imports that follow.
# NOTE(review): this depends on the kernel's working directory at start-up.
current_directory = os.getcwd()
target_directory = os.path.abspath(os.path.join(current_directory, "..", ".."))
sys.path.append(target_directory)

from Production.Backfill import GLEIF_Backfill_Helpers
from Infrastructure import Scraper_helpers

In [2]:
class Testing_Level_1_data:
    """
    Cross-checks the GLEIF level-1 tables in the local PostgreSQL database
    against the records returned by the public GLEIF API.

    The ``helper_test_*`` methods never raise on a mismatch; they print (and,
    for the single-row tables, display) every row that differs so a whole
    subset can be reviewed in one pass.
    """

    # Flattened-key pattern for alternative names,
    # e.g. 'attributes_entity_otherNames_1_name' -> index '1', field 'name'.
    _OTHER_NAMES_PATTERN = re.compile(r"^attributes_entity_otherNames_(\d+)_(\w+)$")

    def __init__(self):
        self.obj_scraper_helpers = Scraper_helpers.Scraper_Helpers()
        self.obj_backfill_helpers = GLEIF_Backfill_Helpers.GLEIF_Backill_Helpers()

    # ------------------------------------------------------------------
    # Database access
    # ------------------------------------------------------------------
    def _connect(self, str_db_name="GLEIF_test_db"):
        """
        Open a PostgreSQL connection to ``str_db_name``.

        Connection settings can be overridden through the GLEIF_DB_USER,
        GLEIF_DB_PASSWORD, GLEIF_DB_HOST and GLEIF_DB_PORT environment
        variables; the fallbacks are the values this notebook always used.
        NOTE(review): the fallback password is still stored in the notebook;
        prefer supplying it through the environment.
        """
        return psycopg2.connect(
            dbname=str_db_name,
            user=os.environ.get("GLEIF_DB_USER", "Matthew_Pisinski"),
            password=os.environ.get("GLEIF_DB_PASSWORD", "matt1"),
            host=os.environ.get("GLEIF_DB_HOST", "localhost"),
            port=os.environ.get("GLEIF_DB_PORT", "5432"),
        )

    def _fetch_rows(self, query, params=None, str_db_name="GLEIF_test_db"):
        """
        Run one SELECT and return ``cursor.fetchall()``.

        The cursor and the connection are closed even when the query raises,
        so a failing query no longer leaks a connection.
        """
        conn = self._connect(str_db_name)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, params)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

    def get_lei_ids(self, str_db_name="GLEIF_test_db"):
        """Return every ``lei`` in ``gleif_entity_data``, ordered by ``lei``."""
        rows = self._fetch_rows(
            "SELECT lei FROM gleif_entity_data ORDER BY lei;",
            str_db_name=str_db_name,
        )
        return [row[0] for row in rows]

    def get_lei_data_limit(self, str_table_name, max_rows=100000):
        """
        Return at most ``max_rows`` rows of ``str_table_name`` ordered by ``lei``.

        Restored from the commented-out version because later notebook cells
        still call it. Each row is returned as a list without its first column.
        """
        query = sql.SQL("SELECT * FROM {table} ORDER BY lei LIMIT %s;").format(
            table=sql.Identifier(str_table_name)
        )
        return [list(row)[1:] for row in self._fetch_rows(query, (max_rows,))]

    def get_all_lei_data(self, str_table_name):
        """
        Return every row of ``str_table_name`` ordered by ``lei``.

        Each row is returned as a list without its first column. The table
        name is quoted with ``sql.Identifier``, the same way
        ``get_lei_data_for_leis`` does it.
        """
        query = sql.SQL("SELECT * FROM {table} ORDER BY lei;").format(
            table=sql.Identifier(str_table_name)
        )
        return [list(row)[1:] for row in self._fetch_rows(query)]

    def get_lei_data_for_leis(self, str_table_name, list_leis):
        """
        Fetch all rows from the specified table where 'lei' is in the provided list of LEIs.

        Args:
            str_table_name (str): Name of the database table.
            list_leis (list of str): List of LEI IDs to fetch records for.

        Returns:
            list of lists: Each inner list represents a row from the table,
            excluding the first column, ordered by 'lei'. Empty when
            ``list_leis`` is empty.
        """
        if not list_leis:
            return []

        # One '%s' placeholder per LEI; the values themselves are bound by the driver.
        placeholders = ','.join(['%s'] * len(list_leis))
        query = sql.SQL("SELECT * FROM {table} WHERE lei IN ({placeholders}) ORDER BY lei;").format(
            table=sql.Identifier(str_table_name),
            placeholders=sql.SQL(placeholders)
        )
        return [list(row)[1:] for row in self._fetch_rows(query, list(list_leis))]

    # ------------------------------------------------------------------
    # API access
    # ------------------------------------------------------------------
    def list_subset_create(self, list_input, batch_size):
        """Split ``list_input`` into consecutive slices of at most ``batch_size`` items."""
        return [
            list_input[i : i + batch_size]
            for i in range(0, len(list_input), batch_size)
        ]

    def fetch_lei_records_batch(self, batch_of_leis, max_attempts=2, retry_wait_seconds=60):
        """
        Fetches LEI records for a batch of LEIs in a single request.

        Args:
            batch_of_leis (list of str): LEIs placed in the ``filter[lei]`` parameter.
            max_attempts (int): Number of requests to try before giving up.
            retry_wait_seconds (int or float): Pause between two attempts.

        Returns:
            list: The JSON ``["data"]`` list of the first response with
            status 200. An empty list (never ``None``) when every attempt
            fails, so callers can always ``extend`` with the result.

        NOTE(review): ``page[size]`` is fixed at 200, so a batch with more
        than 200 LEIs would presumably come back truncated; confirm against
        the API's paging limits.
        """
        base_url = "https://api.gleif.org/api/v1/lei-records"
        lei_filter = ",".join(batch_of_leis)
        url = f"{base_url}?page[size]=200&filter[lei]={lei_filter}"

        for attempt in range(1, max_attempts + 1):
            response = self.obj_scraper_helpers.spotty_network(url=url)
            if response.status_code == 200:
                return response.json().get("data", [])
            if attempt < max_attempts:
                time.sleep(retry_wait_seconds)

        print(f"bad batch {batch_of_leis}")
        return []

    def multithread_lei_batches(self, list_batched_leis, batch_size=100, max_workers=10):
        """
        1. Splits the LEIs into batches of `batch_size`.
        2. Multi-threads the requests for each batch using `ThreadPoolExecutor`.
        3. Collects `["data"]` from each batch-response into one big list.

        Results are appended in completion order, not in input order; use
        ``sort_api_data`` afterwards when the order matters.
        """
        list_more_batches_leis = self.list_subset_create(batch_size=batch_size, list_input=list_batched_leis)

        list_data = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            list_futures = [
                executor.submit(self.fetch_lei_records_batch, batch)
                for batch in list_more_batches_leis
            ]
            for future in concurrent.futures.as_completed(list_futures):
                # A failed batch contributes nothing instead of crashing the whole run.
                list_data.extend(future.result() or [])

        return list_data

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    def get_dict_map(self, list_input):
        """Group the rows of ``list_input`` by their first element and return a plain dict."""
        dict_db_data = defaultdict(list)
        for item in list_input:
            dict_db_data[item[0]].append(item)
        # A plain dict so that a missing key is not silently created on lookup.
        return dict(dict_db_data)

    def unify_date(self, date_str):
        """
        Normalize a date value to a 'YYYY-MM-DD' string.

        * ``None`` is returned unchanged.
        * A string containing 'T' is parsed as ISO-8601 (a trailing 'Z' is read
          as UTC), converted to UTC and reduced to its date part; if parsing
          fails the original string is returned.
        * Any other string is assumed to already be 'YYYY-MM-DD'.
        * ``datetime`` objects are converted to UTC when timezone-aware and
          formatted; other objects exposing ``isoformat()`` (e.g. ``date``
          values handed back by the DB driver) use that; anything else is
          returned unchanged.
        """
        if date_str is None:
            return None

        if not isinstance(date_str, str):
            if isinstance(date_str, datetime):
                if date_str.tzinfo is not None:
                    date_str = date_str.astimezone(timezone.utc)
                return date_str.strftime('%Y-%m-%d')
            if hasattr(date_str, "isoformat"):
                return date_str.isoformat()
            return date_str

        # If there's no 'T', skip time-zone parsing entirely
        if 'T' not in date_str:
            return date_str  # e.g. "1969-04-17"

        try:
            # Replace 'Z' with '+00:00' so Python recognizes the time zone
            dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            return dt.astimezone(timezone.utc).strftime('%Y-%m-%d')
        except Exception as e:
            # Best effort: keep the raw value so the comparison can still report it.
            print(f"Error normalizing date '{date_str}': {e}")
            return date_str

    def clean_date_string(self, list_input, date_indexes, bool_many=False):
        """
        Normalize the values at ``date_indexes`` with ``unify_date``, in place.

        With ``bool_many`` set, ``list_input`` is a list of rows and every row
        is cleaned; otherwise ``list_input`` is a single row. The (mutated)
        input is returned.
        """
        list_rows = list_input if bool_many else [list_input]
        for list_row in list_rows:
            for idx in date_indexes:
                list_row[idx] = self.unify_date(list_row[idx])
        return list_input

    def extract_other_names(self, dict_record):
        """
        Extracts all 'otherNames_{index}_...' fields from the dictionary
        and returns a list of arrays, one array per 'otherNames' index.
        Each array will include [LEI, name, language, type]; a missing field
        is reported as None.
        """
        lei = dict_record.get("attributes_lei")

        # index -> {'name': ..., 'language': ..., 'type': ...}
        subdicts = {}
        for key, value in dict_record.items():
            match = self._OTHER_NAMES_PATTERN.match(key)
            if match:
                index, field_name = match.group(1), match.group(2)
                subdicts.setdefault(index, {})[field_name] = value

        return [
            [lei, fields.get("name"), fields.get("language"), fields.get("type")]
            for fields in subdicts.values()
        ]

    def sort_api_data(self, list_leis, list_unsorted):
        """
        Return ``list_unsorted`` ordered by the position of each record's
        ``"id"`` in ``list_leis``. Raises KeyError for an id not in ``list_leis``.
        """
        lei_index_map = {lei: idx for idx, lei in enumerate(list_leis)}
        return sorted(list_unsorted, key=lambda d: lei_index_map[d["id"]])

    # ------------------------------------------------------------------
    # Table comparisons
    # ------------------------------------------------------------------
    def _compare_single_row_table(self, list_dict_flat, str_table_name, target_keys, str_label, date_indexes=()):
        """
        Compare one-row-per-LEI table ``str_table_name`` with the API records.

        API and database rows are paired by LEI (first element of the database
        row) rather than by position, so one missing database row is reported
        once instead of shifting every later comparison.
        """
        list_leis_api = [dict_api["id"] for dict_api in list_dict_flat]
        list_db_data = self.get_lei_data_for_leis(list_leis=list_leis_api, str_table_name=str_table_name)
        dict_db_by_lei = {}
        for list_db in list_db_data:
            dict_db_by_lei.setdefault(list_db[0], list_db)

        for dict_flat in tqdm(list_dict_flat,
                              total=len(list_dict_flat),
                              desc="Processing Records",
                              unit="record"):
            list_api_values = self.obj_backfill_helpers.get_target_values(
                dict_data=dict_flat, subset_string="attributes", target_keys=target_keys
            )
            list_db = dict_db_by_lei.get(dict_flat["id"])

            if list_db is None:
                print(f"Assert False {str_label}: no database row for LEI {dict_flat['id']}")
                display(list_api_values)
                continue

            if date_indexes:
                list_api_values = self.clean_date_string(date_indexes=date_indexes, list_input=list_api_values)
                list_db = self.clean_date_string(date_indexes=date_indexes, list_input=list_db)

            if list_api_values != list_db:
                print(f"Assert False {str_label}")
                display(list_api_values)
                display(list_db)

    def helper_test_gleif_entity_data(self, list_dict_flat):
        """Testing all of the data in the gleif_entity_data table"""
        self._compare_single_row_table(
            list_dict_flat=list_dict_flat,
            str_table_name="gleif_entity_data",
            target_keys=["id", "legalName_name", "entity_jurisdiction", "entity_category", "entity_subCategory", "entity_legalForm_id", "entity_legalForm_other", "entity_status", "entity_creationDate", "entity_registeredAt_id", "entity_registeredAs"],
            str_label="entity_data",
            date_indexes=[8],  # entity_creationDate
        )

    def helper_test_other_names_data(self, list_dict_flat):
        """Testing all of the data in the gleif_other_legal_names table"""
        list_db_other_name_data = self.get_all_lei_data(str_table_name="gleif_other_legal_names")
        dict_other_name = self.get_dict_map(list_input=list_db_other_name_data)

        for dict_flat in tqdm(list_dict_flat, desc="Processing records"):
            for list_api_row in self.extract_other_names(dict_record=dict_flat):
                # [lei, name, language, type] -> drop language -> [lei, type, name],
                # presumably the column order of the table rows.
                list_api_row.pop(2)
                list_api_row[1], list_api_row[2] = list_api_row[2], list_api_row[1]

                # An LEI without any database rows is a failed case, not a crash.
                list_db_other_name_rows = dict_other_name.get(list_api_row[0], [])
                if list_api_row not in list_db_other_name_rows:
                    print(f"Test case failed for API row: {list_api_row}")
                    print(f"Database rows for LEI {list_api_row[0]}: {list_db_other_name_rows}")

    def helper_test_headquarters(self, list_dict_flat):
        """Testing all of the data in the gleif_headquartersaddress table"""
        self._compare_single_row_table(
            list_dict_flat=list_dict_flat,
            str_table_name="gleif_headquartersaddress",
            target_keys=["id", "entity_headquartersAddress_addressLines_1", "entity_headquartersAddress_addressLines_2", "entity_headquartersAddress_addressLines_3", "entity_headquartersAddress_addressLines_4", "entity_headquartersAddress_city", "entity_headquartersAddress_region", "entity_headquartersAddress_country", "entity_headquartersAddress_postalCode"],
            str_label="hq_data",
        )

    def helper_test_legal_address(self, list_dict_flat):
        """Testing all of the data in the gleif_legaladdress table"""
        self._compare_single_row_table(
            list_dict_flat=list_dict_flat,
            str_table_name="gleif_legaladdress",
            target_keys=["id", "entity_legalAddress_addressLines_1", "entity_legalAddress_addressLines_2", "entity_legalAddress_addressLines_3", "entity_legalAddress_addressLines_4", "entity_legalAddress_city", "entity_legalAddress_region", "entity_legalAddress_country", "entity_legalAddress_postalCode"],
            str_label="legal_address_data",
        )

    def helper_test_registration(self, list_dict_flat):
        """Testing all of the data in the gleif_registration_data table"""
        self._compare_single_row_table(
            list_dict_flat=list_dict_flat,
            str_table_name="gleif_registration_data",
            target_keys=["id", "registration_initialRegistrationDate", "registration_lastUpdateDate", "registration_status", "registration_nextRenewalDate", "registration_managingLou", "registration_corroborationLevel", "registration_validatedAt_id", "registration_validatedAs"],
            str_label="registration_data",
            date_indexes=[1, 2, 4],  # initialRegistrationDate, lastUpdateDate, nextRenewalDate
        )

    def helper_test_legal_events(self, list_dict_flat):
        """Testing all of the data in the gleif_legalentityevents table"""
        list_db_event_data = self.get_all_lei_data(str_table_name="gleif_legalentityevents")
        dict_legal_events = self.get_dict_map(list_input=list_db_event_data)

        for dict_flat in tqdm(list_dict_flat, desc="Processing records"):
            list_api_legal_possible = self.obj_backfill_helpers.extract_event_data(bool_test=True, dict_data=dict_flat, base_keyword="attributes_entity_eventGroups", target_keys=["groupType", "status", "type", "effectiveDate", "recordedDate", "validationDocuments"])
            if not list_api_legal_possible:
                continue

            str_lei = dict_flat["id"]

            # Prepend the LEI so each API row lines up with the database rows;
            # effectiveDate / recordedDate then sit at indexes 4 and 5.
            list_api_legal_possible = [[str_lei] + sublist for sublist in list_api_legal_possible]
            list_api_legal_possible = self.clean_date_string(bool_many=True, date_indexes=[4, 5], list_input=list_api_legal_possible)

            # Normalize the database rows once per LEI; a missing LEI yields no rows.
            list_db_legal_rows = self.clean_date_string(
                bool_many=True, date_indexes=[4, 5], list_input=dict_legal_events.get(str_lei, [])
            )

            for list_api_legal in list_api_legal_possible:
                if list_api_legal not in list_db_legal_rows:
                    print(f"Test case failed for API row: {list_api_legal}")
                    print(f"Database rows for LEI {list_api_legal[0]}: {list_db_legal_rows}")

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def testing_level_1_data(self, subset_size=100000):
        """
        Run the enabled table comparisons for every LEI in the database.

        The LEIs are processed in subsets of ``subset_size``; for each subset
        the API records are downloaded (200 LEIs per request, 5 workers),
        flattened, ordered like the subset, and passed to the helpers.
        """
        list_all_leis = self.get_lei_ids()
        list_batched = self.list_subset_create(list_input=list_all_leis, batch_size=subset_size)

        for list_subset_leis in list_batched:
            list_all_data = self.multithread_lei_batches(list_batched_leis=list_subset_leis, batch_size=200, max_workers=5)
            list_all_data_flattened = [self.obj_backfill_helpers.flatten_dict(dict_input=dict_data) for dict_data in list_all_data]
            list_all_data_flattened_sorted = self.sort_api_data(list_leis=list_subset_leis, list_unsorted=list_all_data_flattened)
            self.helper_test_gleif_entity_data(list_dict_flat=list_all_data_flattened_sorted)
            self.helper_test_other_names_data(list_dict_flat=list_all_data_flattened_sorted)
            self.helper_test_headquarters(list_dict_flat=list_all_data_flattened_sorted)
            self.helper_test_legal_address(list_dict_flat=list_all_data_flattened_sorted)
            # The registration comparison stays disabled here, as before.
            #self.helper_test_registration(list_dict_flat = list_all_data_flattened_sorted)
            self.helper_test_legal_events(list_dict_flat=list_all_data_flattened_sorted)
         
    

In [None]:
# Build the checker and run every enabled table comparison with the default
# subset size (100000 LEIs per subset); mismatches are printed below.
obj_testing_level_1_data = Testing_Level_1_data()
obj_testing_level_1_data.testing_level_1_data()

Processing Records:   6%|▌         | 5605/100000 [00:00<00:03, 28166.10record/s]

Assert False entity_data


['15955Y83JMXGHHL9WI28',
 'MARTVALD ASSETS AS',
 'NO',
 'GENERAL',
 None,
 'YI42',
 None,
 'ACTIVE',
 '2023-05-24',
 'RA000472',
 '931 588 192']

['15955Y83JMXGHHL9WI28',
 'MARTVALD ASSETS AS',
 'NO',
 'GENERAL',
 None,
 'YI42',
 None,
 'INACTIVE',
 '2023-05-24',
 'RA000472',
 '931 588 192']

Processing Records:  18%|█▊        | 18495/100000 [00:00<00:02, 30763.55record/s]

Assert False entity_data


['2138002TVM8VXUB9A180',
 'SIGNUM RATED II LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2004-10-05',
 'RA000086',
 'CR-140196']

['2138002TVM8VXUB9A180',
 'SIGNUM RATED II LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2004-10-05',
 'RA000086',
 '140196']

Processing Records:  54%|█████▍    | 54250/100000 [00:01<00:01, 33964.78record/s]

Assert False entity_data


['21380085NPGXFT3MMY37',
 'SIGNUM FINANCE AI LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2008-02-19',
 'RA000086',
 'DB-205173']

['21380085NPGXFT3MMY37',
 'Signum Finance AI Limited',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2008-02-19',
 'RA000086',
 '205173']

Processing Records: 100%|██████████| 100000/100000 [00:03<00:00, 32082.54record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 69143.85it/s]
Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 45748.13record/s]
Processing Records: 100%|██████████| 100000/100000 [00:01<00:00, 53411.55record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 76498.13it/s]
Processing Records: 100%|██████████| 100000/100000 [00:03<00:00, 32890.99record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 65672.08it/s]
Processing Records:  96%|█████████▌| 96239/100000 [00:02<00:00, 45593.38record/s]

Assert False hq_data


['213800TQ7Q5V1RPOCU36',
 'Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George Town',
 None,
 'KY',
 'KY1-1102']

['213800TQ7Q5V1RPOCU36',
 'Queensgate House',
 '113 South Church Street',
 'P. O. Box 1093',
 None,
 'George Town',
 None,
 'KY',
 'KY1-1102']

Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 44480.19record/s]
Processing Records:  92%|█████████▏| 92451/100000 [00:01<00:00, 51632.58record/s]

Assert False legal_address_data


['213800TQ7Q5V1RPOCU36',
 'Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George Town',
 None,
 'KY',
 'KY1-1102']

['213800TQ7Q5V1RPOCU36',
 'Queensgate House',
 '113 South Church Street',
 'P. O. Box 1093',
 None,
 'George Town',
 None,
 'KY',
 'KY1-1102']

Processing Records: 100%|██████████| 100000/100000 [00:01<00:00, 51412.95record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 74688.24it/s]
Processing Records:   0%|          | 0/100000 [00:00<?, ?record/s]

Assert False entity_data


['213800U6Q41HMWN2VX61',
 'SIGNUM RATED LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2003-11-13',
 'RA000086',
 'CR-130633']

['213800U6Q41HMWN2VX61',
 'Signum Rated Limited',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2003-11-19',
 'RA000086',
 '130633']

Processing Records:  34%|███▍      | 34173/100000 [00:01<00:02, 28585.91record/s]

Assert False entity_data


['213800ZFBIQ29HS5DM68',
 'KITHALPA RATED LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'ACTIVE',
 '2009-02-06',
 'RA000086',
 'DB-222976']

['213800ZFBIQ29HS5DM68',
 'KITHALPA RATED LIMITED',
 'KY',
 'GENERAL',
 None,
 '8888',
 'Exempt Company',
 'INACTIVE',
 '2009-02-06',
 'RA000086',
 '222976']

Processing Records:  40%|███▉      | 39865/100000 [00:01<00:02, 26475.63record/s]

Assert False entity_data


['222100958E3H3WDMDH39',
 'CP PARTNERS SPF S.A.',
 'LU',
 'GENERAL',
 None,
 '5GGB',
 None,
 'ACTIVE',
 '2010-11-16',
 'RA000432',
 'B156635']

['222100958E3H3WDMDH39',
 'CP PARTNERS SPF S.A.',
 'LU',
 'GENERAL',
 None,
 '5GGB',
 None,
 'INACTIVE',
 '2010-11-16',
 'RA000432',
 'B156635']

Processing Records:  93%|█████████▎| 92503/100000 [00:03<00:00, 24971.74record/s]

Assert False entity_data


['2549006HP4R945EL7I72',
 'SIDANA EXPORTS',
 'IN',
 'GENERAL',
 None,
 'A0PS',
 None,
 'ACTIVE',
 '2021-09-29',
 'RA000754',
 '07AAOFR4838H1ZR']

['2549006HP4R945EL7I72',
 'SIDANA EXPORTS',
 'IN',
 'GENERAL',
 None,
 'A0PS',
 None,
 'ACTIVE',
 '2001-09-19',
 'RA000754',
 '07AAOFR4838H1ZR']

Processing Records: 100%|██████████| 100000/100000 [00:03<00:00, 25983.30record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 56220.77it/s]
Processing Records:   0%|          | 0/100000 [00:00<?, ?record/s]

Assert False hq_data


['213800U6Q41HMWN2VX61',
 'Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George town',
 None,
 'KY',
 'KY1-1102']

['213800U6Q41HMWN2VX61',
 'P. O. Box 1093 Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George town',
 None,
 'KY',
 'KY1-1102']

Processing Records:  32%|███▏      | 31509/100000 [00:00<00:01, 38798.97record/s]

Assert False hq_data


['213800ZB5IYL8NS49A91',
 '357a Amhurst Road',
 None,
 None,
 None,
 'London',
 None,
 'GB',
 'N16 7UX']

['213800ZB5IYL8NS49A91',
 '28 Lordship Park 28 Lordship Park',
 None,
 None,
 None,
 'London',
 'GB-LND',
 'GB',
 'N16 5UD']

Processing Records:  51%|█████▏    | 51363/100000 [00:01<00:01, 37052.66record/s]

Assert False hq_data


['233000K4D2FT0PTGEV30',
 'Ipoly utca 8. 6. em. 9. ajtó',
 None,
 None,
 None,
 'Budapest',
 'HU-BU',
 'HU',
 '1133']

['233000K4D2FT0PTGEV30',
 'Podmaniczky utca 57. 2. em. 14. ajtó',
 None,
 None,
 None,
 'Budapest',
 'HU-BU',
 'HU',
 '1064']

Assert False hq_data


['253400FK33N26XQSXD38',
 'ПЕР БОЛЬШОЙ ОВЧИННИКОВСКИЙ, дом 16',
 None,
 None,
 None,
 'Москва',
 'RU-MOW',
 'RU',
 '115184']

['253400FK33N26XQSXD38',
 'ПЕР. БОЛЬШОЙ ОВЧИННИКОВСКИЙ, Д. 16',
 None,
 None,
 None,
 'Москва',
 'RU-MOW',
 'RU',
 '115184']

Assert False hq_data


['253400V4HY4PH2NE7E79',
 'проезд Георгия Митирева, дом 11',
 None,
 None,
 None,
 'Самара',
 'RU-SAM',
 'RU',
 '443079']

['253400V4HY4PH2NE7E79',
 'проезд Митирева, д. 11',
 None,
 None,
 None,
 'Самара',
 'RU-SAM',
 'RU',
 '443079']

Processing Records:  55%|█████▌    | 55074/100000 [00:01<00:01, 36453.02record/s]

Assert False hq_data


['253400XEMNQ5WBPN5635',
 'улица Декабристов, дом 1',
 None,
 None,
 None,
 'Казань',
 'RU-TA',
 'RU',
 '420066']

['253400XEMNQ5WBPN5635',
 'ул Декабристов, дом 1',
 None,
 None,
 None,
 'Казань',
 'RU-TA',
 'RU',
 '420066']

Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 36774.22record/s]
Processing Records:   0%|          | 0/100000 [00:00<?, ?record/s]

Assert False legal_address_data


['213800U6Q41HMWN2VX61',
 'Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George town',
 None,
 'KY',
 'KY1-1102']

['213800U6Q41HMWN2VX61',
 'P. O. Box 1093 Queensgate House',
 '113 South Church Street',
 None,
 None,
 'George town',
 None,
 'KY',
 'KY1-1102']

Processing Records:  33%|███▎      | 32598/100000 [00:00<00:01, 41350.13record/s]

Assert False legal_address_data


['213800ZB5IYL8NS49A91',
 '357a Amhurst Road',
 None,
 None,
 None,
 'London',
 None,
 'GB',
 'N16 7UX']

['213800ZB5IYL8NS49A91',
 '28 Lordship Park 28 Lordship Park',
 None,
 None,
 None,
 'London',
 'GB-LND',
 'GB',
 'N16 5UD']

Processing Records:  50%|█████     | 50049/100000 [00:01<00:01, 41357.48record/s]

Assert False legal_address_data


['233000K4D2FT0PTGEV30',
 'Ipoly utca 8. 6. em. 9. ajtó',
 None,
 None,
 None,
 'Budapest',
 'HU-BU',
 'HU',
 '1133']

['233000K4D2FT0PTGEV30',
 'Podmaniczky utca 57. 2. em. 14. ajtó',
 None,
 None,
 None,
 'Budapest',
 'HU-BU',
 'HU',
 '1064']

Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 42103.83record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 66592.36it/s]
Processing Records: 100%|██████████| 100000/100000 [00:03<00:00, 27809.22record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 58382.62it/s]
Processing Records:   3%|▎         | 3354/100000 [00:00<00:02, 33530.36record/s]

Assert False hq_data


['25490084TGNK392DPW06',
 'UNIT 501 AND 502 5TH FLOOR BUILDING 1B ECOSPACE SARJAPURA MARATHAHALLI',
 'OUTER RING ROAD',
 None,
 None,
 'Bangalore',
 'IN-KA',
 'IN',
 '560103']

['25490084TGNK392DPW06',
 'UNIT 501 AND 502, 5TH FLOOR, BUILDING 1B, ECOSPACE SARJAPURA, MARATHAHALLI',
 'OUTER RING ROAD',
 None,
 None,
 'Bangalore',
 'IN-KA',
 'IN',
 '560103']

Processing Records:  55%|█████▌    | 55070/100000 [00:01<00:01, 40620.25record/s]

Assert False hq_data


['254900GRKDGG2HM83485',
 'Måkekollen 206',
 None,
 None,
 None,
 'KRÅKERØY',
 'NO-30',
 'NO',
 '1679']

['254900GRKDGG2HM83485',
 'Einerbærveien 13',
 None,
 None,
 None,
 'FREDRIKSTAD',
 'NO-30',
 'NO',
 '1615']

Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 40373.14record/s]
Processing Records:  57%|█████▋    | 57383/100000 [00:01<00:00, 49771.91record/s]

Assert False legal_address_data


['254900GRKDGG2HM83485',
 'Måkekollen 206',
 None,
 None,
 None,
 'KRÅKERØY',
 'NO-30',
 'NO',
 '1679']

['254900GRKDGG2HM83485',
 'Einerbærveien 13',
 None,
 None,
 None,
 'FREDRIKSTAD',
 'NO-30',
 'NO',
 '1615']

Processing Records: 100%|██████████| 100000/100000 [00:02<00:00, 49896.67record/s]
Processing records: 100%|██████████| 100000/100000 [00:01<00:00, 72385.91it/s]


In [3]:
# Module-level helper instances used by the scratch cells that follow.
obj_scraper_helpers = Scraper_helpers.Scraper_Helpers()
obj_backfill_helpers = GLEIF_Backfill_Helpers.GLEIF_Backill_Helpers()
obj_testing_level_1_data = Testing_Level_1_data()

# Every LEI in gleif_entity_data, ordered by lei.
list_leis = obj_testing_level_1_data.get_lei_ids()

In [None]:
# Number of LEIs read from the database.
display(len(list_leis))

In [None]:
# Request up to 100000 gleif_entity_data rows and show the first one.
# NOTE(review): this needs `Testing_Level_1_data.get_lei_data_limit` to be
# defined; confirm it is not commented out in the class cell before running.
all_data = obj_testing_level_1_data.get_lei_data_limit(str_table_name = 'gleif_entity_data', max_rows=100000)
display(all_data[0])

In [None]:
# First LEI in the ordered list returned by get_lei_ids().
display(list_leis[0])

In [None]:
# Load gleif_other_legal_names rows (using the method's default row limit).
# NOTE(review): this needs `Testing_Level_1_data.get_lei_data_limit` to be
# defined; confirm it is not commented out in the class cell before running.
# NOTE(review): the display below prints every fetched row; consider a slice.
list_db_data = obj_testing_level_1_data.get_lei_data_limit(str_table_name = "gleif_other_legal_names")
display(list_db_data)

In [None]:
# Group the fetched rows by LEI once, keep the mapping, then show it.
dict_db_data = obj_testing_level_1_data.get_dict_map(list_input = list_db_data)
display(dict_db_data)

In [None]:
# Rows grouped under one specific LEI; raises KeyError if that LEI has no
# rows in the mapping, because get_dict_map returns a plain dict.
display(dict_db_data["2138001DVAHXYVS7AM84"])



In [7]:
def testing_level_1_data(subset_size = 100000):
    list_leis = obj_testing_level_1_data.get_lei_ids()
    
    list_batched = obj_testing_level_1_data.list_subset_create(list_input = list_leis , batch_size = subset_size)
    #int_total_num_subsets = math.ceil(len(list_leis)/list_batched)
    
    for list_leis in list_batched:
        list_all_data = obj_testing_level_1_data.multithread_lei_batches(list_leis = list_leis , batch_size = 200 , max_workers = 5) 
        list_all_data_flattened = [obj_backfill_helpers.flatten_dict(dict_input = dict_data) for dict_data in list_all_data]
        list_all_data_flattened_sorted = obj_testing_level_1_data.sort_api_data(list_leis = list_leis , list_unsorted = list_all_data_flattened)
        obj_testing_level_1_data.helper_test_gleif_entity_data(list_dict_flat = list_all_data_flattened_sorted)
        obj_testing_level_1_data.helper_test_other_names_data(list_dict_flat = list_all_data_flattened_sorted)
        obj_testing_level_1_data.helper_test_headquarters(list_dict_flat = list_all_data_flattened_sorted)
        obj_testing_level_1_data.helper_test_legal_address(list_dict_flat = list_all_data_flattened_sorted)
        obj_testing_level_1_data.helper_test_registration(list_dict_flat = list_all_data_flattened_sorted)
        obj_testing_level_1_data.helper_test_legal_events(list_dict_flat = list_all_data_flattened_sorted)

In [17]:
# Download the API records for the first 100000 LEIs:
# 200 LEIs per request, 5 worker threads.
list_all_data = obj_testing_level_1_data.multithread_lei_batches(list_batched_leis = list_leis[0:100000] , batch_size = 200 , max_workers = 5)

In [None]:
# Number of API records collected by the download above.
display(len(list_all_data))

In [None]:
# Flatten each API record with the backfill helper, then order the records by
# the position of their "id" in list_leis[0:100000].
list_all_data_flattened = [obj_backfill_helpers.flatten_dict(dict_input = dict_data) for dict_data in list_all_data]
list_all_data_flattened_sorted = obj_testing_level_1_data.sort_api_data(list_leis = list_leis[0:100000] , list_unsorted = list_all_data_flattened)

In [9]:
import pickle

In [245]:
# Write the flattened, sorted API records to Dict_flat.pickle in the working
# directory (presumably so a later session can skip the API download).
with open("Dict_flat.pickle" , "wb") as file:
    pickle.dump(list_all_data_flattened_sorted , file)

In [10]:
# Read the cached records back from Dict_flat.pickle.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a pickle that this notebook produced.
with open("Dict_flat.pickle" , "rb") as file:
    list_flat_use_case = pickle.load(file)

In [None]:
# Inspect one cached record (index 6).
display(list_flat_use_case[6])

In [None]:
# Run the backfill helper's event extraction on the same cached record, with
# the keyword and target keys that helper_test_legal_events passes.
display(obj_backfill_helpers.extract_event_data(bool_test= True, dict_data = list_flat_use_case[6] , base_keyword = "attributes_entity_eventGroups" , target_keys=["groupType", "status", "type", "effectiveDate", "recordedDate", "validationDocuments"]))

In [None]:
from psycopg2 import sql
from collections import defaultdict

def get_lei_data_for_leis(self, str_table_name, list_leis):
    """
    Fetch all rows from the specified table where 'lei' is in the provided list of LEIs.

    Module-level copy of the method of the same name; ``self`` is accepted
    for signature compatibility but is not used.

    Args:
        self: Unused.
        str_table_name (str): Name of the database table.
        list_leis (list of str): List of LEI IDs to fetch records for.

    Returns:
        list of lists: Each inner list represents a row from the table,
        excluding the first column, ordered by 'lei'. Empty when
        ``list_leis`` is empty (no connection is opened in that case).
    """
    if not list_leis:
        return []

    # NOTE(review): credentials are hard-coded here; consider reading them
    # from the environment instead.
    conn = psycopg2.connect(
        dbname="GLEIF_test_db",
        user="Matthew_Pisinski",
        password="matt1",
        host="localhost",
        port="5432"
    )
    # try/finally so the cursor and connection are released even when the
    # query fails.
    try:
        cursor = conn.cursor()
        try:
            # Dynamically create placeholders based on the number of LEIs
            placeholders = ','.join(['%s'] * len(list_leis))
            query = sql.SQL("SELECT * FROM {table} WHERE lei IN ({placeholders}) ORDER BY lei;").format(
                table=sql.Identifier(str_table_name),
                placeholders=sql.SQL(placeholders)
            )

            # Execute the query with the list of LEIs as parameters
            cursor.execute(query, list_leis)
            results = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Convert each row to a list and drop the first column
    return [list(row)[1:] for row in results]
