In [1]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# Add the project
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [3]:
from scripts.utils.dataloaders import load_crossreference, load_clarity_data
from scripts.utils.config import get_config

In [4]:
config = get_config(
    "notebook-crossreference-explorer", 
    interactive=False,
    auto_date= False,
    fixed_date="202505")

logger = config["logger"]
DATE = config["DATE"]
YEAR = config["YEAR"]
paths = config["paths"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
DF_PATH = paths["CURRENT_DF_WOUTOVR_PATH"]

In [5]:
target_cols_clarity = ["permid", "issuer_name"]
target_cols_brs = ["aladdin_id", "issuer_name"]
target_cols_crossreference = ["aladdin_id", "permid", "issuer_name"]
target_aladdin_id_erquest = [
    "000375",
    "003001",
    "007699",
    "010199",
    "055262",
    "059456",
    "072730",
    "M58534",
    "R48483",
    "F05671"
]
target_permid_request = [
    "5073622246",
    "4295875200", 
    "4296393129", 
    "4296978549",
    "5041079662",
]

In [6]:
datafeed = load_clarity_data(DF_PATH, target_cols=target_cols_clarity)

2025-05-19 16:01:02,207 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv
2025-05-19 16:01:02,996 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv


In [7]:
datafeed_permid_list = list(set(datafeed["permid"].unique()))

In [8]:
# let's load brs data
path_snt_world = Path(r"C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\snt_world_sntcor_corp_shares.xlsx")
snt_world_bmk = pd.read_excel(path_snt_world, sheet_name="portfolio_benchmarks", skiprows=3)
snt_world_ptf = pd.read_excel(path_snt_world, sheet_name="portfolio_carteras", skiprows=3)

In [9]:
# let's clean and filter the carteras and benchmark data
snt_word_dfs_list = [snt_world_bmk, snt_world_ptf]

snt_world_cleaned_list = []
common_cols = ["aladdin_id", "issuer_name"]
target_cols_bmk = common_cols + ["benchmark_id", "sntcore_share_corps_flag"]
target_cols_ptf = common_cols + ["portfolio_id", "sntcore_share_corps_flag"]
for df in snt_word_dfs_list:
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    df.dropna(subset=["aladdin_id"], inplace=True)
    if "portfolio_id" in df.columns:
        df.rename(columns={
            "sntcore_share_corps_flag":"sntcore_share_corps_flag_bool",
            "snt_core_breakdown_-_aligned_with_compliance_(sntcore_compl)_-_level_2":"sntcore_share_corps_flag"
            }, inplace=True)
        df_filtered = df[target_cols_ptf]
        df_filtered = df_filtered[df_filtered["sntcore_share_corps_flag"].isin(['Corporates' 'Shares'])]
        snt_world_cleaned_list.append(df_filtered)
    else:
        df_filtered = df[target_cols_bmk]
        df_filtered = df_filtered[df_filtered["sntcore_share_corps_flag"].isin(['Corporates' 'Shares'])]
        snt_world_cleaned_list.append(df_filtered)


snt_world_bmk_cleaned, snt_world_pth_cleaned = snt_world_cleaned_list
del snt_word_dfs_list

In [10]:
snt_world_pth_cleaned

Unnamed: 0,aladdin_id,issuer_name,portfolio_id,sntcore_share_corps_flag


In [11]:
for df in snt_world_cleaned_list:
    print(f"{df.columns}")
    print("\n")

Index(['aladdin_id', 'issuer_name', 'benchmark_id',
       'sntcore_share_corps_flag'],
      dtype='object')


Index(['aladdin_id', 'issuer_name', 'portfolio_id',
       'sntcore_share_corps_flag'],
      dtype='object')




In [12]:
for df in snt_world_cleaned_list:
    print(f"{df.sntcore_share_corps_flag.unique()}")
    print("\n")

[]


[]




In [13]:
# let's save the aladdin_ids of portfolios and benchmarks into a list
snt_world_aladdin_id_list = list(
    set(list(snt_world_bmk_cleaned.aladdin_id.unique()) + list(snt_world_pth_cleaned.aladdin_id.unique()))
    )

In [14]:
print(len(snt_world_aladdin_id_list))
print(len(datafeed_permid_list))

0
73401


In [15]:
crossreference_dir_path = Path(r"C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference")
cross_202501_path = crossreference_dir_path / "Aladdin_Clarity_Issuers_20250501.csv"
cross_202502_path = crossreference_dir_path / "Aladdin_Clarity_Issuers_20250201.csv"
cross_202503_path = crossreference_dir_path / "Aladdin_Clarity_Issuers_20250301.csv"
cross_202504_path = crossreference_dir_path / "Aladdin_Clarity_Issuers_20250401.csv"
cross_202505_path = crossreference_dir_path / "Aladdin_Clarity_Issuers_20250501.csv"
cross_brs_oneoff = crossreference_dir_path / "Aladdin_Clarity_Issuers_brs_oneoff.csv"

In [16]:
crossreference = load_crossreference(CROSSREFERENCE_PATH)
cross_202501 = load_crossreference(cross_202501_path)
cross_202502 = load_crossreference(cross_202502_path)
cross_202503 = load_crossreference(cross_202503_path)
cross_202504 = load_crossreference(cross_202504_path)
cross_202505 = load_crossreference(cross_202505_path)
cross_brs_oneoff = load_crossreference(cross_brs_oneoff)

2025-05-19 16:01:55,125 - scripts.utils.dataloaders - INFO - Loading crossreference data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501.csv
2025-05-19 16:01:55,594 - scripts.utils.dataloaders - INFO - Cleaning columns and renaming crossreference data
2025-05-19 16:01:55,597 - scripts.utils.dataloaders - INFO - Successfully loaded crossreference from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501.csv
2025-05-19 16:01:55,602 - scripts.utils.dataloaders - INFO - Loading crossreference data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501.csv
2025-05-19 16:01:56,024 - scripts.utils.dataloaders - INFO - Cleaning columns and renaming crossreference data
2025-05-19 16:01:56,027 - scripts.utils.dataloaders - INFO - Successfully loaded cro

In [17]:
cross_df_list = [
    cross_202501,
    cross_202502,
    cross_202503,
    cross_202504,
    cross_202505,
    cross_brs_oneoff
]

In [18]:
crosserefrence_dictionaries = {
    "cross_202501" : {
        "cross_df": cross_202501,
        "set_permid": set(cross_202501["permid"].unique()),
        "set_aladdin_id": set(cross_202501["aladdin_id"].unique()),
    },
    "cross_202502" : {
        "cross_df": cross_202502,
        "set_permid": set(cross_202502["permid"].unique()),
        "set_aladdin_id": set(cross_202502["aladdin_id"].unique()),
    },
    "cross_202503" : {
        "cross_df": cross_202503,
        "set_permid": set(cross_202503["permid"].unique()),
        "set_aladdin_id": set(cross_202503["aladdin_id"].unique()),
    },
    "cross_202504" : {
        "cross_df": cross_202504,
        "set_permid": set(cross_202504["permid"].unique()),
        "set_aladdin_id": set(cross_202504["aladdin_id"].unique()),
    },
    "cross_202505" : {
        "cross_df": cross_202505,
        "set_permid": set(cross_202505["permid"].unique()),
        "set_aladdin_id": set(cross_202505["aladdin_id"].unique()),
    },
    "cross_brs_oneoff" : {
        "cross_df": cross_brs_oneoff,
        "set_permid": set(cross_brs_oneoff["permid"].unique()),
        "set_aladdin_id": set(cross_brs_oneoff["aladdin_id"].unique()),
    }

}

datafeed_permid_set = set(datafeed_permid_list)
snt_world_aladdin_id_set = set(snt_world_aladdin_id_list)


for cross_name, cross_dict in crosserefrence_dictionaries.items():
    missing_permid = datafeed_permid_set - cross_dict["set_permid"]
    missing_aladdin_id = snt_world_aladdin_id_set - cross_dict["set_aladdin_id"]

    cross_dict["missing_permid"] = list(missing_permid)
    cross_dict["missing_aladdin_id"] = list(missing_aladdin_id)

    logger.info(f"\nFor the cross referecen of {cross_name.replace("cross_","")} we are missing:")
    logger.info(f"Missing permid in {cross_name}: {len(missing_permid)}")
    logger.info(f"Missing aladdin_id in {cross_name}: {len(missing_aladdin_id)}")
    logger.info("\n")

For the cross referecen of 202501 we are missing:
Missing permid in cross_202501: 18207
Missing aladdin_id in cross_202501: 0


For the cross referecen of 202502 we are missing:
Missing permid in cross_202502: 18271
Missing aladdin_id in cross_202502: 0


For the cross referecen of 202503 we are missing:
Missing permid in cross_202503: 18208
Missing aladdin_id in cross_202503: 0


For the cross referecen of 202504 we are missing:
Missing permid in cross_202504: 18207
Missing aladdin_id in cross_202504: 0


For the cross referecen of 202505 we are missing:
Missing permid in cross_202505: 18207
Missing aladdin_id in cross_202505: 0


For the cross referecen of brs_oneoff we are missing:
Missing permid in cross_brs_oneoff: 8147
Missing aladdin_id in cross_brs_oneoff: 0


