In [1]:
import pandas as pd
import numpy as np

import sys
import os

# Add the project root (one level up from notebooks/) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
from scripts.utils.dataloaders import (
    load_clarity_data,
    load_crossreference,
    load_overrides,
    load_aladdin_data
)
from scripts.utils.config import get_config
from scripts.utils.clarity_data_quality_control_functions import get_issuer_level_df

In [3]:
# Get the common configuration for the Pre-OVR-Analysis script.
config = get_config(
    "notebook-datafeeds-explorer", 
    interactive=False,
    auto_date= False,
    fixed_date= "202505",)


logger = config["logger"]
DATE = config["DATE"]
YEAR = config["YEAR"]
DATE_PREV = config["DATE_PREV"]
REPO_DIR = config["REPO_DIR"]
DATAFEED_DIR = config["DATAFEED_DIR"]
SRI_DATA_DIR = config["SRI_DATA_DIR"]
paths = config["paths"]
# Use the paths from config
DF_PREV_PATH = paths["PRE_DF_WOVR_PATH"]
DF_NEW_PATH = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]
#COMMITTEE_PATH = paths["COMMITTEE_PATH"]
## Define the output directory and file based on the configuration.
#OUTPUT_DIR = config["OUTPUT_DIR"]
#OUTPUT_FILE = OUTPUT_DIR / f"{DATE}_pre_ovr_analysis.xlsx"

In [4]:
target_permid = [
"5073622246",
"4295875200",
"4296393129",
"4296978549",
"5041079662",
]
target_cols = ["permid", "aladdin_id","issuer_name"]
targat_cols_brs = ["aladdin_id", "issuer_name"]
target_aladdin_id = [
    "000375",
    "003001",
    "007699",
    "010199",
    "055262",
    "059456",
    "072730",
    "M58534",
    "R48483",
    "F05671"]

In [5]:
datafeed = load_clarity_data(DF_NEW_PATH)
datafeed.loc[:,"permid"] = datafeed["permid"].astype(str)

2025-05-19 10:45:45,413 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv
2025-05-19 10:45:50,822 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv


In [6]:
for c in datafeed.columns:
    if c in ["aladdin_id","permid"]:
        print(f"{c}")

permid


In [7]:
brs_carteras = get_issuer_level_df(load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras"), "aladdin_id")
brs_benchmarks = get_issuer_level_df(load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks"), "aladdin_id")

2025-05-19 10:45:50,864 - scripts.utils.dataloaders - INFO - Loading portfolio_carteras data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202505_strategies_snt world_portf_bmks.xlsx
2025-05-19 10:46:12,310 - scripts.utils.dataloaders - INFO - Cleaning columns and converting data types for portfolio_carteras
2025-05-19 10:46:12,312 - scripts.utils.dataloaders - INFO - Converting column 'aladdin_id' to string.
2025-05-19 10:46:12,319 - scripts.utils.dataloaders - INFO - Converting column 'portfolio_id' to string.
2025-05-19 10:46:12,322 - scripts.utils.dataloaders - INFO - Successfully loaded Aladdin data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202505_strategies_snt world_portf_bmks.xlsx
2025-05-19 10:46:12,340 - scripts.utils.dataloaders - INFO - Loading portfolio_benchmarks data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_p

In [8]:
for df in [brs_carteras, brs_benchmarks]:
    df.loc[:,"aladdin_id"] = df["aladdin_id"].astype(str)

In [9]:
brs_carteras_filtered = brs_carteras[brs_carteras.aladdin_id.isin(target_aladdin_id)][targat_cols_brs]
brs_benchmarks_filtered = brs_benchmarks[brs_benchmarks.aladdin_id.isin(target_aladdin_id)][targat_cols_brs]

In [10]:
for df_name, df in zip(["carteras","benchamrs"],[brs_carteras_filtered, brs_benchmarks_filtered]):
    print(f"Dataframe: {df_name}'s columns:\n{df.info()}\n\n")

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 165 to 68345
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   aladdin_id   10 non-null     object
 1   issuer_name  10 non-null     object
dtypes: object(2)
memory usage: 240.0+ bytes
Dataframe: carteras's columns:
None


<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 72 to 32267
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   aladdin_id   10 non-null     object
 1   issuer_name  10 non-null     object
dtypes: object(2)
memory usage: 240.0+ bytes
Dataframe: benchamrs's columns:
None




In [11]:
raw_crossreference = load_crossreference(CROSSREFERENCE_PATH)
raw_crossreference.loc[:,"aladdin_id"] = raw_crossreference["aladdin_id"].astype(str)
raw_crossreference.loc[:,"permid"] = raw_crossreference["permid"].astype(str)

2025-05-19 10:47:28,940 - scripts.utils.dataloaders - INFO - Loading crossreference data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501.csv
2025-05-19 10:47:29,243 - scripts.utils.dataloaders - INFO - Cleaning columns and renaming crossreference data
2025-05-19 10:47:29,245 - scripts.utils.dataloaders - INFO - Successfully loaded crossreference from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501.csv


In [12]:
cross_filtered = raw_crossreference[raw_crossreference.aladdin_id.isin(target_aladdin_id)]
cross_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 4180 to 133848
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   aladdin_id   10 non-null     object
 1   issuer_name  10 non-null     object
 2   permid       10 non-null     object
 3   msci         10 non-null     object
 4   sust         9 non-null      object
dtypes: object(5)
memory usage: 480.0+ bytes


In [13]:
cross_filtered.head()

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
4180,375,BP PLC,4295894740,IID000000002140371,1008272712
4193,3001,AB SKF,4295890078,IID000000002144268,1008126516
4195,7699,BANCO SANTANDER SA,8589934205,IID000000002159005,1008202176
4202,10199,AKZO NOBEL NV,4295884730,IID000000002156536,1008204089
4273,55262,BASF SE,4295869198,IID000000002158653,1008261072


In [14]:
cross_aladdin_path = r"C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501_aladdin.csv"
cross_aladdin = pd.read_csv(cross_aladdin_path)

  cross_aladdin = pd.read_csv(cross_aladdin_path)


In [15]:
cross_aladdin.columns = cross_aladdin.columns.str.lower()
cross_aladdin.rename(columns={"aladdin_issuer":"aladdin_id", "clarity_ai":"permid"}, inplace=True)

In [16]:
cross_aladdin_filtered = cross_aladdin[cross_aladdin.aladdin_id.isin(target_aladdin_id)]

In [17]:
cross_aladdin_filtered

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
443,F05671,TESLA INC (PRE-REINCORPORATION),4297089638,IID000000005574410,
601,M58534,CVC CAPITAL PARTNERS PLC,5082516729,IID000000005133104,2011649000.0
54496,R48483,ACCOR SA,4295866829,IID000000002148785,1008527000.0


In [18]:
cross_aladdin.head()

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
0,Z27537,ARUNA ENERGIAS RENOVAVEIS LTDA,5090656669,IID000000006215085,
1,Z27359,ALAMOS GOLD INC,5093618813,IID000000006213797,
2,M83996,ALAMOS GOLD INC,5090737325,IID000000006213797,1008762000.0
3,Z27264,PNC GOMTI HIGHWAYS PVT LTD,5079209310,IID000000006213522,
4,Z27272,PNC BITHUR KANPUR HIGHWAYS PVT LTD,5079222200,IID000000006213486,


In [19]:
crossreference = raw_crossreference.drop_duplicates(subset=["permid"]).dropna(subset=["permid"])


In [20]:
crossreference.info()

<class 'pandas.core.frame.DataFrame'>
Index: 131889 entries, 0 to 134880
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   aladdin_id   131889 non-null  object
 1   issuer_name  131889 non-null  object
 2   permid       131889 non-null  object
 3   msci         62070 non-null   object
 4   sust         22592 non-null   object
dtypes: object(5)
memory usage: 6.0+ MB


In [21]:
datafeed = datafeed.merge(crossreference[["aladdin_id","permid"]], how="left", on="permid")
datafeed.loc[:,"aladdin_id"] = datafeed["aladdin_id"].astype(str)

In [22]:
for i, name in enumerate(datafeed.columns):
    if i < 5:
        print(f"{name}")

isin
instrument_type
issuer_name
issuer_country
gics2_industry


In [23]:
for c in datafeed.columns:
    if c in ["aladdin_id","permid"]:
        print(f"{c}")

permid
aladdin_id


In [24]:
for c in datafeed.columns:
    if "str_00" in c:
        print(c)

str_001_s
str_002_ec
str_003_ec
str_004_asec
str_005_ec
str_006_sec
str_007_sect
str_003b_ec


In [25]:
filtered_df = datafeed[(datafeed["aladdin_id"].isin(target_aladdin_id)) | (datafeed["permid"].isin(target_permid))][target_cols]

In [26]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 1340 to 35819
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   permid       8 non-null      object
 1   aladdin_id   8 non-null      object
 2   issuer_name  8 non-null      object
dtypes: object(3)
memory usage: 256.0+ bytes
