In [1]:
import pandas as pd
import numpy as np

import sys
import os

# Add the project root (one level up from notebooks/) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
from scripts.utils.dataloaders import (
    load_clarity_data,
    load_crossreference,
    load_overrides,
    load_aladdin_data
)
from scripts.utils.config import get_config
from scripts.utils.clarity_data_quality_control_functions import get_issuer_level_df

In [3]:
# Get the common configuration for the Pre-OVR-Analysis script.
config = get_config(
    "notebook-datafeeds-explorer", 
    interactive=False,
    auto_date= False,
    fixed_date= "202506",)


logger = config["logger"]
DATE = config["DATE"]
YEAR = config["YEAR"]
DATE_PREV = config["DATE_PREV"]
REPO_DIR = config["REPO_DIR"]
DATAFEED_DIR = config["DATAFEED_DIR"]
SRI_DATA_DIR = config["SRI_DATA_DIR"]
paths = config["paths"]
# Use the paths from config
DF_PREV_PATH = paths["PRE_DF_WOVR_PATH"]
DF_NEW_PATH = paths["CURRENT_DF_WOUTOVR_PATH"]
DF_NEW_WITH_OVR_PATH = paths["NEW_DF_WOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]
#COMMITTEE_PATH = paths["COMMITTEE_PATH"]
## Define the output directory and file based on the configuration.
#OUTPUT_DIR = config["OUTPUT_DIR"]
#OUTPUT_FILE = OUTPUT_DIR / f"{DATE}_pre_ovr_analysis.xlsx"

In [4]:
target_permid = [
"5073622246",
"4295875200",
"4296393129",
"4296978549",
"5041079662",
]
target_cols = ["permid", "aladdin_id","issuer_name"]
targat_cols_brs = ["aladdin_id", "issuer_name"]
target_aladdin_id = [
    "J66430"]
strategy_cols = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_007_sect",
    "str_sfdr8_aec",
    "scs_001_sec",
    "scs_002_ec",
]

clarity_test_col = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_007_sect",
    "art_8_basicos",
    "cs_001_sec",
    "cs_002_ec",
]

output_cols = target_cols + strategy_cols
datafeet_output_cols = target_cols + clarity_test_col

In [5]:
target_permids_may = [ pid for pid in
    set(
        ["5080149298",
        "5034819615",
        "5080149298",
        "5080149298",
        "5080154505",
        "5080149298",
        "5083858869",
        "5080149298",
        "5080154505",
        "5083858869",
        "4296397806",
        "5083858869",
        "5080149298",
        "5080149298",
        "5080154505",]
    )
]

In [6]:
print(target_permids_may)

['5083858869', '4296397806', '5034819615', '5080149298', '5080154505']


In [7]:
def find_conflicting_permid(df: pd.DataFrame) -> pd.DataFrame:
    # Step 1: Count unique permids per aladdin_id
    permid_counts = df.groupby("aladdin_id")["permid"].nunique()

    # Step 2: Filter aladdin_ids with more than one unique permid
    conflicting_ids = permid_counts[permid_counts > 1].index

    # Step 3: Return all rows in df where aladdin_id is in the list
    return df[df["aladdin_id"].isin(conflicting_ids)]

In [8]:
datafeed_without_ovr = load_clarity_data(DF_NEW_PATH)
datafeed_with_ovr = load_clarity_data(DF_NEW_WITH_OVR_PATH)
datafeed = load_clarity_data(DF_NEW_PATH)
datafeed.loc[:,"permid"] = datafeed["permid"].astype(str)

2025-06-03 12:27:55,236 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_without_ovr\2025\20250601_df_issuer_level_without_ovr.csv
2025-06-03 12:27:57,950 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_without_ovr\2025\20250601_df_issuer_level_without_ovr.csv
2025-06-03 12:27:57,951 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_with_ovr\202506_df_issuer_level_with_ovr.csv
2025-06-03 12:28:00,751 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_with_ovr\202506_df_issuer_level_with_ovr.csv
2025-06-03 12:28:00,754 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\data

In [9]:
for c in datafeed.columns:
    if c in ["aladdin_id","permid"]:
        print(f"{c}")

permid


In [33]:
#brs_carteras = get_issuer_level_df(load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras"), "aladdin_id")
#brs_benchmarks = get_issuer_level_df(load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks"), "aladdin_id")

#for df in [brs_carteras, brs_benchmarks]:
#    df.loc[:,"aladdin_id"] = df["aladdin_id"].astype(str)

#brs_carteras_filtered = brs_carteras[brs_carteras.aladdin_id.isin(target_aladdin_id)][targat_cols_brs]
#brs_benchmarks_filtered = brs_benchmarks[brs_benchmarks.aladdin_id.isin(target_aladdin_id)][targat_cols_brs]

#for df_name, df in zip(["carteras","benchamrs"],[brs_carteras_filtered, brs_benchmarks_filtered]):
#    print(f"Dataframe: {df_name}'s columns:\n{df.info()}\n\n")

In [10]:
raw_crossreference = load_crossreference(CROSSREFERENCE_PATH)
raw_crossreference.loc[:,"aladdin_id"] = raw_crossreference["aladdin_id"].astype(str)
raw_crossreference.loc[:,"permid"] = raw_crossreference["permid"].astype(str)

2025-06-03 12:28:12,999 - scripts.utils.dataloaders - INFO - Loading crossreference data from: C:\Users\n740789\Documents\esg-sri-repos\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250601.csv
2025-06-03 12:28:13,203 - scripts.utils.dataloaders - INFO - Cleaning columns and renaming crossreference data
2025-06-03 12:28:13,206 - scripts.utils.dataloaders - INFO - Successfully loaded crossreference from: C:\Users\n740789\Documents\esg-sri-repos\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250601.csv


In [11]:
cross_duplicated_permid = find_conflicting_permid(raw_crossreference)

In [12]:
cross_duplicated_permid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 9658 to 120713
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   aladdin_id   38 non-null     object
 1   issuer_name  38 non-null     object
 2   permid       38 non-null     object
 3   msci         22 non-null     object
 4   sust         4 non-null      object
dtypes: object(5)
memory usage: 1.8+ KB


In [13]:
cross_duplicated_permid.head()

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
9658,C06438,DAEKYO CO LTD,4295882050,IID000000002163522,
9659,C06438,DAEKYO CO LTD,5040055894,IID000000002163522,
16991,C85817,NATIONAL INSURANCE CO LTD,5000004308,,
16992,C85817,NATIONAL INSURANCE CO LTD,5000074128,,
30335,E91812,WITHUS CO LTD,4295882459,,


In [14]:
#cross_duplicated_permid.to_csv(r"C:\Users\n740789\Downloads\aladdin_issuers_with_multiple_permids.csv", index=False)

In [15]:
cross_filtered = raw_crossreference[raw_crossreference.aladdin_id.isin(target_aladdin_id)]
cross_filtered.head(10)

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
116602,J66430,ABBOTT IRELAND FINANCING DAC,5065365677,IID000000002923125,2006365832


In [16]:
cross_filtered.head()

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
116602,J66430,ABBOTT IRELAND FINANCING DAC,5065365677,IID000000002923125,2006365832


In [17]:
#cross_aladdin_path = r"C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250501_aladdin.csv"
#cross_aladdin = pd.read_csv(cross_aladdin_path)
#cross_aladdin.columns = cross_aladdin.columns.str.lower()
#cross_aladdin.rename(columns={"aladdin_issuer":"aladdin_id", "clarity_ai":"permid"}, inplace=True)
# cross_aladdin_filtered = cross_aladdin[cross_aladdin.aladdin_id.isin(target_aladdin_id)]

In [18]:
crossreference = raw_crossreference.drop_duplicates(subset=["permid"]).dropna(subset=["permid"])


In [19]:
crossreference.info()

<class 'pandas.core.frame.DataFrame'>
Index: 131889 entries, 0 to 134880
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   aladdin_id   131889 non-null  object
 1   issuer_name  131889 non-null  object
 2   permid       131889 non-null  object
 3   msci         62070 non-null   object
 4   sust         22592 non-null   object
dtypes: object(5)
memory usage: 6.0+ MB


In [20]:
print(datafeed.columns)

Index(['isin', 'instrument_type', 'issuer_name', 'issuer_country',
       'gics2_industry', 'region', 'company_inheriting', 'parent_company',
       'esg_score', 'esg_score_relevance',
       ...
       'maxexp_gas_fuels_prod', 'exp_gas_fuels_part', 'minexp_gas_fuels_part',
       'maxexp_gas_fuels_part', 'exp_oil_fuels_prod', 'minexp_oil_fuels_prod',
       'maxexp_oil_fuels_prod', 'exp_oil_fuels_part', 'minexp_oil_fuels_part',
       'maxexp_oil_fuels_part'],
      dtype='object', length=276)


In [23]:
datafeed = datafeed.merge(crossreference[["aladdin_id","permid"]], how="left", on="permid")


In [24]:
datafeed.loc[:,"aladdin_id"] = datafeed["aladdin_id"].astype(str)

In [25]:
datafeed_target_may = datafeed[datafeed.aladdin_id.isin(target_aladdin_id)][datafeet_output_cols].copy()

In [26]:
datafeed_target_may.head()

Unnamed: 0,permid,aladdin_id,issuer_name,str_001_s,str_002_ec,str_003_ec,str_003b_ec,str_004_asec,str_005_ec,str_006_sec,str_007_sect,art_8_basicos,cs_001_sec,cs_002_ec
36869,5065365677,J66430,Abbott Ireland Financing DAC,FLAG,EXCLUDED,FLAG,OK,FLAG,OK,FLAG,OK,FLAG,OK,OK


In [49]:
datafeed_target_may.to_csv(r"C:\Users\n740789\Downloads\failed_sambau_upload_issuer.csv", index=False)

In [50]:
for i, name in enumerate(datafeed.columns):
    if i < 5:
        print(f"{name}")

isin
instrument_type
issuer_name
issuer_country
gics2_industry


In [51]:
for c in datafeed.columns:
    if c in ["aladdin_id","permid"]:
        print(f"{c}")

permid
aladdin_id


In [52]:
for c in datafeed.columns:
    if "str_00" in c:
        print(c)

str_001_s
str_002_ec
str_003_ec
str_004_asec
str_005_ec
str_006_sec
str_007_sect
str_003b_ec


In [53]:
filtered_df = datafeed[(datafeed["aladdin_id"].isin(target_aladdin_id)) | (datafeed["permid"].isin(target_permid))][target_cols]

In [54]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 52806 to 66002
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   permid       2 non-null      object
 1   aladdin_id   2 non-null      object
 2   issuer_name  2 non-null      object
dtypes: object(3)
memory usage: 64.0+ bytes
