In [1]:
import sys
import os
import warnings
from pathlib import Path
from typing import List, Tuple
from itertools import chain

import numpy as np
import pandas as pd

from utils.dataloaders import (
    load_clarity_data,
    load_aladdin_data,
    load_crossreference,
    load_portfolios,
    load_overrides,
)
from utils.zombie_killer import main as zombie_killer

# Import the centralized configuration
from config import get_config

Invalid date format. Please use YYYYMM.
Invalid date format. Please use YYYYMM.
2025-03-24 23:21:04,953 - utils.get_date - INFO - Date format is valid. Date set to 202503.
Output directory is set to: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\zombie_list


In [2]:
# Get the common configuration for the Pre-OVR-Analysis script.
config = get_config("pre-ovr-analysis", interactive=False)
logger = config["logger"]
DATE = config["DATE"]
YEAR = config["YEAR"]
DATE_PREV = config["DATE_PREV"]
REPO_DIR = config["REPO_DIR"]
DATAFEED_DIR = config["DATAFEED_DIR"]
SRI_DATA_DIR = config["SRI_DATA_DIR"]
paths = config["paths"]

# Use the paths from config
df_1_path = paths["PRE_DF_WOVR_PATH"]
df_2_path = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]
COMMITTEE_PATH = paths["COMMITTEE_PATH"]

# Define the output directory and file based on the configuration.
OUTPUT_DIR = config["OUTPUT_DIR"]
OUTPUT_FILE = OUTPUT_DIR / f"{DATE}_pre_ovr_analysis.xlsx"

2025-03-24 23:21:09,226 - utils.get_date - INFO - Date format is valid. Date set to 202503.
Output directory is set to: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\pre-ovr-analysis


In [3]:
# Ignore workbook warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [4]:
# check that the date constants are set correctly
print(f"{DATE} and {YEAR} and {DATE_PREV}.")

202503 and 2025 and 202502.


In [5]:
# DEFINE TEST COLUMNS
test_col = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_004_asec",
    "str_005_ec",
    "cs_001_sec",
    "gp_esccp",
    "cs_003_sec",
    "cs_002_ec",
    "str_006_sec",
    "str_007_sect",
    "gp_esccp_22",
    "gp_esccp_25",
    "gp_esccp_30",
    "art_8_basicos",
    "str_003b_ec",
]
columns_to_read = ["permid", "isin", "issuer_name"] + test_col

Define Functions

In [6]:
def prepare_dataframes(
    df1: pd.DataFrame, df2: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Prepare DataFrames by setting the index and filtering for common indexes.
    Logs info about common, new, and missing indexes.
    """
    # Set index to 'permid' if it exists, otherwise assume it's already the index.
    if "permid" in df1.columns:
        df1 = df1.set_index("permid")
    else:
        logger.warning("df1 does not contain a 'permid' column. Using current index.")

    if "permid" in df2.columns:
        df2 = df2.set_index("permid")
    else:
        logger.warning("df2 does not contain a 'permid' column. Using current index.")

    common_indexes = df1.index.intersection(df2.index)
    new_indexes = df2.index.difference(df1.index)
    missing_indexes = df1.index.difference(df2.index)

    logger.info(f"Number of common indexes: {len(common_indexes)}")

    return (
        df1.loc[common_indexes],
        df2.loc[common_indexes],
        df2.loc[new_indexes],
        df1.loc[missing_indexes],
    )

In [7]:

def compare_dataframes(
    df1: pd.DataFrame, df2: pd.DataFrame, test_col: List[str]
) -> pd.DataFrame:
    """Compare DataFrames and create a delta DataFrame."""
    delta = df2.copy()
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Comparing column: {col}")
            diff_mask = df1[col] != df2[col]
            delta.loc[~diff_mask, col] = np.nan
    return delta


def get_exclusion_list(
    row: pd.Series, df1: pd.DataFrame, test_col: List[str]
) -> List[str]:
    """Get list of columns that changed to EXCLUDED."""
    return [
        col
        for col in test_col
        if row[col] == "EXCLUDED" and df1.loc[row.name, col] != "EXCLUDED"
    ]


def get_inclusion_list(
    row: pd.Series, df1: pd.DataFrame, test_col: List[str]
) -> List[str]:
    """Get list of columns that changed from EXCLUDED to any other value."""
    return [
        col
        for col in test_col
        if row[col] != "EXCLUDED" and df1.loc[row.name, col] == "EXCLUDED"
    ]


def check_new_exclusions(
    df1: pd.DataFrame, df2: pd.DataFrame, delta: pd.DataFrame, test_col: List[str]
) -> pd.DataFrame:
    """Check for new exclusions and update the delta DataFrame."""
    delta["new_exclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new exclusions in column: {col}")
            mask = (df1[col] != "EXCLUDED") & (df2[col] == "EXCLUDED")
            delta.loc[mask, "new_exclusion"] = True
            logger.info(f"Number of new exclusions in {col}: {mask.sum()}")
    delta["exclusion_list"] = delta.apply(
        lambda row: get_exclusion_list(row, df1, test_col), axis=1
    )
    return delta


def check_new_inclusions(
    df1: pd.DataFrame, df2: pd.DataFrame, delta: pd.DataFrame, test_col: List[str]
) -> pd.DataFrame:
    """Check for new inclusions and update the delta DataFrame."""
    delta["new_inclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new inclusions in column: {col}")
            mask = (df1[col] == "EXCLUDED") & (df2[col] != "EXCLUDED")
            delta.loc[mask, "new_inclusion"] = True
            logger.info(f"Number of new inclusions in {col}: {mask.sum()}")
    delta["inclusion_list"] = delta.apply(
        lambda row: get_inclusion_list(row, df1, test_col), axis=1
    )
    return delta


def finalize_delta(delta: pd.DataFrame, test_col: List[str]) -> pd.DataFrame:
    """Finalize the delta DataFrame by removing unchanged rows and resetting the index."""
    delta = delta.dropna(subset=test_col, how="all")
    delta.reset_index(inplace=True)
    delta["permid"] = delta["permid"].astype(str)
    logger.info(f"Final delta shape: {delta.shape}")
    return delta

In [8]:
# LOAD DATA
columns_to_read = ["permid", "isin", "issuer_name"] + test_col
# clarity data
df_1 = load_clarity_data(df_1_path, columns_to_read)
df_2 = load_clarity_data(df_2_path, columns_to_read)

2025-03-24 23:21:09,410 - utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202502_df_issuer_level_with_ovr.csv
2025-03-24 23:21:09,937 - utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202502_df_issuer_level_with_ovr.csv
2025-03-24 23:21:09,939 - utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250301_Equities_feed_IssuerLevel_sinOVR.csv
2025-03-24 23:21:10,642 - utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250301_Equities_feed_IssuerLevel_sinOVR.csv


In [9]:
# aladdin /brs data / perimetros
brs_carteras = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras")    
brs_benchmarks = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks")
crosreference = load_crossreference(CROSSREFERENCE_PATH)

2025-03-24 23:21:10,657 - utils.dataloaders - INFO - Loading portfolio_carteras data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202503_strategies_snt world_portf_bmks.xlsx
2025-03-24 23:21:27,356 - utils.dataloaders - INFO - editting column names for portfolio_carteras data
2025-03-24 23:21:27,359 - utils.dataloaders - INFO - Successfully loaded Aladdin data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202503_strategies_snt world_portf_bmks.xlsx
2025-03-24 23:21:27,361 - utils.dataloaders - INFO - Loading portfolio_benchmarks data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202503_strategies_snt world_portf_bmks.xlsx


In [10]:
crosreference.head()

Unnamed: 0,aladdin_id,issuer_name,permid,msci,sust
0,H56976,AUXIFIP SA,5001248970,IID000000002682941,
1,H57042,AVESTA TECHNOLOGIES LLC,4295900331,,
2,H57890,INFANT BACTERIAL THERAPEUTICS AB,5040202605,IID000000002761045,2004151000.0
3,H57901,MB SECURITIES JSC,4298118784,IID000000002761038,
4,H57917,BARCODE 121 HOLDING AS,5050698850,,


In [11]:
# sri/ESG Team data
overrides = load_overrides(OVR_PATH)

loading overrides columns ['clarityid', 'permid', 'brs_id', 'ovr_target', 'ovr_value']
2025-03-24 23:17:04,953 - utils.dataloaders - INFO - Loading overrides from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\overrides\20250318_overrides_db.xlsx


In [12]:
# Load portfolios & benchmarks dicts and lists
(
    portfolios_dict,
    benchmarks_dict,
    carteras_list,
    benchmarks_list,
    carteras_benchmarks_list,
) = load_portfolios(BMK_PORTF_STR_PATH)
logger.info(f"df_1 shape: {df_1.shape}, df_2 shape: {df_2.shape}")

2025-03-24 23:17:05,665 - utils.dataloaders - INFO - Loading portfolios portfolio_carteras from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202503_strategies_snt world_portf_bmks.xlsx
2025-03-24 23:17:23,175 - utils.dataloaders - INFO - Loading benchmarks from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202503_strategies_snt world_portf_bmks.xlsx
2025-03-24 23:18:21,590 - utils.dataloaders - INFO - Converting portfolios and benchmarks to dictionaries
2025-03-24 23:18:23,259 - utils.dataloaders - INFO - Removing 'nan' strings from the lists
2025-03-24 23:18:23,854 - utils.dataloaders - INFO - Creating flat lists for portfolios and benchmarks
2025-03-24 23:18:24,094 - pre-ovr-analysis - INFO - df_1 shape: (69264, 19), df_2 shape: (69278, 19)


In [13]:
# PREPARE DATA
(
    df_1, 
    df_2,
    new_issuer,
    out_issuer,
) = prepare_dataframes(df_1, df_2)

# log size of new and missing issuers
logger.info(f"Number of new issuers: {new_issuer.shape[0]}")
logger.info(f"Number of missing issuers: {out_issuer.shape[0]}")

2025-03-24 23:18:24,167 - pre-ovr-analysis - INFO - Number of common indexes: 69213
2025-03-24 23:18:24,257 - pre-ovr-analysis - INFO - Number of new issuers: 65
2025-03-24 23:18:24,259 - pre-ovr-analysis - INFO - Number of missing issuers: 51


In [14]:
out_issuer.head()


Unnamed: 0_level_0,isin,issuer_name,str_001_s,str_002_ec,str_003_ec,str_004_asec,str_005_ec,cs_001_sec,gp_esccp,cs_003_sec,cs_002_ec,str_006_sec,str_007_sect,gp_esccp_22,gp_esccp_25,gp_esccp_30,art_8_basicos,str_003b_ec
permid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4295857675,AU0000037889,Excelsior Capital Ltd,OK,OK,OK,EXCLUDED,OK,EXCLUDED,OK,EXCLUDED,OK,EXCLUDED,EXCLUDED,OK,OK,OK,OK,OK
4295857792,NO0004253238,Grand Hotel Management Pty Ltd,OK,OK,OK,EXCLUDED,OK,EXCLUDED,OK,EXCLUDED,OK,OK,EXCLUDED,OK,OK,OK,OK,OK
4295875321,IT0001082665,Comau SpA,OK,OK,OK,OK,OK,OK,OK,EXCLUDED,OK,OK,OK,OK,OK,OK,OK,OK
4295878072,JP3762100000,Novarese Inc (Pre-Merger),OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK
4295884500,MA0001006560,Societe Equipement Domestique et Menager SA,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK


In [15]:
for i in sorted(out_issuer.issuer_name.unique()):
    print(i)

AceBiomed Inc
Adani Properties Pvt Ltd
American Resources Offshore  Inc
American Savings Bank FSB
Arch Western Finance LLC
Autostock Inc
Bank Of Madera County
Belron UK Finance PLC
COMAU SCIAKY SA
Central Valley Community Bank
China Huarong Financial Leasing Co Ltd
Comau SpA
Credit Suisse Funds AG
Credit Suisse Private Advisors AG in Liquidation
DG Liquidation Corp
Emerald Haven Town and Country Pvt Ltd
Excelsior Capital Ltd
Express LLC
Folsom Lake Bank
Fundacion Bancaria Caja de Ahorros de Asturias
Grab A Grub Services Pvt Ltd
Grand Hotel Management Pty Ltd
Hertz Fleet Lease Funding LP Series 2016 1
Hitit Bilgisayar Hizmetleri AS
Home Trust Co
Hunter Ridge Holdings Inc
Ignite Epm Live Solutions Inc
Integracao Transmissora de Energia SA
International Energy Group LLC
Jiminy Inc
La Marocaine Vie
Marocaine De Banques Ste Generale SA
Mercado Credito Sociedade de Credito Financiamento e Investimento SA
Novarese Inc (Pre-Merger)
Orbita Funding 2020-1 PLC
Pocheon Power Co Ltd
RHB Securities 

In [16]:
# COMPARE DATA
delta = compare_dataframes(df_1, df_2, test_col)
delta = check_new_exclusions(df_1, df_2, delta, test_col)
delta = check_new_inclusions(df_1, df_2, delta, test_col)
delta = finalize_delta(delta, test_col)


2025-03-24 23:18:24,328 - pre-ovr-analysis - INFO - Comparing column: str_001_s
2025-03-24 23:18:24,343 - pre-ovr-analysis - INFO - Comparing column: str_002_ec
2025-03-24 23:18:24,352 - pre-ovr-analysis - INFO - Comparing column: str_003_ec
2025-03-24 23:18:24,361 - pre-ovr-analysis - INFO - Comparing column: str_004_asec
2025-03-24 23:18:24,368 - pre-ovr-analysis - INFO - Comparing column: str_005_ec
2025-03-24 23:18:24,379 - pre-ovr-analysis - INFO - Comparing column: cs_001_sec
2025-03-24 23:18:24,388 - pre-ovr-analysis - INFO - Comparing column: gp_esccp
2025-03-24 23:18:24,400 - pre-ovr-analysis - INFO - Comparing column: cs_003_sec
2025-03-24 23:18:24,408 - pre-ovr-analysis - INFO - Comparing column: cs_002_ec
2025-03-24 23:18:24,415 - pre-ovr-analysis - INFO - Comparing column: str_006_sec
2025-03-24 23:18:24,427 - pre-ovr-analysis - INFO - Comparing column: str_007_sect
2025-03-24 23:18:24,437 - pre-ovr-analysis - INFO - Comparing column: gp_esccp_22
2025-03-24 23:18:24,504 - 

In [17]:
overrides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7671 entries, 0 to 7670
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clarityid   7671 non-null   object
 1   permid      7671 non-null   object
 2   brs_id      5453 non-null   object
 3   ovr_target  7671 non-null   object
 4   ovr_value   7652 non-null   object
dtypes: object(5)
memory usage: 299.8+ KB


In [18]:
brs_carteras

Unnamed: 0,issuer_name,aladdin_id,security_description,portfolio_full_name,portfolio_id,str_001_s,str_002_ec,str_003b_ec,str_003_ec,str_004_asec,...,str_007_sect,str_008_sec,str_009_tec,gp_esccp_22,gp_esccp_25,gp_esccp_30,gp_essccp,scs_001_sec,scs_002_ec,scs_003_sec
0,SNT-WORLD,,,,,,,,,,...,,,,,,,,,,
1,11 BIT STUDIOS SA,F79892,11 BIT STUDIOS SA,Santander Prestiz Technologii i Innowacji (San...,PLSFIO0319,Ok,Ok,Ok,Ok,Ok,...,OK,,,OK,Ok,Ok,OK,OK,OK,OK
2,2I RETE GAS SPA,G70839,2I RETE GAS SPA,02.018.59973438020.0,PFC00659,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
3,2I RETE GAS SPA,G70839,2I RETE GAS SPA,ALDROVI S.L.,CPE05455,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
4,2I RETE GAS SPA,G70839,2I RETE GAS SPA,02.018.62238191020.0,PFC00665,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76087,ZURICH INSURANCE GROUP AG,B00344,ZURICH INSURANCE GROUP AG,"SANTANDER SOSTENIBLE ACCIONES, FI",FIG05273,Ok,Ok,Ok,Ok,Ok,...,OK,,,OK,Ok,Ok,OK,OK,OK,OK
76088,ZURICH INSURANCE GROUP AG,B00344,ZURICH INSURANCE GROUP AG,"MI PROYECTO SANTANDER ASG 2025 PENSIONES, F.P",FPG00028,Ok,Ok,Ok,Ok,Ok,...,OK,,,OK,Ok,Ok,OK,OK,OK,OK
76089,,,,,,,,,,,...,,,,,,,,,,
76090,Confidential - For Internal Use Only. Generate...,,,,,,,,,,...,,,,,,,,,,


In [19]:
delta.head()

Unnamed: 0,permid,isin,issuer_name,str_001_s,str_002_ec,str_003_ec,str_004_asec,str_005_ec,cs_001_sec,gp_esccp,...,str_007_sect,gp_esccp_22,gp_esccp_25,gp_esccp_30,art_8_basicos,str_003b_ec,new_exclusion,exclusion_list,new_inclusion,inclusion_list
0,4295863735,CND100054N18,Xiangcai Co Ltd,,,,,,,,...,,,,,,,False,[],True,[cs_003_sec]
1,5000551275,CND10005MTB1,Ningbo Construction Co Ltd,,,,,,,,...,,,,,,,True,[cs_003_sec],False,[]
2,4297957993,CA48213Y1079,Jushi Holdings Inc,OK,,,,,,,...,,,,,,,False,[],True,"[str_001_s, str_004_asec, cs_001_sec, cs_003_s..."
3,4297861403,KR6067571B83,NVH Korea Inc,OK,,,,,,,...,,,,,,,False,[],True,"[str_001_s, str_004_asec, cs_001_sec, cs_003_s..."
4,5000063368,KR6013871CB3,GMB Korea Corp,OK,,,,,,,...,,,,,,,False,[],True,"[str_001_s, str_004_asec, cs_001_sec, cs_003_s..."


In [22]:
delta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6753 entries, 0 to 6752
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   permid          6753 non-null   int64 
 1   isin            6753 non-null   object
 2   issuer_name     6753 non-null   object
 3   str_001_s       2072 non-null   object
 4   str_002_ec      929 non-null    object
 5   str_003_ec      725 non-null    object
 6   str_004_asec    2496 non-null   object
 7   str_005_ec      1074 non-null   object
 8   cs_001_sec      2326 non-null   object
 9   gp_esccp        9 non-null      object
 10  cs_003_sec      2179 non-null   object
 11  cs_002_ec       666 non-null    object
 12  str_006_sec     2072 non-null   object
 13  str_007_sect    2497 non-null   object
 14  gp_esccp_22     11 non-null     object
 15  gp_esccp_25     8 non-null      object
 16  gp_esccp_30     12 non-null     object
 17  art_8_basicos   522 non-null    object
 18  str_003b

In [20]:
# add aladdin_id to delta 
logger.info("Adding aladdin_id to delta")
delta = delta.merge(crosreference[["permid", "aladdin_id"]], on="permid", how="left")

ValueError: You are trying to merge on int64 and object columns for key 'permid'. If you wish to proceed you should use pd.concat

In [None]:
# check delta and brs_carteras columns in common
common_cols = delta.columns.intersection(brs_carteras.columns)
common_cols

In [28]:
merge_col = ['str_001_s', 'str_002_ec', 'str_003_ec', 'str_004_asec',
       'str_005_ec', 'str_006_sec', 'str_007_sect', 'gp_esccp_22',
       'gp_esccp_25', 'gp_esccp_30', 'str_003b_ec', 'aladdin_id']
# add str columns from brs
logger.info("Adding str columns from brs_carteras to delta")
delta = delta.merge(brs_carteras[merge_col], on="aladdin_id", how="left", suffixes=("", "_brs"))

In [None]:
delta.head()
stop

In [None]:
# SAVE RESULTS
output_file = "delta_results_beta.csv"
delta.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info("Analysis completed successfully.")
