In [1]:
import sys
import os
import warnings
from pathlib import Path
from typing import List, Tuple
from itertools import chain
from collections import defaultdict

import numpy as np
import pandas as pd

from utils.dataloaders import (
    load_clarity_data,
    load_aladdin_data,
    load_crossreference,
    load_portfolios,
    load_overrides,
    save_excel
)
from utils.zombie_killer import main as zombie_killer

2025-03-27 20:07:24,259 - utils.get_date - INFO - Date format is valid. Date set to 202504.
Output directory for script zombie-killer is set to: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\zombie_list


In [2]:
# Import the centralized configuration
from config import get_config

# Get the common configuration for the Pre-OVR-Analysis script.
config = get_config("pre-ovr-analysis", interactive=False)
logger = config["logger"]
DATE = config["DATE"]
YEAR = config["YEAR"]
DATE_PREV = config["DATE_PREV"]
REPO_DIR = config["REPO_DIR"]
DATAFEED_DIR = config["DATAFEED_DIR"]
SRI_DATA_DIR = config["SRI_DATA_DIR"]
paths = config["paths"]

# Use the paths from config
df_1_path = paths["PRE_DF_WOVR_PATH"]
df_2_path = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]
COMMITTEE_PATH = paths["COMMITTEE_PATH"]

# Define the output directory and file based on the configuration.
OUTPUT_DIR = config["OUTPUT_DIR"]
OUTPUT_FILE = OUTPUT_DIR / f"{DATE}_pre_ovr_analysis.xlsx"

2025-03-27 20:07:28,237 - utils.get_date - INFO - Date format is valid. Date set to 202504.
Output directory for script pre-ovr-analysis is set to: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\pre-ovr-analysis


In [3]:
# Ignore workbook warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [4]:
# check that the date constants are set correctly
print(f"{DATE} and {YEAR} and {DATE_PREV}.")

202504 and 2025 and 202503.


In [5]:
# DEFINE TEST COLUMNS
# let's define necessary column lists

clarity_test_col = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "art_8_basicos",
    "str_006_sec",
    "cs_001_sec",
    "cs_002_ec",
]
columns_to_read = ["permid", "isin", "issuer_name"] + clarity_test_col
brs_test_cols = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_sfdr8_aec",
    "scs_001_sec",
    "scs_002_ec",
    "aladdin_id",
]
rename_dict = {
    "cs_001_sec": "scs_001_sec",
    "cs_002_ec": "scs_002_ec",
    "art_8_basicos": "str_sfdr8_aec",
}

delta_test_cols = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_sfdr8_aec",
    "scs_001_sec",
    "scs_002_ec",
]


Define Functions

In [6]:
def prepare_dataframes(
    base_df: pd.DataFrame, new_df: pd.DataFrame, target_index:str = "permid"
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Prepare DataFrames by setting the index and filtering for common indexes.
    Logs info about common, new, and missing indexes.
    """
    # Set index to 'permid' if it exists, otherwise assume it's already the index.
    logger.info(f"Setting index to {target_index}.")
    if target_index in base_df.columns:
        base_df = base_df.set_index(target_index)
    else:
        logger.warning("df1 does not contain a 'permid' column. Using current index.")

    if target_index in new_df.columns:
        new_df = new_df.set_index(target_index)
    else:
        logger.warning("df2 does not contain a 'permid' column. Using current index.")

    common_indexes = base_df.index.intersection(new_df.index)
    new_indexes = new_df.index.difference(base_df.index)
    missing_indexes = base_df.index.difference(new_df.index)

    logger.info(f"Number of common indexes: {len(common_indexes)}")

    return (
        base_df.loc[common_indexes],
        new_df.loc[common_indexes],
        new_df.loc[new_indexes],
        base_df.loc[missing_indexes],
    )

In [35]:
def compare_dataframes(
    df1: pd.DataFrame, df2: pd.DataFrame, test_col: List[str] = delta_test_cols
) -> pd.DataFrame:
    """Compare DataFrames and create a delta DataFrame."""
    delta = df2.copy()
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Comparing column: {col}")
            # Create a mask for differences between the two DataFrames
            diff_mask = df1[col] != df2[col]
            # Update the delta DataFrame with the differences
            delta.loc[~diff_mask, col] = np.nan
    return delta


def get_exclusion_list(
    row: pd.Series,
    df1: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
) -> List[str]:
    """Get list of columns that changed to EXCLUDED."""
    return [
        col
        for col in test_col
        if row[col] == "EXCLUDED" and df1.loc[row.name, col] != "EXCLUDED"
    ]


def get_inclusion_list(
    row: pd.Series,
    df1: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
) -> List[str]:
    """Get list of columns that changed from EXCLUDED to any other value."""
    return [
        col
        for col in test_col
        if row[col] != "EXCLUDED" and df1.loc[row.name, col] == "EXCLUDED"
    ]


def check_new_exclusions(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    suffix_level: str = "",
) -> pd.DataFrame:
    """Check for new exclusions and update the delta DataFrame."""
    delta["new_exclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new exclusions in column: {col}")
            mask = (df1[col] != "EXCLUDED") & (df2[col] == "EXCLUDED")
            delta.loc[mask, "new_exclusion"] = True
            logger.info(f"Number of new exclusions in {col}: {mask.sum()}")
    delta[f"exclusion_list{suffix_level}"] = delta.apply(
        lambda row: get_exclusion_list(row, df1, test_col), axis=1
    )
    return delta


def check_new_inclusions(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    suffix_level: str = "",
) -> pd.DataFrame:
    """Check for new inclusions and update the delta DataFrame."""
    delta["new_inclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new inclusions in column: {col}")
            mask = (df1[col] == "EXCLUDED") & (df2[col] != "EXCLUDED")
            delta.loc[mask, "new_inclusion"] = True
            logger.info(f"Number of new inclusions in {col}: {mask.sum()}")
    delta[f"inclusion_list{suffix_level}"] = delta.apply(
        lambda row: get_inclusion_list(row, df1, test_col), axis=1
    )
    return delta


def finalize_delta(
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    target_index: str = "permid",
) -> pd.DataFrame:
    """Finalize the delta DataFrame by removing unchanged rows and resetting the index."""
    delta = delta.dropna(subset=test_col, how="all")
    delta.reset_index(inplace=True)
    delta[target_index] = delta[target_index].astype(str)
    logger.info(f"Final delta shape: {delta.shape}")
    return delta

def override_dict(
        df:pd.DataFrame=None,
        id_col:str="aladdin_id",
        str_col:str="ovr_target",
        ovr_col:str="ovr_value",
        ):
    """
    Converts the overrides DataFrame to a dictionary.
    Args:
        df (pd.DataFrame): DataFrame containing the overrides.
        id_col (str): Column name for the identifier.
        str_col (str): Column name for the strategy.
        ovr_col (str): Column name for the override value.
    Returns:
        dict: Dictionary of overrides.
    """
    # 1. Groupd the df by issuer_id
    grouped = df.groupby(id_col)

    # 2. Initialise the dictionary
    ovr_dict = {}

    # 3. Iterate over each group (issuer id and its corresponding rows)
    for id, group_data in grouped:
        # 3.1. for each issuer id create a dict pairing the strategy and the override value
        ovr_result = dict(zip(group_data[str_col], group_data[ovr_col]))
        # 3.2. add the dict to the main dict
        ovr_dict[id] = ovr_result
    
    return ovr_dict

# define a function to add portfolio OR benchmark info to the delta_df
def add_portfolio_benchmark_info_to_df(
    portfolio_dict, delta_df, column_name="affected_portfolio_str"
):

    # Initialize a defaultdict to accumulate (portfolio_id, strategy_name) pairs
    aladdin_to_info = defaultdict(list)

    for portfolio_id, data in portfolio_dict.items():
        strategy = data.get("strategy_name")
        for a_id in data.get("aladdin_id", []):
            aladdin_to_info[a_id].append((portfolio_id, strategy))

    # Map each aladdin_id in delta_df to a list of accumulated portfolio info
    delta_df[column_name] = delta_df["aladdin_id"].apply(
        lambda x: list(chain.from_iterable(aladdin_to_info.get(x, [])))
    )

    return delta_df


def get_issuer_level_df(df: pd.DataFrame, idx_name: str) -> pd.DataFrame:
    """
    Removes duplicates based on idx_name, and drops rows where idx_name column contains
    NaN, None, or strings like "nan", "NaN", "none", or empty strings.

    Args:
        df (pd.DataFrame): Input dataframe.
        idx_name (str): Column name used for duplicate removal and NaN filtering.

    Returns:
        pd.DataFrame: Cleaned dataframe.
    """
    # Drop duplicates
    df_cleaned = df.drop_duplicates(subset=[idx_name])

    # Drop rows where idx_name is NaN/None or has invalid strings
    valid_rows = df_cleaned[idx_name].notnull() & (
        ~df_cleaned[idx_name]
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(["nan", "none", ""])
    )

    return df_cleaned[valid_rows]


def filter_non_empty_lists(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Returns a DataFrame filtered so that rows where the specified column contains
    an empty list are removed. Keeps rows where the column has a list with at least one element.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame
    - column (str): The name of the column to check
    
    Returns:
    - pd.DataFrame: Filtered DataFrame
    """
    return df[df[column].apply(lambda x: isinstance(x, list) and len(x) > 0)]

def filter_rows_with_common_elements(df, col1, col2):
    """
    Return rows of df where the lists in col1 and col2 have at least one common element.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        col1 (str): The name of the first column containing lists.
        col2 (str): The name of the second column containing lists.

    Returns:
        pd.DataFrame: A DataFrame filtered to include only rows where col1 and col2 have a common element.
    """
    logger.info(f"Filtering rows with common elements in columns: {col1} and {col2}")
    mask = df.apply(lambda row: bool(set(row[col1]).intersection(row[col2])), axis=1)
    return df[mask].copy()


In [8]:
# LOAD DATA
# clarity data
df_1 = load_clarity_data(df_1_path, columns_to_read)
df_2 = load_clarity_data(df_2_path, columns_to_read)

2025-03-27 20:07:28,328 - utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202503_df_issuer_level_with_ovr.csv
2025-03-27 20:07:28,784 - utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202503_df_issuer_level_with_ovr.csv
2025-03-27 20:07:28,786 - utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250401_Equities_feed_IssuerLevel_sinOVR.csv
2025-03-27 20:07:29,297 - utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250401_Equities_feed_IssuerLevel_sinOVR.csv


In [9]:
# let's rename columns in df_1 and df_2 using the rename_dict
df_1.rename(columns=rename_dict, inplace=True)
df_2.rename(columns=rename_dict, inplace=True)


In [10]:
# aladdin /brs data / perimetros
brs_carteras = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras")    
brs_benchmarks = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks")
crosreference = load_crossreference(CROSSREFERENCE_PATH)

2025-03-27 20:07:29,324 - utils.dataloaders - INFO - Loading portfolio_carteras data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202504_strategies_snt world_portf_bmks.xlsx
2025-03-27 20:07:43,081 - utils.dataloaders - INFO - Cleaning columns and converting data types for portfolio_carteras
2025-03-27 20:07:43,083 - utils.dataloaders - INFO - Converting column 'aladdin_id' to string.
2025-03-27 20:07:43,089 - utils.dataloaders - INFO - Converting column 'portfolio_id' to string.
2025-03-27 20:07:43,091 - utils.dataloaders - INFO - Successfully loaded Aladdin data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202504_strategies_snt world_portf_bmks.xlsx
2025-03-27 20:07:43,092 - utils.dataloaders - INFO - Loading portfolio_benchmarks data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202504_strategies_snt world_portf_bmks.

In [11]:
# add aladdin_id to df_1 and df_2
logger.info("Adding aladdin_id to clarity dfs")
df_1 = df_1.merge(crosreference[["permid", "aladdin_id"]], on="permid", how="left")
df_2 = df_2.merge(crosreference[["permid", "aladdin_id"]], on="permid", how="left")

2025-03-27 20:07:55,261 - pre-ovr-analysis - INFO - Adding aladdin_id to clarity dfs


In [12]:
# get BRS data at issuer level for becnhmarks without empty aladdin_id
brs_carteras_issuerlevel = get_issuer_level_df(brs_carteras, "aladdin_id")

In [13]:
# get BRS data at issuer level for becnhmarks without empty aladdin_id
brs_benchmarks_issuerlevel = get_issuer_level_df(brs_benchmarks, "aladdin_id")

In [14]:
# sri/ESG Team data
overrides = load_overrides(OVR_PATH)
# rename column brs_id to aladdin_id
overrides.rename(columns={"brs_id": "aladdin_id"}, inplace=True)

loading overrides columns ['clarityid', 'permid', 'brs_id', 'ovr_target', 'ovr_value']
2025-03-27 20:07:55,462 - utils.dataloaders - INFO - Loading overrides from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\overrides\overrides_db.xlsx


In [15]:
ovr_dict = override_dict(overrides)

In [16]:
# Load portfolios & benchmarks dicts
(
    portfolio_dict,
    benchmark_dict,
) = load_portfolios(path_pb=BMK_PORTF_STR_PATH, path_committe=COMMITTEE_PATH)

2025-03-27 20:07:56,631 - utils.dataloaders - INFO - Loading portfolios from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202504_strategies_snt world_portf_bmks.xlsx
2025-03-27 20:08:13,435 - utils.dataloaders - INFO - Loading benchmarks from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202504_strategies_snt world_portf_bmks.xlsx
2025-03-27 20:08:23,900 - utils.dataloaders - INFO - Loading strategy data for portfolios from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\portfolios_committees\portfolio_lists.xlsx
2025-03-27 20:08:23,931 - utils.dataloaders - INFO - Loading strategy data for benchmarks from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\portfolios_committees\portfolio_lists.xlsx
2025-03-27 20:08:23,946 - utils.dataloaders - INFO - Benchmark ID 'FIFSANEUR' appears in multiple strategies
2025-03-27 20:08:23,9

In [17]:
for i, (k,v) in enumerate(portfolio_dict.items()):
    if i == 10:
        break
    print(k, v)

ADM0PSA1 {'aladdin_id': ['J66430', 'R57958', 'R63005', 'R63005', 'F95505', 'F95505', 'B95488', 'I20488', 'I20488', 'I20488', 'R49188', '052591', 'R83199', '007699', 'R63609', 'H44516', '077779', '05564X', 'N15397', '128003', '128003', 'D43041', '191216', 'C96217', 'C70614', 'R50818', 'R57820', 'R60098', 'R60098', 'D34426', '36827E', '36827E', 'R51121', '24820T', 'F60954', 'C05702', 'C05702', 'D85398', 'FR0570', 'FR0570', 'FR0570', 'FR0570', 'R62757', 'R62757', 'R62757', 'K23944', '38142U', '38142U', 'F92008', 'R59316', 'C70514', 'R71927', 'C05671', 'R63519', 'R76481', 'R76481', 'R62699', 'R62699', '459200', '616880', 'R57939', 'R57939', 'R62038', 'R62038', 'R62038', 'C97848', 'R63600', 'R68028', '580135', '580135', 'G97316', 'D90986', 'R96795', 'MIZ00A', 'R55597', 'R65300', 'R68589', 'R57898', '128009', '128009', '713448', '713448', 'R57931', '742718', 'F13060', '760325', 'I40729', 'R65375', 'C74380', 'C74380', 'R21880', 'R21880', 'R57942', '830505', 'R49692', 'R49692', 'B89867', 'B898

In [18]:
common_cols = set(df_1.columns) & set(df_2.columns) & set(brs_carteras_issuerlevel.columns)
common_cols = sorted(list(common_cols))

START PRE-OVR ANALISIS

In [19]:
# PREPARE DATA CLARITY LEVEL
(
    df_1, 
    df_2,
    new_issuers_clarity,
    out_issuer_clarity,
) = prepare_dataframes(df_1, df_2)

# log size of new and missing issuers
logger.info(f"Number of new issuers: {new_issuers_clarity.shape[0]}")
logger.info(f"Number of missing issuers: {out_issuer_clarity.shape[0]}")

2025-03-27 20:08:24,144 - pre-ovr-analysis - INFO - Setting index to permid.
2025-03-27 20:08:24,265 - pre-ovr-analysis - INFO - Number of common indexes: 69222
2025-03-27 20:08:24,326 - pre-ovr-analysis - INFO - Number of new issuers: 106
2025-03-27 20:08:24,328 - pre-ovr-analysis - INFO - Number of missing issuers: 56


In [20]:
# reset index for new_issuers_clarity and out_issuer_clarity
new_issuers_clarity.reset_index(inplace=True)
out_issuer_clarity.reset_index(inplace=True)
# drop isin from out_issuer_clarity and new_issuers_clarity
out_issuer_clarity.drop(columns=["isin"], inplace=True)
new_issuers_clarity.drop(columns=["isin"], inplace=True)

# remember to remove empyt empyt aladin id
new_issuers_clarity.head()

Unnamed: 0,permid,issuer_name,str_001_s,str_002_ec,str_003_ec,str_004_asec,str_005_ec,scs_001_sec,scs_002_ec,str_006_sec,str_sfdr8_aec,str_003b_ec,aladdin_id
0,4295857675,Excelsior Capital Ltd,OK,OK,OK,EXCLUDED,OK,EXCLUDED,OK,EXCLUDED,OK,OK,D73574
1,4295858270,Poseidon Nickel Ltd,OK,OK,OK,EXCLUDED,OK,EXCLUDED,OK,EXCLUDED,OK,OK,D70698
2,4295858807,Silver Lake Deflector Pty Ltd,OK,OK,OK,EXCLUDED,OK,EXCLUDED,OK,OK,OK,OK,
3,4295859680,Azevedo & Travassos SA,OK,OK,OK,EXCLUDED,OK,OK,OK,OK,OK,OK,F80754
4,4295860216,Dimed SA Distribuidora de Medicamentos,OK,OK,OK,OK,OK,OK,OK,OK,OK,OK,F85102


In [21]:
# PREPARE DATA CARTERAS BRS LEVEL
(
    brs_df, 
    clarity_df,
    in_clarity_but_not_in_brs,
    in_brs_but_not_in_clarity,
) = prepare_dataframes(brs_carteras_issuerlevel, df_2, target_index="aladdin_id")

# log size of new and missing issuers
logger.info(f"Number issuers in clarity but not Aladdin: {in_clarity_but_not_in_brs.shape[0]}")
logger.info(f"Number issuers in Aladdin but not Clarity: {in_brs_but_not_in_clarity.shape[0]}")

2025-03-27 20:08:24,367 - pre-ovr-analysis - INFO - Setting index to aladdin_id.
2025-03-27 20:08:24,509 - pre-ovr-analysis - INFO - Number of common indexes: 2552
2025-03-27 20:08:24,584 - pre-ovr-analysis - INFO - Number issuers in clarity but not Aladdin: 66670
2025-03-27 20:08:24,586 - pre-ovr-analysis - INFO - Number issuers in Aladdin but not Clarity: 1303


In [22]:
# PREPARE DATA BENCHMARK BRS LEVEL
(
    brs_df_benchmarks, 
    clarity_df_benchmarks,
    in_clarity_but_not_in_brs_benchmarks,
    in_brs_benchmark_but_not_in_clarity,
) = prepare_dataframes(brs_benchmarks_issuerlevel, df_2, target_index="aladdin_id")

# log size of new and missing issuers
logger.info(f"Number issuers in clarity but not benchmarks: {in_clarity_but_not_in_brs_benchmarks.shape[0]}")
logger.info(f"Number issuers in benchmarks but not Clarity: {in_brs_benchmark_but_not_in_clarity.shape[0]}")

2025-03-27 20:08:24,598 - pre-ovr-analysis - INFO - Setting index to aladdin_id.
2025-03-27 20:08:24,746 - pre-ovr-analysis - INFO - Number of common indexes: 2552
2025-03-27 20:08:24,855 - pre-ovr-analysis - INFO - Number issuers in clarity but not benchmarks: 66670
2025-03-27 20:08:24,857 - pre-ovr-analysis - INFO - Number issuers in benchmarks but not Clarity: 1303


In [23]:
# COMPARE DATA
logger.info("comparing clarity dataframes")
delta_clarity = compare_dataframes(df_1, df_2)
delta_clarity = check_new_exclusions(df_1, df_2, delta_clarity)
delta_clarity = check_new_inclusions(df_1, df_2, delta_clarity)
delta_clarity = finalize_delta(delta_clarity)
logger.info("checking impact compared to BRS portfolio data")
delta_brs = compare_dataframes(brs_df, clarity_df)
delta_brs = check_new_exclusions(brs_df, clarity_df, delta_brs, suffix_level="_brs")
delta_brs = check_new_inclusions(brs_df, clarity_df, delta_brs, suffix_level="_brs")
delta_brs = finalize_delta(delta_brs, target_index="aladdin_id")
logger.info("checking impact compared to BRS benchmarks data")
delta_benchmarks = compare_dataframes(brs_df_benchmarks, clarity_df_benchmarks)
delta_benchmarks = check_new_exclusions(brs_df_benchmarks, clarity_df_benchmarks, delta_benchmarks, suffix_level="_brs")
delta_benchmarks = check_new_inclusions(brs_df_benchmarks, clarity_df_benchmarks, delta_benchmarks, suffix_level="_brs")
delta_benchmarks = finalize_delta(delta_benchmarks, target_index="aladdin_id")


2025-03-27 20:08:24,871 - pre-ovr-analysis - INFO - comparing clarity dataframes
2025-03-27 20:08:24,901 - pre-ovr-analysis - INFO - Comparing column: str_001_s
2025-03-27 20:08:24,920 - pre-ovr-analysis - INFO - Comparing column: str_002_ec
2025-03-27 20:08:24,940 - pre-ovr-analysis - INFO - Comparing column: str_003_ec
2025-03-27 20:08:24,955 - pre-ovr-analysis - INFO - Comparing column: str_003b_ec
2025-03-27 20:08:24,972 - pre-ovr-analysis - INFO - Comparing column: str_004_asec
2025-03-27 20:08:24,986 - pre-ovr-analysis - INFO - Comparing column: str_005_ec
2025-03-27 20:08:24,999 - pre-ovr-analysis - INFO - Comparing column: str_006_sec
2025-03-27 20:08:25,018 - pre-ovr-analysis - INFO - Comparing column: str_sfdr8_aec
2025-03-27 20:08:25,031 - pre-ovr-analysis - INFO - Comparing column: scs_001_sec
2025-03-27 20:08:25,043 - pre-ovr-analysis - INFO - Comparing column: scs_002_ec
2025-03-27 20:08:25,054 - pre-ovr-analysis - INFO - Checking for new exclusions in column: str_001_s
2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delta[target_index] = delta[target_index].astype(str)


2025-03-27 20:08:31,278 - pre-ovr-analysis - INFO - Final delta shape: (2552, 17)
2025-03-27 20:08:31,281 - pre-ovr-analysis - INFO - checking impact compared to BRS benchmarks data
2025-03-27 20:08:31,283 - pre-ovr-analysis - INFO - Comparing column: str_001_s
2025-03-27 20:08:31,285 - pre-ovr-analysis - INFO - Comparing column: str_002_ec
2025-03-27 20:08:31,287 - pre-ovr-analysis - INFO - Comparing column: str_003_ec
2025-03-27 20:08:31,289 - pre-ovr-analysis - INFO - Comparing column: str_003b_ec
2025-03-27 20:08:31,291 - pre-ovr-analysis - INFO - Comparing column: str_004_asec
2025-03-27 20:08:31,292 - pre-ovr-analysis - INFO - Comparing column: str_005_ec
2025-03-27 20:08:31,297 - pre-ovr-analysis - INFO - Comparing column: str_006_sec
2025-03-27 20:08:31,300 - pre-ovr-analysis - INFO - Comparing column: str_sfdr8_aec
2025-03-27 20:08:31,305 - pre-ovr-analysis - INFO - Comparing column: scs_001_sec
2025-03-27 20:08:31,308 - pre-ovr-analysis - INFO - Comparing column: scs_002_ec
2

In [25]:
#from utils.zombie_killer import main as zombie_killer
#logger.info("Getting zombie analysis df")
#zombie_df = zombie_killer()

In [26]:
# PREP DELTAS BEFORE SAVING
logger.info("Preparing deltas before saving")
# use crossreference to add permid to delta_brs
delta_brs = delta_brs.merge(crosreference[["aladdin_id", "permid"]], on="aladdin_id", how="left")
delta_benchmarks = delta_benchmarks.merge(crosreference[["aladdin_id", "permid"]], on="aladdin_id", how="left")
# drop isin from deltas
delta_clarity.drop(columns=["isin"], inplace=True)
delta_brs.drop(columns=["isin"], inplace=True)
delta_benchmarks.drop(columns=["isin"], inplace=True)
# add new column to delta_brs with ovr_dict value using aladdin_id
delta_brs["ovr_list"] = delta_brs["aladdin_id"].map(ovr_dict)
delta_clarity["ovr_list"] = delta_clarity["aladdin_id"].map(ovr_dict)
delta_benchmarks["ovr_list"] = delta_benchmarks["aladdin_id"].map(ovr_dict)
# let's add portfolio info to the delta_df
delta_clarity = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_clarity)
delta_brs = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_brs)
delta_benchmarks = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_benchmarks)
# let's add benchmark info to the delta_df
delta_clarity = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_clarity, "affected_benchmark_str")
delta_brs = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_brs, "affected_benchmark_str")
delta_benchmarks = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_benchmarks, "affected_benchmark_str")



2025-03-27 20:10:59,942 - pre-ovr-analysis - INFO - Preparing deltas before saving


In [36]:
# let's use filter_non_empty_lists to remove rows with empty lists in affected_portfolio_str
delta_brs = filter_non_empty_lists(delta_brs, "affected_portfolio_str")
# let's use filter_non_empty_lists to remove rows with empty lists in affected_portfolio_str
delta_benchmarks = filter_non_empty_lists(delta_benchmarks, "affected_portfolio_str")
# pass filter_rows_with_common_elements for columns exclusion_list_brs and affected_portfolio_str
delta_brs = filter_rows_with_common_elements(delta_brs, "exclusion_list_brs", "affected_portfolio_str")
delta_benchmarks = filter_rows_with_common_elements(delta_benchmarks, "exclusion_list_brs", "affected_portfolio_str")

2025-03-27 20:23:39,164 - pre-ovr-analysis - INFO - Filtering rows with common elements in columns: exclusion_list_brs and affected_portfolio_str
2025-03-27 20:23:39,173 - pre-ovr-analysis - INFO - Filtering rows with common elements in columns: exclusion_list_brs and affected_portfolio_str


In [30]:
delta_benchmarks.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2552 entries, 0 to 2551
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   aladdin_id              2552 non-null   object
 1   issuer_name             2552 non-null   object
 2   str_001_s               2552 non-null   object
 3   str_002_ec              2552 non-null   object
 4   str_003_ec              2552 non-null   object
 5   str_004_asec            2552 non-null   object
 6   str_005_ec              392 non-null    object
 7   scs_001_sec             519 non-null    object
 8   scs_002_ec              376 non-null    object
 9   str_006_sec             500 non-null    object
 10  str_sfdr8_aec           2552 non-null   object
 11  str_003b_ec             2552 non-null   object
 12  new_exclusion           2552 non-null   bool  
 13  exclusion_list_brs      2552 non-null   object
 14  new_inclusion           2552 non-null   bool  
 15  incl

In [28]:
from datetime import datetime
# define a function to save results in an Excel file
def save_excel(df_dict: dict, output_dir: Path, file_name: str) -> Path:
    """
    Writes multiple DataFrames to an Excel file with each DataFrame in a separate sheet.

    Parameters:
    - df_dict (dict): A dictionary where keys are sheet names and values are DataFrames.
    - output_dir (Path): The directory where the Excel file will be saved.
    - file_name (str): The base name for the Excel file.

    Returns:
    - Path: The full path to the saved Excel file.
    """
    # Create a date string in "YYYYMMDD" format
    date_str = datetime.now().strftime("%Y%m%d")

    # Ensure the output directory exists
    logger.info("Creating output directory: %s", output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Construct the full output file path (e.g., file_name_YYYYMMDD.xlsx)
    output_file = output_dir / f"{date_str}_{file_name}.xlsx"

    # Write each DataFrame to its own sheet with index set to False
    with pd.ExcelWriter(output_file) as writer:
        logger.info("Writing DataFrames to Excel file: %s", output_file)
        for sheet_name, df in df_dict.items():
            logger.info("Writing sheet: %s", sheet_name)
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    logger.info("Results saved to Excel file: %s", output_file)

In [37]:
# MAKE SURE ISSUER HAS ALL THE PORTFOLIOS IN THEIR LIST NOT JUST THE LAST ONE

# create dict of df and df name
dfs_dict = {
    #"zombie_analysis": zombie_df,
    "delta_carteras": delta_brs,
    "delta_benchmarks": delta_benchmarks,
    "delta_clarity": delta_clarity,
    "new_issuers_clarity": new_issuers_clarity,
    "out_issuer_clarity": out_issuer_clarity,
}
# save to excel
save_excel(dfs_dict, OUTPUT_DIR, file_name="pre_ovr_analysis_betathree")


2025-03-27 20:23:52,380 - pre-ovr-analysis - INFO - Creating output directory: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\pre-ovr-analysis
2025-03-27 20:23:52,386 - pre-ovr-analysis - INFO - Writing DataFrames to Excel file: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\pre-ovr-analysis\20250327_pre_ovr_analysis_betathree.xlsx
2025-03-27 20:23:52,390 - pre-ovr-analysis - INFO - Writing sheet: delta_carteras
2025-03-27 20:23:52,433 - pre-ovr-analysis - INFO - Writing sheet: delta_benchmarks
2025-03-27 20:23:52,494 - pre-ovr-analysis - INFO - Writing sheet: delta_clarity
2025-03-27 20:23:54,480 - pre-ovr-analysis - INFO - Writing sheet: new_issuers_clarity
2025-03-27 20:23:54,522 - pre-ovr-analysis - INFO - Writing sheet: out_issuer_clarity
2025-03-27 20:23:54,942 - pre-ovr-analysis - INFO - Results saved to Excel file: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\pre-ovr-analysis\20250327