In [1]:
import sys
import warnings
from pathlib import Path
from typing import List, Tuple
from itertools import chain
from collections import defaultdict

# Get the parent directory of current notebook dir, which is the repo root
repo_root = Path.cwd().parent
sys.path.insert(0, str(repo_root))

# Verify the path is correct:
print(f"Added to sys.path: {repo_root}")

import numpy as np
import pandas as pd

from scripts.utils.dataloaders import (
    load_clarity_data,
    load_aladdin_data,
    load_crossreference,
    load_portfolios,
    load_overrides,
    save_excel
)
from scripts.utils.zombie_killer import main as zombie_killer

# Import the centralized configuration
from scripts.utils.config import get_config

Added to sys.path: c:\Users\n740789\Documents\clarity_data_quality_controls
2025-04-25 20:59:00,470 - utils.get_date - INFO - Date format is valid. Date set to 202505.


In [4]:
# Get the common configuration for the Pre-OVR-Analysis script.
config = get_config("pre-ovr-analysis", interactive=False, auto_date=False, fixed_date="202505")
logger = config["logger"]
DATE = config["DATE"]
print(f"{DATE}, {type(DATE)}")
YEAR = config["YEAR"]
DATE_PREV = config["DATE_PREV"]
REPO_DIR = config["REPO_DIR"]
DATAFEED_DIR = config["DATAFEED_DIR"]
SRI_DATA_DIR = config["SRI_DATA_DIR"]
paths = config["paths"]

# Use the paths from config
df_1_path = paths["PRE_DF_WOVR_PATH"]
df_2_path = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]
COMMITTEE_PATH = paths["COMMITTEE_PATH"]

# Define the output directory and file based on the configuration.
#OUTPUT_DIR = config["OUTPUT_DIR"]
#OUTPUT_FILE = OUTPUT_DIR / f"{DATE}_pre_ovr_analysis.xlsx"

202505, <class 'str'>


In [3]:
# Ignore workbook warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [None]:
# check that the date constants are set correctly
print(f"{DATE} and {YEAR} and {DATE_PREV}.")

In [5]:
# DEFINE CONSTANTS & TEST COLUMNS & DICTS
# let's define necessary column lists

id_name_cols = ["permid", "isin", "issuer_name"]
id_name_issuers_cols = ["aladdin_id", "permid", "issuer_name"]
clarity_test_col = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "art_8_basicos",
    "str_006_sec",
    "cs_001_sec",
    "cs_002_ec",
]
columns_to_read = id_name_cols + clarity_test_col
delta_test_cols = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_sfdr8_aec",
    "scs_001_sec",
    "scs_002_ec",
]

brs_test_cols = ["aladdin_id"] + delta_test_cols
rename_dict = {
    "cs_001_sec": "scs_001_sec",
    "cs_002_ec": "scs_002_ec",
    "art_8_basicos": "str_sfdr8_aec",
}

Define Functions

In [6]:
def prepare_dataframes(
    base_df: pd.DataFrame, new_df: pd.DataFrame, target_index:str = "permid"
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Prepare DataFrames by setting the index and filtering for common indexes.
    Logs info about common, new, and missing indexes.
    """
    # Set index to 'permid' if it exists, otherwise assume it's already the index.
    logger.info(f"Setting index to {target_index}.")
    if target_index in base_df.columns:
        base_df = base_df.set_index(target_index)
    else:
        logger.warning("df1 does not contain a 'permid' column. Using current index.")

    if target_index in new_df.columns:
        new_df = new_df.set_index(target_index)
    else:
        logger.warning("df2 does not contain a 'permid' column. Using current index.")

    common_indexes = base_df.index.intersection(new_df.index)
    new_indexes = new_df.index.difference(base_df.index)
    missing_indexes = base_df.index.difference(new_df.index)

    logger.info(f"Number of common indexes: {len(common_indexes)}")

    return (
        base_df.loc[common_indexes],
        new_df.loc[common_indexes],
        new_df.loc[new_indexes],
        base_df.loc[missing_indexes],
    )

In [7]:
def compare_dataframes(
    df1: pd.DataFrame, df2: pd.DataFrame, test_col: List[str] = delta_test_cols
) -> pd.DataFrame:
    """Compare DataFrames and create a delta DataFrame."""
    delta = df2.copy()
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Comparing column: {col}")
            # Create a mask for differences between the two DataFrames
            diff_mask = df1[col] != df2[col]
            # Update the delta DataFrame with the differences
            delta.loc[~diff_mask, col] = np.nan
    return delta

def get_exclusion_list(
    row: pd.Series,
    df1: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
) -> List[str]:
    """Get list of columns that changed to EXCLUDED."""
    return [
        col
        for col in test_col
        if row[col] == "EXCLUDED" and df1.loc[row.name, col] != "EXCLUDED"
    ]

def get_inclusion_list(
    row: pd.Series,
    df1: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
) -> List[str]:
    """Get list of columns that changed from EXCLUDED to any other value."""
    return [
        col
        for col in test_col
        if row[col] != "EXCLUDED" and df1.loc[row.name, col] == "EXCLUDED"
    ]

def check_new_exclusions(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    suffix_level: str = "",
) -> pd.DataFrame:
    """Check for new exclusions and update the delta DataFrame."""
    delta["new_exclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new exclusions in column: {col}")
            mask = (df1[col] != "EXCLUDED") & (df2[col] == "EXCLUDED")
            delta.loc[mask, "new_exclusion"] = True
            logger.info(f"Number of new exclusions in {col}: {mask.sum()}")
    delta[f"exclusion_list{suffix_level}"] = delta.apply(
        lambda row: get_exclusion_list(row, df1, test_col), axis=1
    )
    return delta

def check_new_inclusions(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    suffix_level: str = "",
) -> pd.DataFrame:
    """Check for new inclusions and update the delta DataFrame."""
    delta["new_inclusion"] = False
    for col in test_col:
        if col in df1.columns and col in df2.columns:
            logger.info(f"Checking for new inclusions in column: {col}")
            mask = (df1[col] == "EXCLUDED") & (df2[col] != "EXCLUDED")
            delta.loc[mask, "new_inclusion"] = True
            logger.info(f"Number of new inclusions in {col}: {mask.sum()}")
    delta[f"inclusion_list{suffix_level}"] = delta.apply(
        lambda row: get_inclusion_list(row, df1, test_col), axis=1
    )
    return delta

def finalize_delta(
    delta: pd.DataFrame,
    test_col: List[str] = delta_test_cols,
    target_index: str = "permid",
) -> pd.DataFrame:
    """Finalize the delta DataFrame by removing unchanged rows and resetting the index."""
    delta = delta.dropna(subset=test_col, how="all")
    delta.reset_index(inplace=True)
    delta[target_index] = delta[target_index].astype(str)
    logger.info(f"Final delta shape: {delta.shape}")
    return delta

def create_override_dict(
    df: pd.DataFrame = None,
    id_col: str = "aladdin_id",
    str_col: str = "ovr_target",
    ovr_col: str = "ovr_value",
):
    """
    Converts the overrides DataFrame to a dictionary.
    Args:
        df (pd.DataFrame): DataFrame containing the overrides.
        id_col (str): Column name for the identifier.
        str_col (str): Column name for the strategy.
        ovr_col (str): Column name for the override value.
    Returns:
        dict: Dictionary of overrides.
    """
    # 1. Groupd the df by issuer_id
    grouped = df.groupby(id_col)

    # 2. Initialise the dictionary
    ovr_dict = {}

    # 3. Iterate over each group (issuer id and its corresponding rows)
    for id, group_data in grouped:
        # 3.1. for each issuer id create a dict pairing the strategy and the override value
        ovr_result = dict(zip(group_data[str_col], group_data[ovr_col]))
        # 3.2. add the dict to the main dict
        ovr_dict[id] = ovr_result

    return ovr_dict

def add_portfolio_benchmark_info_to_df(
    portfolio_dict, delta_df, column_name="affected_portfolio_str"
):

    # Initialize a defaultdict to accumulate (portfolio_id, strategy_name) pairs
    aladdin_to_info = defaultdict(list)

    for portfolio_id, data in portfolio_dict.items():
        strategy = data.get("strategy_name")
        for a_id in data.get("aladdin_id", []):
            aladdin_to_info[a_id].append((portfolio_id, strategy))

    # Map each aladdin_id in delta_df to a list of accumulated portfolio info
    delta_df[column_name] = delta_df["aladdin_id"].apply(
        lambda x: list(chain.from_iterable(aladdin_to_info.get(x, [])))
    )

    return delta_df

def get_issuer_level_df(df: pd.DataFrame, idx_name: str) -> pd.DataFrame:
    """
    Removes duplicates based on idx_name, and drops rows where idx_name column contains
    NaN, None, or strings like "nan", "NaN", "none", or empty strings.

    Args:
        df (pd.DataFrame): Input dataframe.
        idx_name (str): Column name used for duplicate removal and NaN filtering.

    Returns:
        pd.DataFrame: Cleaned dataframe.
    """
    # Drop duplicates
    df_cleaned = df.drop_duplicates(subset=[idx_name])

    # Drop rows where idx_name is NaN/None or has invalid strings
    valid_rows = df_cleaned[idx_name].notnull() & (
        ~df_cleaned[idx_name]
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(["nan", "none", ""])
    )

    return df_cleaned[valid_rows]

def filter_non_empty_lists(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Returns a DataFrame filtered so that rows where the specified column contains
    an empty list are removed. Keeps rows where the column has a list with at least one element.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame
    - column (str): The name of the column to check
    
    Returns:
    - pd.DataFrame: Filtered DataFrame
    """
    return df[df[column].apply(lambda x: isinstance(x, list) and len(x) > 0)]

def filter_rows_with_common_elements(df, col1, col2):
    """
    Return rows of df where the lists in col1 and col2 have at least one common element.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        col1 (str): The name of the first column containing lists.
        col2 (str): The name of the second column containing lists.

    Returns:
        pd.DataFrame: A DataFrame filtered to include only rows where col1 and col2 have a common element.
    """
    logger.info(f"Filtering rows with common elements in columns: {col1} and {col2}")
    mask = df.apply(lambda row: bool(set(row[col1]).intersection(row[col2])), axis=1)
    return df[mask].copy()

def reorder_columns(df:pd.DataFrame, keep_first:list[str], exclude:list[str]=None):
    if exclude is None:
        exclude = set()
    return df[
        keep_first
        + [col for col in df.columns if col not in keep_first and col not in exclude]
    ]

def remove_matching_rows(df):
    # Identify columns with specific suffixes
    cols_old = [col for col in df.columns if col.endswith('_old')]
    cols_brs = [col for col in df.columns if col.endswith('_brs')]
    cols_ovr = [col for col in df.columns if col.endswith('_ovr')]

    # Assuming there's only one set of each column type based on your example
    if len(cols_old) != 1 or len(cols_brs) != 1 or len(cols_ovr) != 1:
        raise ValueError("Expected exactly one column each for '_old', '_brs', '_ovr'")

    col_old, col_brs, col_ovr = cols_old[0], cols_brs[0], cols_ovr[0]

    # Filter rows where all three column values match
    df_filtered = df[
        ~(df[col_old] == df[col_brs]) | ~(df[col_old] == df[col_ovr])
    ].copy()

    return df_filtered

def clean_inclusion_list(df):
    """
    Processes each row of df:
    1. For each element in 'inclusion_list_brs', if the element is a key in 'ovr_list' and
       its value is 'EXCLUDED', remove the element.
    2. Rows where 'inclusion_list_brs' is empty (or becomes empty after filtering) are dropped.
    """
    def process_row(row):
        inc_list = row.get('inclusion_list_brs', [])
        ovr_list = row.get('ovr_list', {})

        # If inc_list is a NumPy array, convert it to a list.
        if isinstance(inc_list, np.ndarray):
            inc_list = inc_list.tolist()

        # If inc_list is not a list (or array), then treat it as empty.
        if not isinstance(inc_list, list):
            return []

        # For ovr_list: if it's not a dict, then treat it as an empty dict.
        if not isinstance(ovr_list, dict):
            ovr_list = {}

        # Filter out items that are in ovr_list and marked as 'EXCLUDED'
        return [item for item in inc_list if item not in ovr_list or ovr_list[item] != 'EXCLUDED']

    # Apply the row-wise processing.
    df.loc[:,'inclusion_list_brs'] = df.apply(process_row, axis=1)

    # Drop rows where 'inclusion_list_brs' is empty.
    df = df[df['inclusion_list_brs'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    return df

def pair_elements(input_list):
    """
    Pairs consecutive elements in a list into tuples.
    Parameters: -> input_list : list
    -----------
    Returns: -> list_of_tuples : list. A list where consecutive elements are paired as tuples.
    --------
    Raises TypeError: If the input is not a list and ValueError: If the list does not have an even number of elements.
    """
    if not isinstance(input_list, list):
        raise TypeError("Expected a list as input.")
    if len(input_list) % 2 != 0:
        raise ValueError("The list must have an even number of elements.")

    return [(input_list[i], input_list[i + 1]) for i in range(0, len(input_list), 2)]

def clean_portfolio_and_exclusion_list(row):
    """
    First pairs elements in 'affected_portfolio_str' and filters them based
    on 'exclusion_list_brs'. Then cleans 'exclusion_list_brs' based on the
    filtered results.
    """
    raw_list = row["affected_portfolio_str"]
    exclusion_list = row["exclusion_list_brs"]

    # Pair elements first
    paired = pair_elements(raw_list)

    # Filter tuples based on exclusion list
    cleaned_paired = [tup for tup in paired if tup[1] in exclusion_list]

    # Update affected_portfolio_str
    row["affected_portfolio_str"] = cleaned_paired

    # Extract strategies from the cleaned paired tuples
    affected_strategies = {strategy for _, strategy in cleaned_paired}

    # Update exclusion_list_brs
    row["exclusion_list_brs"] = [
        strategy for strategy in exclusion_list if strategy in affected_strategies
    ]

    return row




In [8]:
# 1.    LOAD DATA
# 1.1.  clarity data
df_1 = load_clarity_data(df_1_path, columns_to_read)
df_2 = load_clarity_data(df_2_path, columns_to_read)

2025-04-25 21:00:32,497 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202504_df_issuer_level_with_ovr.csv
2025-04-25 21:00:32,991 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\202504_df_issuer_level_with_ovr.csv
2025-04-25 21:00:32,992 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv
2025-04-25 21:00:33,483 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250501_Equities_feed_IssuerLevel_sinOVR.csv


In [9]:
# let's rename columns in df_1 and df_2 using the rename_dict
df_1.rename(columns=rename_dict, inplace=True)
df_2.rename(columns=rename_dict, inplace=True)

In [10]:
logger.info(
    f"previous clarity df's  rows: {df_1.shape[0]}, new clarity df's rows: {df_2.shape[0]}"
)

2025-04-25 21:00:35,149 - pre-ovr-analysis - INFO - previous clarity df's  rows: 69328, new clarity df's rows: 73401


In [11]:
# 1.2.  aladdin /brs data / perimetros
brs_carteras = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras")    
brs_benchmarks = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks")
crosreference = load_crossreference(CROSSREFERENCE_PATH)

2025-04-25 21:00:36,618 - scripts.utils.dataloaders - INFO - Loading portfolio_carteras data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202505_strategies_snt world_portf_bmks.xlsx
2025-04-25 21:00:48,190 - scripts.utils.dataloaders - INFO - Cleaning columns and converting data types for portfolio_carteras
2025-04-25 21:00:48,191 - scripts.utils.dataloaders - INFO - Converting column 'aladdin_id' to string.
2025-04-25 21:00:48,195 - scripts.utils.dataloaders - INFO - Converting column 'portfolio_id' to string.
2025-04-25 21:00:48,197 - scripts.utils.dataloaders - INFO - Successfully loaded Aladdin data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_portf_str\202505_strategies_snt world_portf_bmks.xlsx
2025-04-25 21:00:48,198 - scripts.utils.dataloaders - INFO - Loading portfolio_benchmarks data from C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\bmk_p

In [12]:
# add aladdin_id to df_1 and df_2
logger.info("Adding aladdin_id to clarity dfs")
df_1 = df_1.merge(crosreference[["permid", "aladdin_id"]], on="permid", how="left")
df_2 = df_2.merge(crosreference[["permid", "aladdin_id"]], on="permid", how="left")

2025-04-25 21:01:28,153 - pre-ovr-analysis - INFO - Adding aladdin_id to clarity dfs


In [13]:
# get BRS data at issuer level for becnhmarks without empty aladdin_id
brs_carteras_issuerlevel = get_issuer_level_df(brs_carteras, "aladdin_id")

In [14]:
# get BRS data at issuer level for becnhmarks without empty aladdin_id
brs_benchmarks_issuerlevel = get_issuer_level_df(brs_benchmarks, "aladdin_id")

In [22]:

# print columns datatype
for name, df in zip(["datafeed", "portfolio_df", "bmk_df"],[df_2, brs_carteras_issuerlevel, brs_benchmarks_issuerlevel]):
    print(f"THIS IS A DATAFRAME NAME {name} and below its columns")
    for c in df.columns:
        print(f"'{c}': {df[c].dtype},")
    print("\n\n\n")

THIS IS A DATAFRAME NAME datafeed and below its columns
'isin': object,
'issuer_name': object,
'str_001_s': object,
'str_002_ec': object,
'str_003_ec': object,
'str_004_asec': object,
'str_005_ec': object,
'scs_001_sec': object,
'scs_002_ec': object,
'str_006_sec': object,
'str_sfdr8_aec': object,
'permid': object,
'str_003b_ec': object,
'aladdin_id': object,




THIS IS A DATAFRAME NAME portfolio_df and below its columns
'issuer_name': object,
'aladdin_id': object,
'security_description': object,
'portfolio_full_name': object,
'portfolio_id': object,
'str_001_s': object,
'str_002_ec': object,
'str_003b_ec': object,
'str_003_ec': object,
'str_004_asec': object,
'str_004_asec_sust._bonds': object,
'str_005_ec': object,
'str_006_sec': object,
'str_007_sect': object,
'str_sfdr8_aec': object,
'scs_001_sec': object,
'scs_002_ec': object,
'scs_003_sec': object,




THIS IS A DATAFRAME NAME bmk_df and below its columns
'issuer_name': object,
'aladdin_id': object,
'portfolio_full_name': object

In [None]:
# 1.3 sri/ESG Team data
overrides = load_overrides(OVR_PATH)
# rename column brs_id to aladdin_id
overrides.rename(columns={"brs_id": "aladdin_id"}, inplace=True)
# rename value column "ovr_target" using rename_dict if value is string
overrides["ovr_target"] = overrides["ovr_target"].apply(
    lambda x: pd.NA if isinstance(x, str) and x.strip().lower() in ["na", "nan"]
    else rename_dict[x] if isinstance(x, str) and x in rename_dict
    else x
)

In [16]:
ovr_dict = create_override_dict(overrides)

In [None]:
# Load portfolios & benchmarks dicts
(
    portfolio_dict,
    benchmark_dict,
) = load_portfolios(path_pb=BMK_PORTF_STR_PATH, path_committe=COMMITTEE_PATH)

START PRE-OVR ANALISIS

In [23]:
# 2. PREP DATA FOR ANALYSIS
# make sure that the values of of the columns delta_test_cols are strings and all uppercase and strip
for col in delta_test_cols:
    df_1[col] = df_1[col].str.upper().str.strip()
    df_2[col] = df_2[col].str.upper().str.strip()
    brs_carteras_issuerlevel[col] = brs_carteras_issuerlevel[col].str.upper().str.strip()
    brs_benchmarks_issuerlevel[col] = brs_benchmarks_issuerlevel[col].str.upper().str.strip()

In [24]:
# PREPARE DATA CLARITY LEVEL
(
    df_1, 
    df_2,
    new_issuers_clarity,
    out_issuer_clarity,
) = prepare_dataframes(df_1, df_2)

# log size of new and missing issuers
logger.info(f"Number of new issuers: {new_issuers_clarity.shape[0]}")
logger.info(f"Number of missing issuers: {out_issuer_clarity.shape[0]}")

2025-04-26 18:24:23,687 - pre-ovr-analysis - INFO - Setting index to permid.
2025-04-26 18:24:24,138 - pre-ovr-analysis - INFO - Number of common indexes: 68137
2025-04-26 18:24:24,543 - pre-ovr-analysis - INFO - Number of new issuers: 5265
2025-04-26 18:24:24,544 - pre-ovr-analysis - INFO - Number of missing issuers: 1191


In [None]:
# reset index for new_issuers_clarity and out_issuer_clarity
new_issuers_clarity.reset_index(inplace=True)
out_issuer_clarity.reset_index(inplace=True)

# drop isin from out_issuer_clarity and new_issuers_clarity
out_issuer_clarity.drop(columns=["isin"], inplace=True)
new_issuers_clarity.drop(columns=["isin"], inplace=True)

# remember to remove empyt empyt aladin id
new_issuers_clarity.head()

In [25]:
# PREPARE DATA CARTERAS BRS LEVEL
(
    brs_df, 
    clarity_df,
    in_clarity_but_not_in_brs,
    in_brs_but_not_in_clarity,
) = prepare_dataframes(brs_carteras_issuerlevel, df_2, target_index="aladdin_id")

# log size of new and missing issuers
logger.info(f"Number issuers in clarity but not Aladdin: {in_clarity_but_not_in_brs.shape[0]}")
logger.info(f"Number issuers in Aladdin but not Clarity: {in_brs_but_not_in_clarity.shape[0]}")

2025-04-26 18:24:29,891 - pre-ovr-analysis - INFO - Setting index to aladdin_id.
2025-04-26 18:24:30,217 - pre-ovr-analysis - INFO - Number of common indexes: 2128
2025-04-26 18:24:30,370 - pre-ovr-analysis - INFO - Number issuers in clarity but not Aladdin: 66025
2025-04-26 18:24:30,371 - pre-ovr-analysis - INFO - Number issuers in Aladdin but not Clarity: 1105


In [26]:
# PREPARE DATA BENCHMARK BRS LEVEL
(
    brs_df_benchmarks, 
    clarity_df_benchmarks,
    in_clarity_but_not_in_brs_benchmarks,
    in_brs_benchmark_but_not_in_clarity,
) = prepare_dataframes(brs_benchmarks_issuerlevel, df_2, target_index="aladdin_id")

# log size of new and missing issuers
logger.info(f"Number issuers in clarity but not benchmarks: {in_clarity_but_not_in_brs_benchmarks.shape[0]}")
logger.info(f"Number issuers in benchmarks but not Clarity: {in_brs_benchmark_but_not_in_clarity.shape[0]}")

2025-04-26 18:24:34,622 - pre-ovr-analysis - INFO - Setting index to aladdin_id.
2025-04-26 18:24:34,890 - pre-ovr-analysis - INFO - Number of common indexes: 7595
2025-04-26 18:24:35,017 - pre-ovr-analysis - INFO - Number issuers in clarity but not benchmarks: 60558
2025-04-26 18:24:35,018 - pre-ovr-analysis - INFO - Number issuers in benchmarks but not Clarity: 5731


In [29]:
# print columns datatype
for name, df in zip(["esg_fact_issuer_clarity_equities", "esg_fact_issuer_brs_ptf", "esg_fact_issuer_brs_bmk"],[df_2, brs_df, brs_df_benchmarks]):
    print(f"THIS IS A DATAFRAME NAME {name} and below its columns")
    index_name = df.index.name
    index_type = df.index.dtype
    print(f"'{index_name}': {index_type},")
    for c in df.columns:
        print(f"'{c}': {df[c].dtype},")
    print("\n\n\n")

THIS IS A DATAFRAME NAME esg_fact_issuer_clarity_equities and below its columns
'permid': object,
'isin': object,
'issuer_name': object,
'str_001_s': object,
'str_002_ec': object,
'str_003_ec': object,
'str_004_asec': object,
'str_005_ec': object,
'scs_001_sec': object,
'scs_002_ec': object,
'str_006_sec': object,
'str_sfdr8_aec': object,
'str_003b_ec': object,
'aladdin_id': object,




THIS IS A DATAFRAME NAME esg_fact_issuer_brs_ptf and below its columns
'aladdin_id': object,
'issuer_name': object,
'security_description': object,
'portfolio_full_name': object,
'portfolio_id': object,
'str_001_s': object,
'str_002_ec': object,
'str_003b_ec': object,
'str_003_ec': object,
'str_004_asec': object,
'str_004_asec_sust._bonds': object,
'str_005_ec': object,
'str_006_sec': object,
'str_007_sect': object,
'str_sfdr8_aec': object,
'scs_001_sec': object,
'scs_002_ec': object,
'scs_003_sec': object,




THIS IS A DATAFRAME NAME esg_fact_issuer_brs_bmk and below its columns
'aladdin_id': object,


In [28]:
# get index name and type for these df: [df_2, brs_df, brs_df_benchmarks
for df in [df_2, brs_df, brs_df_benchmarks]:
    index_name = df.index.name
    index_type = df.index.dtype
    print(f"Index name: {index_name}, Index type: {index_type}")

Index name: permid, Index type: object
Index name: aladdin_id, Index type: object
Index name: aladdin_id, Index type: object


In [30]:
# COMPARE DATA
logger.info("comparing clarity dataframes")
delta_clarity = compare_dataframes(df_1, df_2)
delta_clarity = check_new_exclusions(df_1, df_2, delta_clarity)
delta_clarity = check_new_inclusions(df_1, df_2, delta_clarity)
delta_clarity = finalize_delta(delta_clarity)
logger.info("checking impact compared to BRS portfolio data")
delta_brs = compare_dataframes(brs_df, clarity_df)
delta_brs = check_new_exclusions(brs_df, clarity_df, delta_brs, suffix_level="_brs")
delta_brs = check_new_inclusions(brs_df, clarity_df, delta_brs, suffix_level="_brs")
delta_brs = finalize_delta(delta_brs, target_index="aladdin_id")
logger.info("checking impact compared to BRS benchmarks data")
delta_benchmarks = compare_dataframes(brs_df_benchmarks, clarity_df_benchmarks)
delta_benchmarks = check_new_exclusions(brs_df_benchmarks, clarity_df_benchmarks, delta_benchmarks, suffix_level="_brs")
delta_benchmarks = check_new_inclusions(brs_df_benchmarks, clarity_df_benchmarks, delta_benchmarks, suffix_level="_brs")
delta_benchmarks = finalize_delta(delta_benchmarks, target_index="aladdin_id")


2025-04-26 18:59:47,836 - pre-ovr-analysis - INFO - comparing clarity dataframes
2025-04-26 18:59:47,987 - pre-ovr-analysis - INFO - Comparing column: str_001_s
2025-04-26 18:59:48,035 - pre-ovr-analysis - INFO - Comparing column: str_002_ec
2025-04-26 18:59:48,057 - pre-ovr-analysis - INFO - Comparing column: str_003_ec
2025-04-26 18:59:48,087 - pre-ovr-analysis - INFO - Comparing column: str_003b_ec
2025-04-26 18:59:48,118 - pre-ovr-analysis - INFO - Comparing column: str_004_asec
2025-04-26 18:59:48,154 - pre-ovr-analysis - INFO - Comparing column: str_005_ec
2025-04-26 18:59:48,186 - pre-ovr-analysis - INFO - Comparing column: str_006_sec
2025-04-26 18:59:48,220 - pre-ovr-analysis - INFO - Comparing column: str_sfdr8_aec
2025-04-26 18:59:48,243 - pre-ovr-analysis - INFO - Comparing column: scs_001_sec
2025-04-26 18:59:48,286 - pre-ovr-analysis - INFO - Comparing column: scs_002_ec
2025-04-26 18:59:48,322 - pre-ovr-analysis - INFO - Checking for new exclusions in column: str_001_s
2

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
#from utils.zombie_killer import main as zombie_killer
#logger.info("Getting zombie analysis df")
zombie_df = zombie_killer()

In [None]:
# PREP DELTAS BEFORE SAVING
logger.info("Preparing deltas before saving")
# use crossreference to add permid to delta_brs
delta_brs = delta_brs.merge(crosreference[["aladdin_id", "permid"]], on="aladdin_id", how="left")
delta_benchmarks = delta_benchmarks.merge(crosreference[["aladdin_id", "permid"]], on="aladdin_id", how="left")
# drop isin from deltas
delta_clarity.drop(columns=["isin"], inplace=True)
delta_brs.drop(columns=["isin"], inplace=True)
delta_benchmarks.drop(columns=["isin"], inplace=True)
# add new column to delta_brs with ovr_dict value using aladdin_id
delta_brs["ovr_list"] = delta_brs["aladdin_id"].map(ovr_dict)
delta_clarity["ovr_list"] = delta_clarity["aladdin_id"].map(ovr_dict)
delta_benchmarks["ovr_list"] = delta_benchmarks["aladdin_id"].map(ovr_dict)
# let's add portfolio info to the delta_df
delta_clarity = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_clarity)
delta_brs = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_brs)
delta_benchmarks = add_portfolio_benchmark_info_to_df(portfolio_dict, delta_benchmarks)
# let's add benchmark info to the delta_df
delta_clarity = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_clarity, "affected_benchmark_str")
delta_brs = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_brs, "affected_benchmark_str")
delta_benchmarks = add_portfolio_benchmark_info_to_df(benchmark_dict, delta_benchmarks, "affected_benchmark_str")



In [None]:
# let's use filter_non_empty_lists to remove rows with empty lists in affected_portfolio_str
delta_brs = filter_non_empty_lists(delta_brs, "affected_portfolio_str")
# let's use filter_non_empty_lists to remove rows with empty lists in affected_portfolio_str
delta_benchmarks = filter_non_empty_lists(delta_benchmarks, "affected_portfolio_str")

# ADD TEST FOR INCLUSIONS
dlt_inc_brs = delta_brs.copy()
dlt_inc_benchmarks = delta_benchmarks.copy()

# pass filter_rows_with_common_elements for columns exclusion_list_brs and affected_portfolio_str
delta_brs = filter_rows_with_common_elements(delta_brs, "exclusion_list_brs", "affected_portfolio_str")
delta_benchmarks = filter_rows_with_common_elements(delta_benchmarks, "exclusion_list_brs", "affected_portfolio_str")

In [27]:
# get rows from delta_brs where len of the list of in the column affected_benchmark_str is bigger than zero
#delta_brs[delta_brs["affected_benchmark_str"].apply(lambda x: len(x) > 0)] 

In [28]:
# let's reset df1 index to permid
df_1.reset_index(inplace=True)
df_1["permid"] = df_1["permid"].astype(str)

In [29]:
# 6. GET STRATEGIES DFS
str_dfs_dict = {}

# Iterate over strategies to build DataFrames
for strategy in delta_test_cols:
    rows = []

    for _, row in delta_brs.iterrows():
        if strategy in row["exclusion_list_brs"]:
            rows.append({
                "aladdin_id": row["aladdin_id"],
                "permid": row["permid"],
                "issuer_name": row["issuer_name"],
                strategy: row[strategy],
                "affected_portfolio_str": row["affected_portfolio_str"]
            })

    str_dfs_dict[strategy] = pd.DataFrame(rows)

# Prepare lookups for efficient mapping
permid_to_df1 = df_1.set_index("permid")
aladdin_to_brs = brs_carteras_issuerlevel.set_index("aladdin_id")

for strategy_name, df in str_dfs_dict.items():
    # Initialize additional columns
    df[f"{strategy_name}_old"] = None
    df[f"{strategy_name}_brs"] = None
    df[f"{strategy_name}_ovr"] = None

    for i, row in df.iterrows():
        permid = row["permid"]
        aladdin_id = row["aladdin_id"]

        # Lookup from df_1
        if permid in permid_to_df1.index:
            df.at[i, f"{strategy_name}_old"] = permid_to_df1.at[permid, strategy_name]

        # Lookup from brs_carteras_issuerlevel
        if aladdin_id in aladdin_to_brs.index:
            df.at[i, f"{strategy_name}_brs"] = aladdin_to_brs.at[aladdin_id, strategy_name]

        # Lookup from overrides
        match = overrides.loc[
            (overrides["permid"] == permid) & (overrides["ovr_target"] == strategy_name),
            "ovr_value"
        ]
        if not match.empty:
            df.at[i, f"{strategy_name}_ovr"] = match.values[0]

    # Move "affected_portfolio_str" to the end
    cols = [col for col in df.columns if col != "affected_portfolio_str"] + ["affected_portfolio_str"]
    df = df[cols]
    str_dfs_dict[strategy_name] = df

# Reset index and ensure permid is a string for df_1
df_1.reset_index(drop=True, inplace=True)
df_1["permid"] = df_1["permid"].astype(str)


In [30]:
# set id_name_issuers_cols first and exclude delta_test_cols
delta_brs = reorder_columns(delta_brs, id_name_issuers_cols, delta_test_cols)
delta_clarity = reorder_columns(delta_clarity, id_name_issuers_cols, delta_test_cols)
delta_benchmarks = reorder_columns(delta_benchmarks, id_name_issuers_cols, delta_test_cols)
# set id_name_issuers_cols first
new_issuers_clarity = reorder_columns(new_issuers_clarity, id_name_issuers_cols, id_name_cols)
out_issuer_clarity = reorder_columns(out_issuer_clarity, id_name_issuers_cols, id_name_cols)


In [31]:
# ADD TEST FOR INCLUSIONS
dlt_inc_brs = reorder_columns(dlt_inc_brs, id_name_issuers_cols, delta_test_cols)
dlt_inc_benchmarks = reorder_columns(dlt_inc_benchmarks, id_name_issuers_cols, delta_test_cols)


In [32]:
dlt_inc_brs = clean_inclusion_list(dlt_inc_brs)
dlt_inc_benchmarks = clean_inclusion_list(dlt_inc_benchmarks)

In [33]:
for df_name, df in str_dfs_dict.items():
    str_dfs_dict[df_name] = remove_matching_rows(df)

In [34]:
from datetime import datetime
# define a function to save results in an Excel file
def save_excel(df_dict: dict, output_dir: Path, file_name: str) -> Path:
    """
    Writes multiple DataFrames to an Excel file with each DataFrame in a separate sheet.

    Parameters:
    - df_dict (dict): A dictionary where keys are sheet names and values are DataFrames.
    - output_dir (Path): The directory where the Excel file will be saved.
    - file_name (str): The base name for the Excel file.

    Returns:
    - Path: The full path to the saved Excel file.
    """
    # Create a date string in "YYYYMMDD" format
    date_str = datetime.now().strftime("%Y%m%d")

    # Ensure the output directory exists
    logger.info("Creating output directory: %s", output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Construct the full output file path (e.g., file_name_YYYYMMDD.xlsx)
    output_file = output_dir / f"{date_str}_{file_name}.xlsx"

    # Write each DataFrame to its own sheet with index set to False
    with pd.ExcelWriter(output_file) as writer:
        logger.info("Writing DataFrames to Excel file: %s", output_file)
        for sheet_name, df in df_dict.items():
            logger.info("Writing sheet: %s", sheet_name)
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    logger.info("Results saved to Excel file: %s", output_file)

In [35]:
# columns to remove from delta_brs, delta_benchmarks before saving
exclusion_cols_to_remove = ["new_inclusion", "inclusion_list_brs"]
# columns to remove from dlt_inc_brs and dlt_inc_benchmarks before saving
inclusion_cols_to_remove = ["new_exclusion", "exclusion_list_brs"]

# Filter and clean exclusion data
delta_brs = delta_brs.drop(columns=exclusion_cols_to_remove)
delta_brs = delta_brs[delta_brs["new_exclusion"] == True]

delta_benchmarks = delta_benchmarks.drop(columns=exclusion_cols_to_remove)
delta_benchmarks = delta_benchmarks[delta_benchmarks["new_exclusion"] == True]

# Filter and clean inclusion data
dlt_inc_brs = dlt_inc_brs.drop(columns=inclusion_cols_to_remove)
dlt_inc_brs = dlt_inc_brs[dlt_inc_brs["new_inclusion"] == True]

dlt_inc_benchmarks = dlt_inc_benchmarks.drop(columns=inclusion_cols_to_remove)
dlt_inc_benchmarks = dlt_inc_benchmarks[dlt_inc_benchmarks["new_inclusion"] == True]

# Clean delta_clarity
delta_clarity.drop(columns=["new_inclusion", "inclusion_list"], inplace=True)
delta_clarity = delta_clarity[delta_clarity["new_exclusion"] == True]


In [36]:
# Final cleanup: drop 'new_exclusion' and 'new_inclusion' if present
for df in [delta_brs, delta_benchmarks, dlt_inc_brs, dlt_inc_benchmarks, delta_clarity]:
    for col in ["new_exclusion", "new_inclusion"]:
        if col in df.columns:
            df.drop(columns=col, inplace=True)


In [37]:
# create dict of df and df name
dfs_dict = {
    "zombie_analysis": zombie_df,
    "delta_carteras": delta_brs,
    "delta_benchmarks": delta_benchmarks,
    "delta_clarity": delta_clarity,
    "incl_carteras": dlt_inc_brs,
    "incl_benchmarks": dlt_inc_benchmarks,
    "new_issuers_clarity": new_issuers_clarity,
    "out_issuer_clarity": out_issuer_clarity,
}

# add to dfs_dict the str_dfs_dict
dfs_dict.update(str_dfs_dict)


In [38]:
# save to excel
#save_excel(str_dfs_dict, OUTPUT_DIR, file_name="pre_ovr_simple_analysis")


In [None]:
# save to excel
save_excel(dfs_dict, OUTPUT_DIR, file_name="pre_ovr_analysis_beta")


In [None]:
for name, df in dfs_dict.items():
    print(f"THIS IS A DATAFRAME NAME {name} and below its columns")
    for c in df.columns:
        print(f"'{c}': {df[c].dtype},")
    print("\n\n\n")

In [None]:
delta_benchmarks.columns

In [None]:
delta_benchmarks.head()

In [None]:
for k in dfs_dict.keys():
    print(k)