In [1]:
import warnings
from datetime import datetime
from pathlib import Path
import os

import pandas as pd
from utils.set_up_log import set_up_log
from utils.dataloaders import (
    load_clarity_data,
    load_aladdin_data,
    load_crossreference
)

In [2]:
# Set up logging
# "zombie-killer" is the name handed to the project's set_up_log helper
# (NOTE(review): presumably it becomes the logger name — confirm in utils.set_up_log).
logger = set_up_log("zombie-killer")
# Ignore workbook warnings
# Only UserWarning coming from modules whose name matches "openpyxl" is silenced;
# every other warning still surfaces.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

In [3]:
# DEFINE CONSTANTS
def previous_period(yyyymm):
    """Return the month preceding ``yyyymm`` in the same "YYYYMM" format.

    Args:
        yyyymm: Period string such as "202503".

    Returns:
        The previous month as "YYYYMM"; January rolls back to December of the
        previous year (e.g. "202501" -> "202412").
    """
    year, month = int(yyyymm[:4]), int(yyyymm[4:6])
    if month == 1:
        return f"{year - 1:04d}12"
    return f"{year:04d}{month - 1:02d}"


# TEMP DATE
# Reporting period as "YYYYMM". This is the only value to edit when changing
# period: YEAR and DATE_PREV are derived from it so they cannot drift apart.
DATE = "202503"
YEAR = DATE[:4]
DATE_PREV = previous_period(DATE)

# DEFINE TEST COLUMNS
# Strategy/test columns requested from the Clarity feed in addition to the
# identifier columns.
test_col = [
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_004_asec",
    "str_005_ec",
    "cs_001_sec",
    "gp_esccp",
    "cs_003_sec",
    "cs_002_ec",
    "str_006_sec",
    "str_007_sect",
    "gp_esccp_22",
    "gp_esccp_25",
    "gp_esccp_30",
    "art_8_basicos",
    "str_003b_ec",
]

In [4]:
# DEFINE PATHS
# The two root locations default to the original workstation paths but can be
# overridden with environment variables, so the notebook can run on another
# machine without editing this cell.
REPO_DIR = Path(
    os.environ.get(
        "CLARITY_DQ_REPO_DIR",
        r"C:\Users\n740789\Documents\clarity_data_quality_controls",
    )
)
DATAFEED_DIR = Path(
    os.environ.get(
        "CLARITY_DATAFEED_DIR",
        r"C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED",
    )
)

# Previous-period issuer-level datafeed (with overrides).
df_1_path = (
    DATAFEED_DIR
    / "datafeeds_with_ovr"
    / f"{DATE_PREV}_df_issuer_level_with_ovr.csv"
)
# Current-period issuer-level equities feed (file name says "sinOVR").
df_2_path = (
    DATAFEED_DIR
    / "ficheros_tratados"
    / YEAR
    / f"{DATE}01_Equities_feed_IssuerLevel_sinOVR.csv"
)

# Aladdin inputs
ALADDIN_DATA_DIR = REPO_DIR / "excel_books" / "aladdin_data"
CROSSREFERENCE_PATH = (
    ALADDIN_DATA_DIR
    / "crossreference"
    / f"Aladdin_Clarity_Issuers_{DATE}01.csv"
)
BMK_PORTF_STR_PATH = (
    ALADDIN_DATA_DIR
    / "bmk_portf_str"
    / f"{DATE}_strategies_snt world_portf_bmks.xlsx"
)

# SRI inputs and outputs
SRI_DATA_DIR = REPO_DIR / "excel_books" / "sri_data"
# Built from SRI_DATA_DIR so the overrides workbook follows any change to it.
OVR_PATH = SRI_DATA_DIR / "overrides" / "20250318_overrides_db.xlsx"

OUTPUT_DIR = SRI_DATA_DIR / "zombie_list"

In [5]:
# LOAD DATA
# Identifier columns first, then every test column.
columns_to_read = ["permid", "isin", "issuer_name", *test_col]
# clarity data: the loader receives the feed path and the list of columns to read
df_2 = load_clarity_data(df_2_path, columns_to_read)

In [6]:
# Aladdin / BRS data / perimeters
# Both frames are read from the same workbook; the second argument selects
# which dataset to load (NOTE(review): presumably a sheet name — confirm in
# utils.dataloaders).
brs_carteras = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_carteras")
brs_benchmarks = load_aladdin_data(BMK_PORTF_STR_PATH, "portfolio_benchmarks")
# Crossreference table; the merge below uses its aladdin_id and permid columns.
crosreference = load_crossreference(CROSSREFERENCE_PATH)

In [7]:
# Show the crossreference column names to check which identifier columns are available.
crosreference.columns

Index(['aladdin_id', 'issuer_name', 'permid', 'msci', 'sust'], dtype='object')

In [8]:
# add aladdin_id from crossreference to df_2
# Drop any aladdin_id left by a previous run of this cell first: merging again
# would otherwise produce aladdin_id_x / aladdin_id_y and break the later
# cells that select "aladdin_id". On a fresh kernel the drop is a no-op.
df_2 = df_2.drop(columns="aladdin_id", errors="ignore").merge(
    crosreference[["aladdin_id", "permid"]],
    on="permid",
    how="left",
)

In [9]:
# Preview the portfolio rows that have a value in "str_004_asec_sust._bonds".
brs_carteras.dropna(subset=["str_004_asec_sust._bonds"]).head()

Unnamed: 0,issuer_name,aladdin_id,security_description,portfolio_full_name,portfolio_id,str_001_s,str_002_ec,str_003b_ec,str_003_ec,str_004_asec,...,str_007_sect,str_008_sec,str_009_tec,gp_esccp_22,gp_esccp_25,gp_esccp_30,gp_essccp,scs_001_sec,scs_002_ec,scs_003_sec
87,A2A SPA,R44750,A2A SPA RegS,Santander Poupanca Prudente FPR,PFIT0605,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
88,A2A SPA,R44750,A2A SPA RegS,HI-SZVA-5-SFonds,MDTLEMUW,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
89,A2A SPA,R44750,A2A SPA RegS,Seguro PPR+ Conservador,PFS00059,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
90,A2A SPA,R44750,A2A SPA RegS,"SANTANDER RENTA FIJA PRIVADA, FI",FIG01998,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
91,A2A SPA,R44750,A2A SPA RegS,SANTANDER SOSTENIBLE BONOS,FIG05402,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK


In [10]:
# Distribution of the non-missing values of "str_004_asec_sust._bonds".
brs_carteras["str_004_asec_sust._bonds"].dropna().value_counts()

str_004_asec_sust._bonds
Ok    824
Name: count, dtype: int64

In [11]:
# Number of distinct issuers among the rows with a "str_004_asec_sust._bonds" value.
len(brs_carteras.dropna(subset=["str_004_asec_sust._bonds"])["issuer_name"].unique())

22

In [12]:
# Distinct issuer names among the rows with a "str_004_asec_sust._bonds" value.
issuers_instr0004b = brs_carteras.loc[
    brs_carteras["str_004_asec_sust._bonds"].notna(), "issuer_name"
].unique()

In [13]:
# Display the issuer names collected in the previous cell.
issuers_instr0004b

array(['A2A SPA', 'ALLIANDER NV', 'ARGENTA SPAARBANK NV', 'BASF SE',
       'CNP ASSURANCES', 'EDP SA', 'EDP FINANCE BV',
       'ELECTRICITE DE FRANCE SA', 'ENBW ENERGIE BADEN WUERTTEMBERG AG',
       'ENBW INTERNATIONAL FINANCE BV', 'ENGIE SA', 'FORTUM OYJ',
       'NATURGY FINANCE BV', 'HERA SPA', 'RWE AG',
       'NATURGY ENERGY GROUP SA', 'NEDERLANDSE GASUNIE NV', 'NESTE OYJ',
       'RCI BANQUE SA', 'SNAM SPA', 'STATKRAFT AS', 'TEREGA SA'],
      dtype=object)

In [14]:
# List the column names present in both the BRS portfolio frame and the Clarity
# frame; these are the candidates for the BRS-vs-Clarity comparison.
brs_carteras.columns.intersection(df_2.columns)

Index(['issuer_name', 'aladdin_id', 'str_001_s', 'str_002_ec', 'str_003b_ec',
       'str_003_ec', 'str_004_asec', 'str_005_ec', 'str_006_sec',
       'str_007_sect', 'gp_esccp_22', 'gp_esccp_25', 'gp_esccp_30'],
      dtype='object')

In [15]:
# Join key ("aladdin_id") followed by the strategy columns taken from the
# Clarity frame for the comparison against BRS.
merging_cols = [
    "aladdin_id",
    "str_001_s",
    "str_002_ec",
    "str_003b_ec",
    "str_003_ec",
    "str_004_asec",
    "str_005_ec",
    "str_006_sec",
    "str_007_sect",
    "gp_esccp_22",
    "gp_esccp_25",
    "gp_esccp_30",
]

In [16]:
# Peek at the first rows of the BRS portfolio frame.
brs_carteras.head()

Unnamed: 0,issuer_name,aladdin_id,security_description,portfolio_full_name,portfolio_id,str_001_s,str_002_ec,str_003b_ec,str_003_ec,str_004_asec,...,str_007_sect,str_008_sec,str_009_tec,gp_esccp_22,gp_esccp_25,gp_esccp_30,gp_essccp,scs_001_sec,scs_002_ec,scs_003_sec
0,SNT-WORLD,,,,,,,,,,...,,,,,,,,,,
1,11 BIT STUDIOS SA,F79892,11 BIT STUDIOS SA,Santander Prestiz Technologii i Innowacji (San...,PLSFIO0319,Ok,Ok,Ok,Ok,Ok,...,OK,,,OK,Ok,Ok,OK,OK,OK,OK
2,2I RETE GAS SPA,G70839,2I RETE GAS SPA,02.018.59973438020.0,PFC00659,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
3,2I RETE GAS SPA,G70839,2I RETE GAS SPA,ALDROVI S.L.,CPE05455,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK
4,2I RETE GAS SPA,G70839,2I RETE GAS SPA,02.018.62238191020.0,PFC00665,Ok,Ok,Ok,Ok,Excluded,...,Excluded,,,OK,Ok,Ok,OK,OK,OK,OK


In [17]:
# Keep only the rows that carry an aladdin_id; copy so later edits do not touch brs_carteras.
brs_carf = brs_carteras.dropna(subset=["aladdin_id"]).copy()

In [18]:
# remove duplicate at the issuer level
#brs_carf.drop_duplicates(subset='aladdin_id', inplace=True)

In [19]:
# Left-join the Clarity values onto every BRS row by aladdin_id. Columns present
# on both sides are suffixed "_brs" (left frame) and "_df" (right frame).
zombie_df = pd.merge(
    brs_carf,
    df_2[merging_cols],
    how="left",
    on="aladdin_id",
    suffixes=("_brs", "_df"),
)

In [20]:
def mark_zombies(df, merging_cols):
    """Flag rows where the "_brs" side has a value but the "_df" side is missing.

    For every name in ``merging_cols`` that exists in ``df`` as both
    ``<name>_brs`` and ``<name>_df``, a row counts as a zombie for that name
    when ``<name>_brs`` is not NaN and ``<name>_df`` is NaN. Names without both
    suffixed columns (for example the merge key) are ignored.

    Args:
        df: Merged DataFrame holding the suffixed column pairs. It is modified
            in place: ``zombie_list`` and ``zombie_flag`` are added to it.
        merging_cols: Base column names (without suffix) to compare.

    Returns:
        The same ``df`` object, with two extra columns:
        ``zombie_list`` (list of base names that are zombies for the row) and
        ``zombie_flag`` (True when ``zombie_list`` is not empty).
    """
    # Resolve once which names can be compared instead of re-checking per row.
    compared = [
        col
        for col in merging_cols
        if f"{col}_brs" in df.columns and f"{col}_df" in df.columns
    ]

    if compared:
        # Column-wise masks replace the per-row / per-cell Python checks.
        brs_present = df[[f"{col}_brs" for col in compared]].notna().to_numpy()
        df_missing = df[[f"{col}_df" for col in compared]].isna().to_numpy()
        hits = brs_present & df_missing
        zombie_lists = [
            [col for col, hit in zip(compared, row_hits) if hit]
            for row_hits in hits
        ]
    else:
        # Nothing to compare: every row gets its own empty list.
        zombie_lists = [[] for _ in range(len(df))]

    # Build the Series on df's own index so the assignment is purely positional.
    df["zombie_list"] = pd.Series(zombie_lists, index=df.index, dtype=object)
    df["zombie_flag"] = pd.Series(
        [bool(names) for names in zombie_lists], index=df.index, dtype=bool
    )

    return df

In [21]:
# Add the zombie_list / zombie_flag columns. mark_zombies writes them onto the
# frame it receives and returns that same frame.
zombie_df = mark_zombies(zombie_df, merging_cols)


In [22]:
# Column layout for the report: identifiers first, then each compared strategy
# as a BRS/DF pair, then the zombie markers. Columns not listed here are dropped.
columns_order = [
    # General identifiers
    "issuer_name",
    "aladdin_id",
    "security_description",
    "portfolio_full_name",
    "portfolio_id",
    # str scores (BRS then DF)
    "str_001_s_brs",
    "str_001_s_df",
    "str_002_ec_brs",
    "str_002_ec_df",
    "str_003b_ec_brs",
    "str_003b_ec_df",
    "str_003_ec_brs",
    "str_003_ec_df",
    "str_004_asec_brs",
    "str_004_asec_df",
    "str_004_asec_sust._bonds",
    "str_005_ec_brs",
    "str_005_ec_df",
    "str_006_sec_brs",
    "str_006_sec_df",
    "str_007_sect_brs",
    "str_007_sect_df",
    "str_008_sec",
    "str_009_tec",
    # gp scores
    "gp_esccp_22_brs",
    "gp_esccp_22_df",
    "gp_esccp_25_brs",
    "gp_esccp_25_df",
    "gp_esccp_30_brs",
    "gp_esccp_30_df",
    "gp_essccp",
    # scs scores
    "scs_001_sec",
    "scs_002_ec",
    "scs_003_sec",
    # zombie
    "zombie_flag",
    "zombie_list",
]

zombie_df = zombie_df.loc[:, columns_order]


In [23]:
# Retain only the rows whose zombie_flag is True.
zombie_df = zombie_df.loc[zombie_df["zombie_flag"]]

In [24]:
# Preview the flagged rows (the frame was filtered on zombie_flag in the previous cell).
zombie_df.head()

Unnamed: 0,issuer_name,aladdin_id,security_description,portfolio_full_name,portfolio_id,str_001_s_brs,str_001_s_df,str_002_ec_brs,str_002_ec_df,str_003b_ec_brs,...,gp_esccp_25_brs,gp_esccp_25_df,gp_esccp_30_brs,gp_esccp_30_df,gp_essccp,scs_001_sec,scs_002_ec,scs_003_sec,zombie_flag,zombie_list
363,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,"SANTANDER RENTA FIJA FLOTANTE, FI",FIG05292,Ok,,Ok,,Ok,...,Ok,,Ok,,OK,OK,OK,OK,True,"[str_001_s, str_002_ec, str_003b_ec, str_003_e..."
364,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,HI-KAPPA-11-SFonds,MDTLEMUB,Ok,,Ok,,Ok,...,Ok,,Ok,,OK,OK,OK,OK,True,"[str_001_s, str_002_ec, str_003b_ec, str_003_e..."
365,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,HI-SZVA-5-SFonds,MDTLEMUW,Ok,,Ok,,Ok,...,Ok,,Ok,,OK,OK,OK,OK,True,"[str_001_s, str_002_ec, str_003b_ec, str_003_e..."
366,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,"SANTANDER CORTO PLAZO, FI",FIG05698,Ok,,Ok,,Ok,...,Ok,,Ok,,OK,OK,OK,OK,True,"[str_001_s, str_002_ec, str_003b_ec, str_003_e..."
367,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,"SANTANDER AHORRO 7, FP",FPG00008,Ok,,Ok,,Ok,...,Ok,,Ok,,OK,OK,OK,OK,True,"[str_001_s, str_002_ec, str_003b_ec, str_003_e..."


In [25]:
def group_by_security_description(df):
    """Summarise the zombie rows with one output row per security.

    Rows are grouped on 'security_description'. For each group the result holds:
      - 'issuer_name' and 'aladdin_id': the first value found in the group
      - 'strategy_list': every entry of the group's 'zombie_list' lists merged
        into one list, duplicates removed, first-seen order kept
      - 'portfolio_list': the unique 'portfolio_full_name' values
      - 'portfolio_id_list': the unique 'portfolio_id' values

    Returns a new DataFrame with the columns issuer_name, aladdin_id,
    security_description, strategy_list, portfolio_list, portfolio_id_list.
    """
    def _ordered_unique(series):
        """Merge the lists in `series` into one list of unique items, order preserved.

        Elements that are not lists are treated as single items.
        """
        ordered = {}
        for entry in series:
            values = entry if isinstance(entry, list) else [entry]
            for value in values:
                # dict keys keep insertion order, so the first occurrence wins
                ordered.setdefault(value, None)
        return list(ordered)

    def _unique_as_list(series):
        """Return the unique values of `series` as a plain list."""
        return list(series.unique())

    aggregations = {
        # issuer_name and aladdin_id are assumed constant within a security
        "issuer_name": "first",
        "aladdin_id": "first",
        "portfolio_full_name": _unique_as_list,
        "portfolio_id": _unique_as_list,
        "zombie_list": _ordered_unique,
    }

    summary = (
        df.groupby("security_description", as_index=False)
        .agg(aggregations)
        .rename(
            columns={
                "portfolio_full_name": "portfolio_list",
                "portfolio_id": "portfolio_id_list",
                "zombie_list": "strategy_list",
            }
        )
    )

    # Final column order of the summary
    return summary[
        [
            "issuer_name",
            "aladdin_id",
            "security_description",
            "strategy_list",
            "portfolio_list",
            "portfolio_id_list",
        ]
    ]

In [26]:
# Group by security to summarise the information: one row per
# security_description, with the affected strategies and portfolios as lists.
zombie_grouped = group_by_security_description(zombie_df)
zombie_grouped.head()


Unnamed: 0,issuer_name,aladdin_id,security_description,strategy_list,portfolio_list,portfolio_id_list
0,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA,"[str_001_s, str_002_ec, str_003b_ec, str_003_e...","[SANTANDER RENTA FIJA FLOTANTE, FI, HI-KAPPA-1...","[FIG05292, MDTLEMUB, MDTLEMUW, FIG05698, FPG00..."
1,ACCIONA ENERGIA FINANCIACION FILIALES SA,J88119,ACCIONA ENERGIA FINANCIACION FILIA MTN RegS,"[str_001_s, str_002_ec, str_003b_ec, str_003_e...","[02.018.62238191020.0, HI-SZVA-5-SFonds, HI-KA...","[PFC00665, MDTLEMUW, MDTLEMUB, CPE05440, PFC00..."
2,AEROPUERTOS ARGENTINA 2000 SA,D17734,AEROPUERTOS ARGENTINA 2000 SA CLAS,"[str_001_s, str_002_ec, str_003_ec, str_004_as...","[Superfondo Combinado F.C.I., Supergestion F.C...","[COMBINAD, SGI]"
3,PENDING,B96019,ALUAR ALUMINIO ARGENTINO S.9,"[gp_esccp_22, gp_esccp_25, gp_esccp_30]",[Superfondo Estrategico F.C.I.],[SUPESTRA]
4,PENDING,B96019,AMUNDI MSCI WLD ESG CL NZR AMBTN C,"[gp_esccp_22, gp_esccp_25, gp_esccp_30]",[Fondo Sam Renta Variable 26 SA de CV Fondo de...,[SAM-FTW]


In [27]:
# Print dtypes and non-null counts of the flagged rows.
zombie_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1861 entries, 363 to 59411
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   issuer_name               1861 non-null   object
 1   aladdin_id                1861 non-null   object
 2   security_description      1861 non-null   object
 3   portfolio_full_name       1861 non-null   object
 4   portfolio_id              1861 non-null   object
 5   str_001_s_brs             1835 non-null   object
 6   str_001_s_df              0 non-null      object
 7   str_002_ec_brs            1835 non-null   object
 8   str_002_ec_df             0 non-null      object
 9   str_003b_ec_brs           467 non-null    object
 10  str_003b_ec_df            0 non-null      object
 11  str_003_ec_brs            1835 non-null   object
 12  str_003_ec_df             0 non-null      object
 13  str_004_asec_brs          658 non-null    object
 14  str_004_asec_df           

In [28]:
# Create the output directory if it does not exist.
# Path.mkdir(exist_ok=True) tolerates the directory appearing between the check
# and the creation, and raises FileExistsError when the path is a regular file
# instead of reporting it as an existing directory.
if OUTPUT_DIR.is_dir():
    print(f"Directory already exists: {OUTPUT_DIR}")
else:
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {OUTPUT_DIR}")


Directory already exists: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\zombie_list


In [29]:
# Run-date stamp for the output file name, formatted YYYYMMDD (e.g. '20250323').
# It gets its own name: reassigning DATE here would overwrite the "YYYYMM"
# reporting period, so re-running the path cell afterwards would build wrong
# input paths.
RUN_DATE = datetime.now().strftime("%Y%m%d")

# Define the full path for the output file
OUTPUT_FILE = OUTPUT_DIR / f"{RUN_DATE}_zombie_analysis.csv"

# Write the per-security summary without the positional index column.
zombie_grouped.to_csv(OUTPUT_FILE, index=False)
