In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
from typing import List


In [2]:
ovr_db = pd.read_excel(r"C:\Users\n740789\Documents\Projects_local\DataSets\overrides\202502_BBDD_Overrides.xlsx",
                   sheet_name="OVR DB")

In [3]:
# read latest datafeed at the issuer level
target_columns = [
    "issuer_name",
    "permid",
    "clarityid",
    "company_inheriting",
    "parent_company",
    "str_001_s",
    "str_002_ec",
    "str_003_ec",
    "str_003b_ec",
    "str_004_asec",
    "str_005_ec",
    "cs_001_sec",
    "cs_003_sec",
    "cs_002_ec",
    "str_006_sec",
    "art_8_basicos"
]

datafeed = pd.read_csv(
    r"C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\ficheros_tratados\2025\20250201_Equities_feed_IssuerLevel_sinOVR.csv",
    usecols=target_columns,
    dtype={"permid": str, "clarityid": str}
)

In [4]:
# rename df column names

col_dict = {
    "str_001_s" : "str_001",
    "str_002_ec" : "str_002",
    "str_003_ec" : "str_003",
    "str_003b_ec" : "str_003b",
    "str_004_asec" : "str_004",
    "str_005_ec" : "str_005",
    "cs_001_sec" : "cs_001",
    "cs_003_sec" : "cs_003",
    "cs_002_ec" : "cs_002",
    "str_006_sec" : "str_006",
    "art_8_basicos" : "art_8" 
}

datafeed.rename(columns=col_dict, inplace=True)

In [5]:
ovr_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 69 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   #                                            750 non-null    int64  
 1   Fecha aplicación OVR                         750 non-null    object 
 2   Fecha última revisión                        750 non-null    object 
 3   Próxima revisión                             748 non-null    object 
 4   ClarityID                                    750 non-null    int64  
 5   permId                                       750 non-null    int64  
 6   Sustainalytics ID                            750 non-null    object 
 7   IssuerID                                     747 non-null    object 
 8   Issuer Name                                  750 non-null    object 
 9   SNT-WORLD
(Y/-)                              735 non-null    object 
 10  PA

In [6]:
# stadarise spelling of columns
ovr_db.columns = ovr_db.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace('\n','_').str.replace("*","")

# rename columns
ovr_db.rename(columns={
    'fecha_aplicación_ovr':'ovr_date',
    'fecha_última_revisión':'last_check_date',
    'próxima_revisión':'next_check_date',
    'snt-world_y/-':'snt_world',
    'sr':'ovr_grounds_srating',
    'exp':'ovr_grounds_exposure',
    'ctrv':'ovr_grounds_controversy',
    'comit':'ovr_grounds_committee',
    'detalle_ovr/observación_adicional':'comments',
    'temática_controversia_sustainalytics':'sustainalytics_controv_detail',
    'exposiciones':'exposure_detail',
    'estatus_actual_revisión':'latest_comment'
}, inplace=True)


ovr_col_dict = {
    "ovr_str001_sec" : "ovr_str001",
    "ovr_str002_ec" : "ovr_str002",
    "ovr_str003_ec" : "ovr_str003",
    "ovr_str003b_ec" : "ovr_str003b",
    "ovr_str004_sec" : "ovr_str004",
    "ovr_str005_ec" : "ovr_str005",
    "ovr_str006_sec" : "ovr_str006",
    "ovr_articulo_8_c" : "ovr_art_8",
    "ovr_cs001_sec" : "ovr_cs001",
    "ovr_cs002_ec" : "ovr_cs002",
    "ovr_cs003_sec" : "ovr_cs003"
}

# rename targe ovr col name
ovr_db.rename(columns=ovr_col_dict, inplace=True)

# make sure column permid is str
ovr_db['permid'] = ovr_db['permid'].astype(str)
ovr_db['clarityid'] = ovr_db['clarityid'].astype(str)


# parse ovr_date, last_check_date, next_check_date to datetime objects
ovr_db['ovr_date'] = pd.to_datetime(ovr_db['ovr_date'], errors='coerce')
ovr_db['last_check_date'] = pd.to_datetime(ovr_db['last_check_date'], errors='coerce')
ovr_db['next_check_date'] = pd.to_datetime(ovr_db['next_check_date'], errors='coerce') 

  ovr_db['ovr_date'] = pd.to_datetime(ovr_db['ovr_date'], errors='coerce')


In [7]:
# joint comment columns into a single one
ovr_db['comments'] = (
    ovr_db['comments'].fillna('') + '\n\n' +
    'latest_comment:\n' + ovr_db['latest_comment'].fillna('').astype(str) + '\n\n' +
    'sustainalytics_controv_detail:\n' + ovr_db['sustainalytics_controv_detail'].fillna('').astype(str) + '\n\n' +
    'exposure_detail:\n' + ovr_db['exposure_detail'].fillna('').astype(str)
)


In [8]:
ovr_ground_cols = ['ovr_grounds_srating', 'ovr_grounds_exposure', 'ovr_grounds_controversy', 'ovr_grounds_committee']
for col in ovr_ground_cols:
    ovr_db[col] = (
        ovr_db[col]
        .astype("string")  # Ensures Pandas' string dtype (not object)
        .str.strip()
        .str.replace(" ", "", regex=False)
        .str.lower()
        .str.replace("c", "x", regex=False)
        .replace(["", "nan", "<NA>", "<na>"], np.nan)  # Unify missing values
    )

In [9]:
[ovr_db[f'{col}'].value_counts(dropna=False) for col in ovr_ground_cols]

[ovr_grounds_srating
 <NA>    441
 x       309
 Name: count, dtype: Int64,
 ovr_grounds_exposure
 <NA>    562
 x       188
 Name: count, dtype: Int64,
 ovr_grounds_controversy
 <NA>    668
 x        82
 Name: count, dtype: Int64,
 ovr_grounds_committee
 <NA>    531
 x       219
 Name: count, dtype: Int64]

Remove unecesary columns and generate fileterd dfs

In [10]:
# let's get rig of str_007
# drop columns with indices 38,39, 40
ovr_db_filtered = ovr_db.drop(ovr_db.columns[[38, 39, 40]], axis=1).copy()

In [11]:
# List of columns to process
ovr_col_list = [
    "ovr_str001",
    "ovr_str002",
    "ovr_str003",
    "ovr_str003b",
    "ovr_str004",
    "ovr_str005",
    "ovr_str006",
    "ovr_art_8",
    "ovr_cs001",
    "ovr_cs002",
    "ovr_cs003"
]

Display results

In [12]:
print(f"There are {len(ovr_db.permid.unique())} unique permids while the df has {len(ovr_db)} rows")

print("display overrides by year month")
print(ovr_db.ovr_date.dt.strftime('%Y_%m').value_counts(dropna=False).sort_index())


print("display overrides by strategy")
[print(ovr_db[col].value_counts(dropna=True, normalize=True).round(2), '\n') for col in ovr_col_list]

There are 746 unique permids while the df has 750 rows
display overrides by year month
ovr_date
2022_02      2
2022_06     13
2022_07      1
2022_08      1
2022_09      2
2022_10     12
2022_11    162
2022_12      3
2023_02      4
2023_03      3
2023_04      3
2023_06      7
2023_08      2
2023_10      2
2023_11      9
2023_12      3
2024_01      4
2024_02     25
2024_04      2
2024_05    139
2024_06      2
2024_07      2
2024_08      5
2024_10     44
2024_12      1
2025_02      3
NaN        294
Name: count, dtype: int64
display overrides by strategy
ovr_str001
OK          0.70
FLAG        0.16
EXCLUDED    0.14
Name: proportion, dtype: float64 

ovr_str002
EXCLUDED    0.52
OK          0.48
Name: proportion, dtype: float64 

ovr_str003
OK          0.61
EXCLUDED    0.25
FLAG        0.14
Name: proportion, dtype: float64 

ovr_str003b
OK          0.82
EXCLUDED    0.12
FLAG        0.05
Name: proportion, dtype: float64 

ovr_str004
OK          0.77
FLAG        0.13
EXCLUDED    0.10
Name: pro

[None, None, None, None, None, None, None, None, None, None, None]

In [13]:
# merge df_filtered with datafeed on permid
ovr_db_datafeed = pd.merge(ovr_db_filtered, datafeed, on='clarityid', how='left')

In [14]:
print(ovr_db_datafeed.shape[0])

750


In [15]:
ovr_db_datafeed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 81 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   #                                           750 non-null    int64         
 1   ovr_date                                    456 non-null    datetime64[ns]
 2   last_check_date                             564 non-null    datetime64[ns]
 3   next_check_date                             482 non-null    datetime64[ns]
 4   clarityid                                   750 non-null    object        
 5   permid_x                                    750 non-null    object        
 6   sustainalytics_id                           750 non-null    object        
 7   issuerid                                    747 non-null    object        
 8   issuer_name_x                               750 non-null    object        
 9   snt_world 

In [16]:
def drop_columns_by_prefix(df, prefixes):
    columns_to_drop = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]
    return df.drop(columns=columns_to_drop)

# Drop columns starting with "original_" or "final_"
prefixes_to_remove = ["original_", "final_"]
ovr_db_datafeed = drop_columns_by_prefix(ovr_db_datafeed, prefixes_to_remove)

In [17]:
#rename column permid_x and issuer_name_x df_datafeed to permid and issuer_name
ovr_db_datafeed.rename(columns={
    'permid_x':'permid',
    'issuer_name_x':'issuer_name'
}, inplace=True)

result_target_columns = [
    "last_check_date",
    "next_check_date",
    'permid',
    "clarityid",
    'issuer_name',
    "company_inheriting",
    "parent_company",
    "ovr_str001",
    "ovr_str002",
    "ovr_str003",
    "ovr_str003b",
    "ovr_str004",
    "ovr_str005",
    "ovr_str006",
    "ovr_cs001",
    "ovr_cs002",
    "ovr_cs003",
    "ovr_art_8",
    "str_001",
    "str_002",
    "str_003",
    "str_003b",
    "str_004",
    "str_005",
    "str_006",
    "art_8",
    "cs_001",
    "cs_003",
    "cs_002",
    "ovr_grounds_srating",
    "ovr_grounds_exposure",
    "ovr_grounds_controversy",
    "ovr_grounds_committee",
    "comments",
]


# filter df_datafeed to only include the columns in result_target_columns
result_df = ovr_db_datafeed[result_target_columns].copy()


In [18]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   last_check_date          564 non-null    datetime64[ns]
 1   next_check_date          482 non-null    datetime64[ns]
 2   permid                   750 non-null    object        
 3   clarityid                750 non-null    object        
 4   issuer_name              750 non-null    object        
 5   company_inheriting       746 non-null    object        
 6   parent_company           746 non-null    object        
 7   ovr_str001               99 non-null     object        
 8   ovr_str002               85 non-null     object        
 9   ovr_str003               79 non-null     object        
 10  ovr_str003b              74 non-null     object        
 11  ovr_str004               102 non-null    object        
 12  ovr_str005               50 non-null

In [19]:
def active_ovr(row, column_pairs):
    result = {}
    for ovr_col, base_col in column_pairs:
        if pd.isna(row[ovr_col]) or pd.isna(row[base_col]):
            result[f"{ovr_col}_active"] = np.nan
        else:
            result[f"{ovr_col}_active"] = row[ovr_col] != row[base_col]
    return result

In [20]:
# Define column pairs to compare based on the provided columns
column_pairs_to_compare = [
    ("ovr_str001", "str_001"),
    ("ovr_str002", "str_002"),
    ("ovr_str003", "str_003"),
    ("ovr_str003b", "str_003b"),
    ("ovr_str004", "str_004"),
    ("ovr_str005", "str_005"),
    ("ovr_str006", "str_006"),
    ("ovr_cs001", "cs_001"),
    ("ovr_cs002", "cs_002"),
    ("ovr_cs003", "cs_003"),
    ("ovr_art_8", "art_8"),
]

# Add new columns for each override
for index, row in result_df.iterrows():
    active_overrides = active_ovr(row, column_pairs_to_compare)
    for col, value in active_overrides.items():
        result_df.at[index, col] = value

  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value
  result_df.at[index, col] = value


In [21]:
# Define column pairs to compare based on the provided columns
column_pairs_to_compare = [
    ("ovr_str001", "str_001"),
    ("ovr_str002", "str_002"),
    ("ovr_str003", "str_003"),
    ("ovr_str003b", "str_003b"),
    ("ovr_str004", "str_004"),
    ("ovr_str005", "str_005"),
    ("ovr_str006", "str_006"),
    ("ovr_cs001", "cs_001"),
    ("ovr_cs002", "cs_002"),
    ("ovr_cs003", "cs_003"),
    ("ovr_art_8", "art_8"),
]

In [24]:
def save_result_df_to_excel(result_df, output_file, basic_columns):

    aux_col = [
    "ovr_grounds_srating",
    "ovr_grounds_exposure",
    "ovr_grounds_controversy",
    "ovr_grounds_committee",
    "comments",
    "company_inheriting",
    "parent_company",
    ]


    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    strategy_columns = [
        "str_001", "str_002", "str_003", "str_003b", 
        "str_004", "str_005","str_006", "art_8",
        "cs_001", "cs_002", "cs_003" 
    ]

    ovr_columns = [
        "ovr_str001", "ovr_str002", "ovr_str003", "ovr_str003b", 
        "ovr_str004", "ovr_str005", "ovr_str006", "ovr_art_8",
        "ovr_cs001", "ovr_cs002", "ovr_cs003"
    ]

    ovr_active_columns = [
        "ovr_str001_active", "ovr_str002_active", "ovr_str003_active", "ovr_str003b_active",
        "ovr_str004_active", "ovr_str005_active", "ovr_str006_active", "ovr_art_8_active",
        "ovr_cs001_active", "ovr_cs002_active", "ovr_cs003_active", 
    ]

    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        result_df.to_excel(writer, sheet_name="Full_Data", index=False)

        for strategy, ovr_col, active_col in zip(strategy_columns, ovr_columns, ovr_active_columns):
            result_df["ovr_target"] = strategy
            relevant_columns = basic_columns + ["ovr_target"] + [
                col for col in [strategy, ovr_col, active_col] if col in result_df.columns
            ] + aux_col

            if relevant_columns:
                result_df[relevant_columns].to_excel(writer, sheet_name=f"{strategy}", index=False)

    print(f"Data has been successfully saved to {output_file}")



In [None]:
def ovr_to_dfs(result_df, output_file, basic_columns) -> List[pd.DataFrame]:

    aux_col = [
    "ovr_grounds_srating",
    "ovr_grounds_exposure",
    "ovr_grounds_controversy",
    "ovr_grounds_committee",
    "comments",
    "company_inheriting",
    "parent_company",
    ]


    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    strategy_columns = [
        "str_001", "str_002", "str_003", "str_003b", 
        "str_004", "str_005","str_006", "art_8",
        "cs_001", "cs_002", "cs_003" 
    ]

    ovr_columns = [
        "ovr_str001", "ovr_str002", "ovr_str003", "ovr_str003b", 
        "ovr_str004", "ovr_str005", "ovr_str006", "ovr_art_8",
        "ovr_cs001", "ovr_cs002", "ovr_cs003"
    ]

    ovr_active_columns = [
        "ovr_str001_active", "ovr_str002_active", "ovr_str003_active", "ovr_str003b_active",
        "ovr_str004_active", "ovr_str005_active", "ovr_str006_active", "ovr_art_8_active",
        "ovr_cs001_active", "ovr_cs002_active", "ovr_cs003_active", 
    ]

    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        result_df.to_excel(writer, sheet_name="Full_Data", index=False)

        for strategy, ovr_col, active_col in zip(strategy_columns, ovr_columns, ovr_active_columns):
            result_df["ovr_target"] = strategy
            relevant_columns = basic_columns + ["ovr_target"] + [
                col for col in [strategy, ovr_col, active_col] if col in result_df.columns
            ] + aux_col

            if relevant_columns:
                result_df[relevant_columns].to_excel(writer, sheet_name=f"{strategy}", index=False)

    print(f"Data has been successfully saved to {output_file}")



In [26]:
DATE = datetime.now().strftime("%Y%m%d")

# define basic columns before saving
basic_columns = [
    "last_check_date",
    "next_check_date",
    "permid",
    "issuer_name",
]
save_result_df_to_excel(result_df, rf"output\{DATE}_override_analysis_issuer.xlsx", basic_columns=basic_columns)

Data has been successfully saved to output\20250217_override_analysis.xlsx


In [None]:
result_df.head()

In [33]:
columns_to_check = [
    'ovr_grounds_srating',
    'ovr_grounds_exposure',
    'ovr_grounds_controversy',
    'ovr_grounds_committee'
]

filter_test = result_df[result_df[columns_to_check].notna().sum(axis=1)>1]

In [None]:
filter_test[["issuer_name"]+columns_to_check].head(10)

In [None]:
for col in result_df.columns:
    if col.endswith("_active"):
        print(result_df[col].value_counts(dropna=True))
        print("\n")