In [1]:
import sys
import warnings
from pathlib import Path
from typing import List, Tuple
from itertools import chain
from collections import defaultdict

# The notebook is expected to run from a folder directly below the repo root,
# so the parent of the working directory is treated as the repo root and is
# placed first on sys.path (needed for the `scripts` import further down).
notebook_dir = Path.cwd()
repo_root = notebook_dir.parent
sys.path.insert(0, str(repo_root))

# Echo the inserted path so a wrong working directory is easy to spot.
print(f"Added to sys.path: {repo_root}")

import pandas as pd

# Import the centralized configuration
from scripts.utils.config import get_config

Added to sys.path: c:\Users\n740789\Documents\clarity_data_quality_controls


In [2]:
# Load the shared configuration for this job with an explicit, fixed date
# instead of an automatically derived one.
config = get_config(
    "clean_override_db_for_dataplatform",
    auto_date=False,
    fixed_date="202505",
)
logger = config["logger"]

# Source workbook path taken straight from the configuration.
overrides = config["paths"]["OVR_PATH"]

# The beta workbook (input) and the CSV export (output) share one folder.
override_dir = config["SRI_DATA_DIR"] / "overrides"
override_beta_path = override_dir / "overrides_db_beta.xlsx"
override_out_file_path = override_dir / "esg_overrides.csv"

In [3]:
# Columns read from the override workbooks, in the order they are listed here.
target_cols = [
    # dates
    "creation_date",
    "last_update_date",
    "next_update_date",
    # issuer identification
    "permid",
    "brs_id",
    "issuer_name",
    # override definition
    "ovr_target",
    "df_value",
    "ovr_value",
    "ovr_active",
    # grounds columns
    "ovr_grounds_srating",
    "ovr_grounds_exposure",
    "ovr_grounds_controversy",
    "ovr_grounds_committee",
    "ovr_grounds_inheritance",
]

In [4]:
# Read both override workbooks, keeping only the columns named in target_cols.
overrides_beta = pd.read_excel(io=override_beta_path, usecols=target_cols)
overrides_alpha = pd.read_excel(io=overrides, usecols=target_cols)

In [5]:
# Frequency of each ovr_value in the alpha frame; missing values are counted too.
overrides_alpha["ovr_value"].value_counts(dropna=False)

ovr_value
OK          873
EXCLUDED    599
FLAG        182
NaN          18
Name: count, dtype: int64

In [6]:
# Frequency of each ovr_value in the beta frame; missing values are counted too.
overrides_beta["ovr_value"].value_counts(dropna=False)

ovr_value
OK          873
EXCLUDED    599
FLAG        182
NaN          18
Name: count, dtype: int64

In [7]:
# Columns used to identify the issuer behind each row.
check_cols = ["brs_id", "permid", "issuer_name"]

# For each source, keep the identifying columns of the rows whose ovr_value
# is missing.
beta_missing_mask = overrides_beta["ovr_value"].isna()
alpha_missing_mask = overrides_alpha["ovr_value"].isna()

missing_ovr_val_beta = overrides_beta.loc[beta_missing_mask, check_cols].copy()
missing_ovr_val_alpha = overrides_alpha.loc[alpha_missing_mask, check_cols].copy()

In [8]:
# Do both sources contain exactly the same rows with a missing ovr_value
# (missing_ovr_val_alpha vs. missing_ovr_val_beta)?
missing_ovr_val_beta.equals(missing_ovr_val_alpha)


True

In [9]:
# Print each column name quoted and comma-terminated, one per line, so the
# output can be pasted into a Python list.
for column_name in overrides_beta.columns:
    print(f"'{column_name}',")

'creation_date',
'last_update_date',
'next_update_date',
'permid',
'brs_id',
'issuer_name',
'ovr_target',
'df_value',
'ovr_value',
'ovr_active',
'ovr_grounds_srating',
'ovr_grounds_exposure',
'ovr_grounds_controversy',
'ovr_grounds_committee',
'ovr_grounds_inheritance',


In [10]:
# Rename "brs_id" to "issuer_id". The result is rebound instead of using
# inplace=True, so the frame object loaded and displayed by earlier cells is
# not mutated behind their back.
overrides_beta = overrides_beta.rename(columns={"brs_id": "issuer_id"})

In [12]:
# How many overrides are active vs. inactive (missing values counted as well).
overrides_beta["ovr_active"].value_counts(dropna=False)

ovr_active
True     1502
False     170
Name: count, dtype: int64

In [13]:
# Step 1: keep only the rows flagged as active.
active_overrides = overrides_beta.loc[overrides_beta["ovr_active"]]
# Step 2: among those, keep only the rows that actually carry an ovr_value.
has_value_mask = active_overrides["ovr_value"].notna()
overrides_clean = active_overrides.loc[has_value_mask].copy()

In [14]:
# Distinct override targets left after cleaning.
overrides_clean["ovr_target"].unique()

array(['art_8_basicos', 'str_001_s', 'str_002_ec', 'str_003_ec',
       'str_003b_ec', 'str_004_asec', 'str_006_sec', 'cs_003_sec',
       'cs_002_ec', 'cs_001_sec', 'str_005_ec'], dtype=object)

In [15]:
# Distinct override values left after cleaning.
overrides_clean["ovr_value"].unique()

array(['EXCLUDED', 'OK', 'FLAG'], dtype=object)

In [16]:
# Sanity check: rows of the cleaned frame that still lack an ovr_value.
overrides_clean.loc[overrides_clean["ovr_value"].isna()]

Unnamed: 0,creation_date,last_update_date,next_update_date,permid,issuer_id,issuer_name,ovr_target,df_value,ovr_value,ovr_active,ovr_grounds_srating,ovr_grounds_exposure,ovr_grounds_controversy,ovr_grounds_committee,ovr_grounds_inheritance


In [17]:
# Preview the first five cleaned rows.
overrides_clean.head(n=5)

Unnamed: 0,creation_date,last_update_date,next_update_date,permid,issuer_id,issuer_name,ovr_target,df_value,ovr_value,ovr_active,ovr_grounds_srating,ovr_grounds_exposure,ovr_grounds_controversy,ovr_grounds_committee,ovr_grounds_inheritance
5,2024-09-01,2024-09-01,2025-03-01,8589934175,949740,Wells Fargo & Co,art_8_basicos,OK,EXCLUDED,True,,,,,x
6,2024-09-01,2024-09-01,2025-03-01,8589934312,128005,Credit Agricole SA,str_001_s,EXCLUDED,OK,True,,,,,x
7,2024-09-01,2024-09-01,2025-03-01,8589934312,128005,Credit Agricole SA,str_002_ec,EXCLUDED,OK,True,,,,,x
8,2024-09-01,2024-09-01,2025-03-01,8589934312,128005,Credit Agricole SA,str_003_ec,EXCLUDED,OK,True,,,,,x
9,2024-09-01,2024-09-01,2025-03-01,8589934312,128005,Credit Agricole SA,str_003b_ec,EXCLUDED,OK,True,,,,,x


In [18]:
def generate_identifiers(df):
    """Build one override identifier string per row of ``df``.

    Each identifier is the concatenation, without separators, of:

    * the literal prefix ``'ovr'``;
    * a short code mapped from ``ovr_target`` (``'unk'`` if the target is not
      in the mapping);
    * a short code mapped from ``ovr_value`` (``'xx'`` if the value is not in
      the mapping);
    * the first three characters of ``issuer_name``, lower-cased (``'xxx'``
      if the name is missing);
    * ``creation_date`` formatted as ``YYYYMMDD`` (``'yyyymmdd'`` if the date
      is missing or cannot be parsed);
    * a 1-based row counter, zero-padded to at least three digits (it simply
      grows wider past 999 rows).

    Args:
        df: DataFrame with the columns ``ovr_target``, ``ovr_value``,
            ``issuer_name`` and ``creation_date``.

    Returns:
        list[str]: one identifier per row, in the row order of ``df``.

    Note:
        The counter is positional, so the identifiers depend on the row order
        and row count of ``df``; the same override can receive a different
        identifier if rows are added, removed or reordered.
    """
    # Mapping for ovr_target
    target_map = {
        'art_8_basicos': 'art08',
        'str_001_s': 'str01',
        'str_002_ec': 'str02',
        'str_003_ec': 'str03',
        'str_003b_ec': 'str3b',
        'str_004_asec': 'str04',
        'str_006_sec': 'str06',
        'cs_003_sec': 'cs03',
        'cs_002_ec': 'cs02',
        'cs_001_sec': 'cs01',
        'str_005_ec': 'str05',
    }

    # Mapping for ovr_value
    value_map = {
        'EXCLUDED': 'ex',
        'OK': 'ok',
        'FLAG': 'fl'
    }

    base = 'ovr'

    # Nothing to label; also avoids column lookups on a frame without rows.
    if len(df) == 0:
        return []

    # Walk the four needed columns in lockstep instead of materialising a
    # full Series per row with iterrows().
    rows = zip(
        df['ovr_target'],
        df['ovr_value'],
        df['issuer_name'],
        df['creation_date'],
    )

    identifiers = []
    for counter, (target, value, issuer_name, creation_date) in enumerate(rows, start=1):
        target_part = target_map.get(target, 'unk')  # fallback 'unk'
        value_part = value_map.get(value, 'xx')      # fallback 'xx'

        if isinstance(issuer_name, str):
            issuer_part = issuer_name[:3].lower()
        elif pd.isna(issuer_name):
            # A missing name (NaN/None) cannot be sliced; use a placeholder
            # like the other parts do instead of raising TypeError.
            issuer_part = 'xxx'
        else:
            # Non-string names (e.g. numeric) are labelled by their text form.
            issuer_part = str(issuer_name)[:3].lower()

        try:
            date_part = pd.to_datetime(creation_date).strftime('%Y%m%d')
        except (ValueError, TypeError):
            date_part = 'yyyymmdd'  # fallback for missing dates

        identifiers.append(
            f"{base}{target_part}{value_part}{issuer_part}{date_part}{counter:03d}"
        )

    return identifiers

In [19]:
# Attach a generated identifier to every cleaned override row.
overrides_clean['ovr_id'] = generate_identifiers(overrides_clean)
# Reorder the columns so that ovr_id comes first and the rest keep their order.
cols = ['ovr_id'] + [name for name in overrides_clean.columns if name != 'ovr_id']
overrides_clean = overrides_clean[cols]

In [20]:
# Write the cleaned overrides to override_out_file_path as CSV, leaving the
# DataFrame index out of the file.
overrides_clean.to_csv(path_or_buf=override_out_file_path, index=False)


In [22]:
# Print every column of the cleaned frame with its dtype, one per line.
for column_name, column_dtype in overrides_clean.dtypes.items():
    print(f"'{column_name}': {column_dtype},")

'ovr_id': object,
'creation_date': datetime64[ns],
'last_update_date': datetime64[ns],
'next_update_date': datetime64[ns],
'permid': int64,
'issuer_id': object,
'issuer_name': object,
'ovr_target': object,
'df_value': object,
'ovr_value': object,
'ovr_active': bool,
'ovr_grounds_srating': object,
'ovr_grounds_exposure': object,
'ovr_grounds_controversy': object,
'ovr_grounds_committee': object,
'ovr_grounds_inheritance': object,
