In [1]:
import pandas as pd
import numpy as np

import sys
import os


# Make the project root (one level up from the notebook's working directory)
# importable. The membership check keeps the cell idempotent: re-running it
# no longer appends the same entry to sys.path again.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from scripts.utils.dataloaders import (
    load_clarity_data,
    load_crossreference,
    load_overrides,
)
from scripts.utils.config import get_config
from scripts.utils.clarity_data_quality_control_functions import get_issuer_level_df, pad_identifiers

In [5]:
# Build the configuration for this notebook run. Automatic date detection is
# switched off and the date is supplied explicitly through fixed_date.
config = get_config(
    "notebook-datafeeds-explorer",
    interactive=False,
    auto_date=False,
    fixed_date="202506",
)

# Run-level settings exposed by the configuration.
logger, DATE, YEAR, DATE_PREV = (
    config[key] for key in ("logger", "DATE", "YEAR", "DATE_PREV")
)

# Base directories exposed by the configuration.
REPO_DIR, DATAFEED_DIR, SRI_DATA_DIR = (
    config[key] for key in ("REPO_DIR", "DATAFEED_DIR", "SRI_DATA_DIR")
)

# File locations come from the nested "paths" mapping.
paths = config["paths"]
DF_PREV_PATH = paths["PRE_DF_WOVR_PATH"]
DF_NEW_PATH = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]

In [6]:
# PermIDs of the issuers under investigation, kept as strings.
target_permid = [
    "4295869482",
    "4296555278",
    "4296358021",
]

# Column subsets; the *_clarity and *_ovr lists are passed as target_cols to
# the corresponding loaders further down.
target_cols = ["permid", "aladdin_id", "issuer_name"]
target_cols_brs = ["aladdin_id", "issuer_name"]
# Backward-compatible alias: this list was originally bound under a misspelled
# name, so keep that name alive for any cell that still refers to it.
targat_cols_brs = target_cols_brs
target_cols_clarity = ["permid", "issuer_name", "clarityid"]
target_cols_ovr = [
    "aladdin_id",
    "issuer_name",
    "clarityid",
    "permid",
    "ovr_target",
    "ovr_value",
    "ovr_active",
]

# Aladdin issuer ids of interest, as zero-padded six-character strings.
target_aladdin_id = [
    "000375",
    "003001",
    "007699",
    "010199",
    "055262",
    "059456",
    "072730",
    "M58534",
    "R48483",
    "F05671",
]

In [7]:
# Load the overrides table from OVR_PATH, requesting the columns in target_cols_ovr.
# NOTE(review): drop_active=False presumably keeps the ovr_active column in the
# result - confirm against load_overrides.
override = load_overrides(OVR_PATH, target_cols=target_cols_ovr, drop_active=False)

2025-06-04 12:14:48,240 - scripts.utils.dataloaders - INFO - Loading overrides from: C:\Users\n740789\Documents\esg-sri-repos\clarity_data_quality_controls\excel_books\sri_data\overrides\overrides_db.xlsx


In [8]:
# Beta export of the overrides database. The file sits in the same folder as
# the overrides workbook, so resolve it relative to OVR_PATH instead of going
# through a user-specific absolute path.
OVR_BETA_PATH = os.path.join(
    os.path.dirname(OVR_PATH), "20250604_202506_overrides_db_beta.csv"
)
override_beta = pd.read_csv(OVR_BETA_PATH)

In [9]:
# Run the aladdin_id column of both override tables through pad_identifiers so
# the two frames carry identifiers in the same format.
for overrides_frame in (override, override_beta):
    overrides_frame["aladdin_id"] = pad_identifiers(overrides_frame["aladdin_id"])

In [10]:
# Preview the first rows of the beta overrides table.
override_beta.head()

Unnamed: 0,clarityid,permid,aladdin_id,issuer_name,ovr_target,df_value,ovr_value,ovr_active,ultimate_issuer_id
0,18621.0,4295895000.0,375,BP PLC,str_002_ec,OK,EXCLUDED,True,
1,,4295895000.0,2800,SANTANDER UK PLC,str_005_ec,,OK,True,
2,,4296457000.0,2801,SANTANDER FINANCIAL SERVICES PLC,str_005_ec,,OK,True,
3,27220.0,4295903000.0,2824,Abbott Laboratories,str_001_s,FLAG,OK,True,
4,27220.0,4295903000.0,2824,Abbott Laboratories,str_003_ec,FLAG,OK,True,


In [11]:
# Preview the first rows of the overrides table for comparison with the beta export.
override.head()

Unnamed: 0,clarityid,permid,aladdin_id,issuer_name,ovr_target,ovr_value,ovr_active
0,18621.0,4295894740,375,BP PLC,str_002_ec,EXCLUDED,True
1,,4295895363,2800,SANTANDER UK PLC,str_005_ec,OK,True
2,,4296457498,2801,SANTANDER FINANCIAL SERVICES PLC,str_005_ec,OK,True
3,27220.0,4295903265,2824,Abbott Laboratories,str_001_s,OK,True
4,27220.0,4295903265,2824,Abbott Laboratories,str_003_ec,OK,True


In [None]:
# Column dtypes and non-null counts of the overrides table.
override.info()

In [7]:
# Overrides that have no permid, reduced to the two issuer-identifying columns.
permid_is_missing = override["permid"].isna()
missing_permid = override.loc[permid_is_missing, ["aladdin_id", "issuer_name"]].copy()

In [8]:
# Keep a single row per aladdin_id (the first occurrence).
missing_permid = missing_permid.drop_duplicates(subset=["aladdin_id"])

In [9]:
# Order the issuers alphabetically by name.
missing_permid = missing_permid.sort_values(by=["issuer_name"])

In [10]:
# Rename the identifier column to the header used in the exported file.
missing_permid = missing_permid.rename(columns={"aladdin_id": "aladdin_issuer"})

In [11]:
# Export to the current user's Downloads folder. The home directory is
# resolved at run time so the cell does not depend on one specific user account.
missing_permid.to_csv(
    os.path.join(os.path.expanduser("~"), "Downloads", "missing_permid.csv"),
    index=False,
)

In [None]:
# Load the current datafeed restricted to the columns in target_cols_clarity.
datafeed = load_clarity_data(DF_NEW_PATH, target_cols=target_cols_clarity)
# Store permid as strings so it can be matched against the string ids in
# target_permid. The column is replaced outright via assign(): writing through
# .loc[:, "permid"] is an in-place set, and newer pandas versions deprecate
# in-place sets whose values need a different dtype than the existing column.
datafeed = datafeed.assign(permid=datafeed["permid"].astype(str))

In [None]:
# Load the cross-reference table from CROSSREFERENCE_PATH; it is filtered by permid
# and indexed by aladdin_id further down.
crossreference = load_crossreference(CROSSREFERENCE_PATH)

In [None]:
# Column dtypes and non-null counts of the cross-reference table.
crossreference.info()

In [15]:
# Restrict each source to the issuers listed in target_permid.
# NOTE(review): target_permid holds strings; only datafeed["permid"] is cast to
# str in this notebook, so confirm the other two loaders return permid as str.
cross_filter = crossreference.loc[crossreference["permid"].isin(target_permid)]
datafeed_filter = datafeed.loc[datafeed["permid"].isin(target_permid)]
override_filter = override.loc[override["permid"].isin(target_permid)]

In [None]:
# Preview the filtered cross-reference rows, indexed by aladdin_id for readability.
cross_filter.set_index("aladdin_id").head()

In [None]:
# Preview the filtered datafeed rows, indexed by permid for readability.
datafeed_filter.set_index("permid").head()

In [18]:
# Export the overrides of the target issuers, ordered by issuer and override
# target, to the current user's Downloads folder. The home directory is
# resolved at run time instead of being tied to one specific user account.
override_filter.sort_values(by=["aladdin_id", "ovr_target"]).to_excel(
    os.path.join(
        os.path.expanduser("~"),
        "Downloads",
        "info_target_permid_deutsche_overrides.xlsx",
    ),
    index=False,
)

In [None]:
# Same summary as the earlier override.info() cell, repeated before the conflict checks.
override.info()

In [20]:
def find_conflicting_permid(
    df: pd.DataFrame,
    id_col: str = "aladdin_id",
    conflict_col: str = "permid",
) -> pd.DataFrame:
    """Return the rows whose identifier maps to more than one distinct value.

    Args:
        df: Table containing at least ``id_col`` and ``conflict_col``.
        id_col: Column that identifies an issuer. Defaults to ``"aladdin_id"``.
        conflict_col: Column expected to hold a single value per identifier.
            Defaults to ``"permid"``.

    Returns:
        A copy of every row of ``df`` whose ``id_col`` value is associated
        with more than one distinct ``conflict_col`` value, in the original
        row order. Missing values in ``conflict_col`` are not counted as a
        distinct value. The result is empty when there are no conflicts.

    Raises:
        KeyError: If ``id_col`` or ``conflict_col`` is not a column of ``df``.
    """
    # Number of distinct conflict_col values seen for each identifier.
    distinct_per_id = df.groupby(id_col)[conflict_col].nunique()

    # Identifiers that map to more than one distinct value.
    conflicting_ids = distinct_per_id[distinct_per_id > 1].index

    # Return an independent frame so callers can modify it freely, matching
    # what find_conflicting_columns does.
    return df[df[id_col].isin(conflicting_ids)].copy()

In [21]:
def find_conflicting_columns(
    df: pd.DataFrame,
    id_col: str = "aladdin_id",
    conflict_col_a: str = "ovr_target",
    conflict_col_b: str = "ovr_value"
) -> pd.DataFrame:
    """Return rows where one (id_col, conflict_col_a) pair has several conflict_col_b values.

    Args:
        df: Table containing the three referenced columns.
        id_col: Identifier column forming the first half of the key.
        conflict_col_a: Column forming the second half of the key.
        conflict_col_b: Column expected to hold one value per key.

    Returns:
        A copy of the rows of ``df`` whose key pair is associated with more
        than one distinct ``conflict_col_b`` value, sorted by the key columns.
    """
    key_cols = [id_col, conflict_col_a]

    # Distinct conflict_col_b values observed for every key pair.
    distinct_values = df.groupby(key_cols)[conflict_col_b].nunique()
    ambiguous_keys = distinct_values.loc[distinct_values.gt(1)].index

    # Express each row's key pair as a MultiIndex entry so that it can be
    # tested for membership in the set of ambiguous keys.
    row_keys = df.set_index(key_cols).index
    conflicting_rows = df.loc[row_keys.isin(ambiguous_keys)]

    return conflicting_rows.sort_values(by=key_cols).copy()


In [22]:
# Overrides whose aladdin_id is associated with more than one distinct permid.
override_target = find_conflicting_permid(override)

In [None]:
# Preview the overrides with conflicting permids.
override_target.head()

In [23]:
# Overrides where the same (aladdin_id, ovr_target) pair carries more than one
# distinct ovr_value (the function's default columns).
overrides_troubles = find_conflicting_columns(override)

In [None]:
# Preview the overrides with conflicting values for the same target.
overrides_troubles.head()

In [24]:
# Export to the current user's Downloads folder; the home directory is resolved
# at run time instead of being tied to one specific user account.
# NOTE(review): overrides_troubles holds rows with *different* ovr_value for the
# same aladdin_id/ovr_target, which the file name does not describe. The name
# is kept unchanged in case something consumes it - confirm and rename.
overrides_troubles.to_csv(
    os.path.join(
        os.path.expanduser("~"),
        "Downloads",
        "issuers_with_multiple_permids_per_aladdin_id_same_str_same_value.csv",
    ),
    index=False,
)

In [25]:
# Export the overrides with conflicting permids to the current user's Downloads
# folder; the home directory is resolved at run time instead of being tied to
# one specific user account.
override_target.to_csv(
    os.path.join(
        os.path.expanduser("~"),
        "Downloads",
        "issuers_with_multiple_permids_per_aladdin_id.csv",
    ),
    index=False,
)

In [None]:
# Report the number of distinct values per identifier column among the
# conflicting overrides, skipping any column that is not present.
for column in ("aladdin_id", "permid", "issuer_name", "clarityid"):
    if column not in override_target.columns:
        continue
    print(f"{column} has {override_target[column].nunique()} unique values")

In [27]:
# Count the distinct permids of every aladdin_id in a single pass.
# sort=False keeps the ids in order of first appearance. The previous per-id
# loop nested double quotes inside a double-quoted f-string (a SyntaxError
# before Python 3.12) and shadowed the builtin `id`.
permids_per_issuer = override_target.groupby("aladdin_id", sort=False)["permid"].nunique()
for aladdin_id, n_permids in permids_per_issuer.items():
    print(f"aladdin_id {aladdin_id} has {n_permids} different permids")