In [1]:
import pandas as pd
import numpy as np

import sys
import os


# Make the project root (one level up from notebooks/) importable so that the
# `scripts.*` imports resolve. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from scripts.utils.dataloaders import (
    load_clarity_data,
    load_crossreference,
    load_overrides,
)
from scripts.utils.config import get_config
from scripts.utils.clarity_data_quality_control_functions import get_issuer_level_df

In [3]:
# Build the configuration for this notebook: interactive mode and automatic
# date selection are switched off and the date is fixed to "202506".
config = get_config(
    "notebook-datafeeds-explorer",
    interactive=False,
    auto_date=False,
    fixed_date="202506",
)

# Run-level settings pulled out of the config mapping.
logger = config["logger"]
DATE, YEAR, DATE_PREV = (config[key] for key in ("DATE", "YEAR", "DATE_PREV"))
REPO_DIR, DATAFEED_DIR, SRI_DATA_DIR = (
    config[key] for key in ("REPO_DIR", "DATAFEED_DIR", "SRI_DATA_DIR")
)

# Input file locations, all taken from config["paths"].
paths = config["paths"]
DF_PREV_PATH = paths["PRE_DF_WOVR_PATH"]
DF_NEW_PATH = paths["CURRENT_DF_WOUTOVR_PATH"]
CROSSREFERENCE_PATH = paths["CROSSREFERENCE_PATH"]
BMK_PORTF_STR_PATH = paths["BMK_PORTF_STR_PATH"]
OVR_PATH = paths["OVR_PATH"]

In [4]:
# PermIDs used to filter the crossreference, datafeed and overrides frames.
target_permid = [
    "4295869482",
    "4296555278",
    "4296358021",
]

# Column subsets. target_cols_clarity and target_cols_ovr are passed to the
# data loaders; the remaining lists are not used in the visible cells.
target_cols = ["permid", "aladdin_id", "issuer_name"]
target_cols_brs = ["aladdin_id", "issuer_name"]
# Backward-compatible alias: the list was originally defined under this
# misspelled name, so keep it available for any code that still refers to it.
targat_cols_brs = target_cols_brs
target_cols_clarity = ["permid", "issuer_name", "clarityid"]
target_cols_ovr = [
    "aladdin_id",
    "issuer_name",
    "clarityid",
    "permid",
    "ovr_target",
    "ovr_value",
    "ovr_active",
]

# Aladdin issuer ids of interest (kept as strings so leading zeros survive).
target_aladdin_id = [
    "000375",
    "003001",
    "007699",
    "010199",
    "055262",
    "059456",
    "072730",
    "M58534",
    "R48483",
    "F05671",
]

In [5]:
# Load the overrides table, requesting the columns listed in target_cols_ovr.
# NOTE(review): drop_active=False presumably keeps the `ovr_active` column
# (it appears in the loaded frame) - confirm against load_overrides.
override = load_overrides(
    OVR_PATH,
    target_cols=target_cols_ovr,
    drop_active=False,
)

2025-05-23 13:14:04,262 - scripts.utils.dataloaders - INFO - Loading overrides from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\sri_data\overrides\overrides_db.xlsx


In [6]:
# Summarise the overrides frame: row count, dtypes and non-null counts per
# column (clarityid and permid are the only columns with missing values).
override.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3184 entries, 0 to 3193
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   clarityid    1504 non-null   object
 1   permid       2990 non-null   object
 2   aladdin_id   3184 non-null   object
 3   issuer_name  3184 non-null   object
 4   ovr_target   3184 non-null   object
 5   ovr_value    3184 non-null   object
 6   ovr_active   3184 non-null   bool  
dtypes: bool(1), object(6)
memory usage: 177.2+ KB


In [7]:
# Issuers in the overrides table that have no permid: one row per aladdin_id,
# ordered by issuer name, with the id column renamed to "aladdin_issuer".
# Built as a single chain from `override` with no in-place mutation, so this
# cell can be re-run at any time and always yields the same frame (previously
# the steps lived in separate cells, and re-running the de-duplication after
# the rename failed because "aladdin_id" no longer existed).
missing_permid = (
    override.loc[override["permid"].isna(), ["aladdin_id", "issuer_name"]]
    .drop_duplicates(subset=["aladdin_id"])
    .sort_values(by=["issuer_name"])
    .rename(columns={"aladdin_id": "aladdin_issuer"})
)

In [11]:
# Export to the current user's Downloads folder rather than a hard-coded,
# user-specific absolute path, so the cell also works on other machines.
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
missing_permid.to_csv(os.path.join(DOWNLOADS_DIR, "missing_permid.csv"), index=False)

In [12]:
datafeed = load_clarity_data(DF_NEW_PATH, target_cols=target_cols_clarity)
# Normalise permid to strings so it can be matched against the string ids in
# target_permid. Plain column assignment (rather than `.loc[:, "permid"] = ...`)
# replaces the column outright instead of trying to write strings into the
# column's existing dtype, and `.where(notna)` keeps missing permids as NaN
# instead of turning them into the literal string "nan".
permid_raw = datafeed["permid"]
datafeed["permid"] = permid_raw.astype(str).where(permid_raw.notna())

2025-05-23 13:14:04,850 - scripts.utils.dataloaders - INFO - Loading Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_without_ovr\2025\20250601_df_issuer_level_without_ovr.csv
2025-05-23 13:14:05,341 - scripts.utils.dataloaders - INFO - Successfully loaded Clarity data from: C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\datafeeds_without_ovr\2025\20250601_df_issuer_level_without_ovr.csv


In [13]:
# Load the issuer cross-reference table from CROSSREFERENCE_PATH; the loader
# logs that it also cleans and renames the columns.
crossreference = load_crossreference(CROSSREFERENCE_PATH)

2025-05-23 13:14:05,358 - scripts.utils.dataloaders - INFO - Loading crossreference data from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250601.csv
2025-05-23 13:14:05,618 - scripts.utils.dataloaders - INFO - Cleaning columns and renaming crossreference data
2025-05-23 13:14:05,619 - scripts.utils.dataloaders - INFO - Successfully loaded crossreference from: C:\Users\n740789\Documents\clarity_data_quality_controls\excel_books\aladdin_data\crossreference\Aladdin_Clarity_Issuers_20250601.csv


In [14]:
# Summarise the cross-reference frame: row count, dtypes and non-null counts
# per column.
crossreference.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135076 entries, 0 to 135075
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   aladdin_id   135076 non-null  object
 1   issuer_name  135076 non-null  object
 2   permid       131893 non-null  object
 3   msci         64839 non-null   object
 4   sust         23233 non-null   object
dtypes: object(5)
memory usage: 5.2+ MB


In [15]:
# Restrict each source to the rows whose permid is listed in target_permid.
cross_filter, datafeed_filter, override_filter = (
    frame[frame["permid"].isin(target_permid)]
    for frame in (crossreference, datafeed, override)
)

In [16]:
# Preview the cross-reference rows for the target permids, keyed by aladdin_id.
cross_filter.set_index("aladdin_id").head()

Unnamed: 0_level_0,issuer_name,permid,msci,sust
aladdin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
251525,DEUTSCHE BANK AG,4295869482,IID000000002140460,1008288713.0
R59631,DEUTSCHE BANK AG (LONDON BRANCH),4296555278,,
R61248,DEUTSCHE BANK AG (NEW YORK BRANCH),4296358021,IID000000002226160,


In [17]:
# Preview the datafeed rows for the target permids, keyed by permid.
datafeed_filter.set_index("permid").head()

Unnamed: 0_level_0,issuer_name,clarityid
permid,Unnamed: 1_level_1,Unnamed: 2_level_1
4296555278,Deutsche Bank Ag (London Branch),237878
4296358021,Deutsche Bank AG (New York Branch),235769
4295869482,Deutsche Bank AG,15657


In [18]:
# Export the overrides of the target permids, ordered by aladdin_id and
# override target. The file goes to the current user's Downloads folder rather
# than a hard-coded, user-specific absolute path.
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
override_filter.sort_values(by=["aladdin_id", "ovr_target"]).to_excel(
    os.path.join(DOWNLOADS_DIR, "info_target_permid_deutsche_overrides.xlsx"),
    index=False,
)

In [19]:
# Re-inspect the overrides frame before the conflict checks; the summary is
# the same as the earlier `override.info()` call.
override.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3184 entries, 0 to 3193
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   clarityid    1504 non-null   object
 1   permid       2990 non-null   object
 2   aladdin_id   3184 non-null   object
 3   issuer_name  3184 non-null   object
 4   ovr_target   3184 non-null   object
 5   ovr_value    3184 non-null   object
 6   ovr_active   3184 non-null   bool  
dtypes: bool(1), object(6)
memory usage: 177.2+ KB


In [20]:
def find_conflicting_permid(
    df: pd.DataFrame,
    id_col: str = "aladdin_id",
    permid_col: str = "permid",
) -> pd.DataFrame:
    """Return the rows of ``df`` whose id is associated with several permids.

    Args:
        df: Frame containing at least ``id_col`` and ``permid_col``.
        id_col: Column identifying the issuer (defaults to ``"aladdin_id"``).
        permid_col: Column whose distinct values are counted per id
            (defaults to ``"permid"``).

    Returns:
        An independent copy of every row of ``df`` whose ``id_col`` value maps
        to more than one distinct, non-missing ``permid_col`` value. Row order
        and index labels are those of ``df``. Empty if there are no conflicts.

    Raises:
        KeyError: If ``id_col`` or ``permid_col`` is not a column of ``df``.
    """
    # Distinct permids per id; nunique() skips missing values, so an id with
    # one permid plus some blanks is not reported as a conflict.
    permid_counts = df.groupby(id_col)[permid_col].nunique()

    # Ids linked to more than one distinct permid.
    conflicting_ids = permid_counts[permid_counts > 1].index

    # Copy so callers can modify the result without touching (or getting
    # chained-assignment warnings about) the input frame.
    return df[df[id_col].isin(conflicting_ids)].copy()

In [21]:
def find_conflicting_columns(
    df: pd.DataFrame,
    id_col: str = "aladdin_id",
    conflict_col_a: str = "ovr_target",
    conflict_col_b: str = "ovr_value"
) -> pd.DataFrame:
    """Return rows whose (id_col, conflict_col_a) pair carries several conflict_col_b values.

    Args:
        df: Frame containing the three named columns.
        id_col: Identifier column, first half of the grouping key.
        conflict_col_a: Second half of the grouping key.
        conflict_col_b: Column whose distinct values are counted per key.

    Returns:
        A copy of the rows of ``df`` belonging to any key with more than one
        distinct ``conflict_col_b`` value, sorted by the grouping key.
    """
    keys = [id_col, conflict_col_a]

    # Number of distinct conflict_col_b values for every (id, a) pair.
    distinct_counts = df.groupby(keys)[conflict_col_b].nunique()

    # Pairs that carry more than one distinct value.
    conflicting_pairs = distinct_counts.index[distinct_counts.to_numpy() > 1]

    # Flag each row of df whose own (id, a) pair is one of the conflicting ones.
    row_pairs = pd.MultiIndex.from_frame(df[keys])
    is_conflicting = row_pairs.isin(conflicting_pairs)

    conflicting_rows = df.loc[is_conflicting]
    return conflicting_rows.sort_values(by=keys).copy()


In [22]:
# Override rows whose aladdin_id is linked to more than one distinct permid.
override_target = find_conflicting_permid(override)

In [28]:
# Preview the conflicting rows; the captured output has no rows, i.e. no
# aladdin_id maps to several permids in this snapshot.
override_target.head()

Unnamed: 0,clarityid,permid,aladdin_id,issuer_name,ovr_target,ovr_value,ovr_active


In [23]:
# Override rows where the same (aladdin_id, ovr_target) pair carries more than
# one distinct ovr_value (the function's default columns).
overrides_troubles = find_conflicting_columns(override)

In [29]:
# Preview the conflicting override rows; the captured output has no rows.
overrides_troubles.head()

Unnamed: 0,clarityid,permid,aladdin_id,issuer_name,ovr_target,ovr_value,ovr_active


In [24]:
# Export the rows with conflicting override values to the current user's
# Downloads folder rather than a hard-coded, user-specific absolute path.
# NOTE(review): the file name mentions "multiple permids", but this frame holds
# (aladdin_id, ovr_target) pairs with several ovr_value entries - confirm the
# name is intended.
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
overrides_troubles.to_csv(
    os.path.join(
        DOWNLOADS_DIR,
        "issuers_with_multiple_permids_per_aladdin_id_same_str_same_value.csv",
    ),
    index=False,
)

In [25]:
# Export the rows whose aladdin_id maps to several permids to the current
# user's Downloads folder rather than a hard-coded, user-specific absolute path.
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
override_target.to_csv(
    os.path.join(DOWNLOADS_DIR, "issuers_with_multiple_permids_per_aladdin_id.csv"),
    index=False,
)

In [26]:
# Report how many distinct values each identifier column holds among the
# conflicting rows; columns that are absent from the frame are skipped.
id_columns = ("aladdin_id", "permid", "issuer_name", "clarityid")
for col in (name for name in id_columns if name in override_target.columns):
    print(f"{col} has {override_target[col].nunique()} unique values")

aladdin_id has 0 unique values
permid has 0 unique values
issuer_name has 0 unique values
clarityid has 0 unique values


In [27]:
# Number of distinct permids recorded for each conflicting aladdin_id.
# Computed with one groupby instead of re-filtering the frame for every id;
# sort=False keeps the ids in order of first appearance, as before. The loop
# variable no longer shadows the builtin `id`, and the f-string no longer nests
# double quotes inside a double-quoted literal (a SyntaxError before
# Python 3.12).
permids_per_aladdin_id = override_target.groupby("aladdin_id", sort=False)["permid"].nunique()
for aladdin_id, n_permids in permids_per_aladdin_id.items():
    print(f"aladdin_id {aladdin_id} has {n_permids} different permids")