In [1]:
import sys
import os
from pathlib import Path

from datetime import datetime

import pandas as pd
import numpy as np
import json

# Make the parent of the current working directory importable.
# Guarded so that re-running this cell does not append the same entry again.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


In [2]:
# Load the one-off BRS cross-reference workbook and harmonise its column names.
# Built as one chain (no inplace rename, no `.loc[:, col] = ...` write-back) so
# re-running the cell always starts again from the file contents.
cross_oneoff_20250530 = (
    pd.read_excel(r"C:\Users\n740789\Downloads\202050529_crossreference_brs_oneoff.xlsx")
      .rename(columns={"issuer":"issuerid", "name":"issuer_name", "PERMID":"permid"})
      # Cast permid to stripped text but keep missing values missing: a bare
      # astype(str) would turn NaN into the literal string "nan".
      .assign(permid=lambda d: d["permid"].astype(str).str.strip().where(d["permid"].notna()))
)
cross_oneoff_20250530.info()
# Slim issuer / permid / name lookup, copied so later edits do not touch the full frame.
cross_brs = cross_oneoff_20250530[["issuerid", "permid","issuer_name"]].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459150 entries, 0 to 459149
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   issuerid     459150 non-null  object 
 1   issuer_name  459150 non-null  object 
 2   Clarity      133018 non-null  float64
 3   MSCI         64652 non-null   object 
 4   SUST         23341 non-null   float64
 5   permid       459150 non-null  object 
dtypes: float64(2), object(4)
memory usage: 21.0+ MB


In [8]:
# Run date as YYYYMMDD; it prefixes the two extract file names below.
# NOTE: because the stamp is today's date, the AWS/BRS inputs are looked up
# under a different name on every calendar day, while the JSON map is pinned.
DATE_STAMP = f"{datetime.now():%Y%m%d}"

# --------------------------------------------------------------------
# 1.  LOAD FILES
# --------------------------------------------------------------------
ROOT = Path(r"C:\Users\n740789\Downloads\text_issuers_brs_dataplatform")

issuers_aws_path = ROOT.joinpath(f"{DATE_STAMP}_portoflio_bmk_positions_issuers_extract_aws.csv")
issuers_brs_path = ROOT.joinpath(f"{DATE_STAMP}_snt_world_sntcor_corp_shares.xlsx")
str_map_json_path = ROOT.joinpath("20250604_esg_map_str_ptf_bmk.json")

# Parse the strategy map (UTF-8 JSON) straight from the path.
str_map = json.loads(str_map_json_path.read_text(encoding="utf-8"))

# --------------------------------------------------------------------
# 2.  STRATEGY ↔ ID MAP   (long format)
# --------------------------------------------------------------------
def _explode_strategy_map(strategy_map):
    """Flatten the strategy map into one record per (strategy, kind, code).

    Reads ``strategy_map["strategies"]`` (a list) and, for every entry, emits one
    dict for each id in ``portfolio_ids`` followed by one for each id in
    ``benchmark_ids``.
    """
    rows = []
    for strategy in strategy_map["strategies"]:
        ids_by_kind = (
            ("portfolio", strategy["portfolio_ids"]),
            ("benchmark", strategy["benchmark_ids"]),
        )
        for kind_name, codes in ids_by_kind:
            for code in codes:
                rows.append(
                    {"strategy": strategy["strategy_id"], "kind": kind_name, "code": code}
                )
    return rows


records = _explode_strategy_map(str_map)

# Long-format lookup: columns strategy / kind / code.
df_map = pd.DataFrame(records)

# --------------------------------------------------------------------
# 3.  AWS DATA
# --------------------------------------------------------------------
# Read every column as text, trim issuer_id, turn empty strings into NA and
# discard rows left without an issuer_id.
_aws_loaded = pd.read_csv(issuers_aws_path, dtype=str, low_memory=False)
_aws_loaded["issuer_id"] = _aws_loaded["issuer_id"].str.strip()
aws = _aws_loaded.replace({"": pd.NA}).dropna(subset=["issuer_id"])

# Issuers that appear with at least one portfolio / benchmark reference.
_has_portfolio = aws["portfolio_ids"].notna()
_has_benchmark = aws["benchmark_ids"].notna()
aws_ptf_ids = set(aws[_has_portfolio]["issuer_id"])
aws_bmk_ids = set(aws[_has_benchmark]["issuer_id"])

# --------------------------------------------------------------------
# 4.  BRS DATA
# --------------------------------------------------------------------
def _clean_brs_sheet(sheet):
    """Return a cleaned copy of one raw BRS sheet.

    Column names are lower-cased with spaces and hyphens replaced by
    underscores; ``issuer_id`` is stripped of surrounding whitespace; empty
    strings become NA; rows without an ``issuer_id`` are dropped.  The input
    frame is not modified.

    Raises:
        KeyError: if the normalised sheet has no ``issuer_id`` column.
    """
    normalised = sheet.set_axis(
        sheet.columns.str.lower().str.replace(r"[ \-]", "_", regex=True),
        axis=1,
    )
    return (
        normalised.assign(issuer_id=lambda d: d["issuer_id"].str.strip())
                  .replace({"": pd.NA})
                  .dropna(subset=["issuer_id"])
    )


# Sheet that holds each kind of position inside the BRS workbook.
_BRS_SHEET_BY_KIND = {"portfolio": "ptf", "benchmark": "bmk"}

# Passing a list of sheet names parses the workbook once and returns a
# {sheet_name: DataFrame} dict, instead of opening the file once per sheet.
_brs_sheets = pd.read_excel(
    issuers_brs_path,
    sheet_name=list(_BRS_SHEET_BY_KIND.values()),
    skiprows=3,
    dtype=str,
)

brs_raw = {
    kind: _clean_brs_sheet(_brs_sheets[sheet_name])
    for kind, sheet_name in _BRS_SHEET_BY_KIND.items()
}

# keep only SNTCore rows + needed cols
# Column carrying the portfolio / benchmark code in each kind of sheet.
_CODE_COLUMN = {"portfolio": "portfolio_id", "benchmark": "benchmark_id"}

# Rows flagged "TRUE" in sntcore_share_corps_flag, reduced to the three
# columns used downstream.
brs = {
    kind: brs_raw[kind].loc[
        brs_raw[kind]["sntcore_share_corps_flag"] == "TRUE",
        ["issuer_name", "issuer_id", code_col],
    ]
    for kind, code_col in _CODE_COLUMN.items()
}

# Codes listed in the strategy map, per kind.
ptf_allowed = set(df_map.loc[df_map["kind"] == "portfolio", "code"])
bmk_allowed = set(df_map.loc[df_map["kind"] == "benchmark", "code"])
_allowed_codes = {"portfolio": ptf_allowed, "benchmark": bmk_allowed}

# Flagged rows whose portfolio / benchmark code is one of the mapped codes.
brs_positions = {
    kind: brs[kind][brs[kind][code_col].isin(_allowed_codes[kind])]
    for kind, code_col in _CODE_COLUMN.items()
}

# --------------------------------------------------------------------
# 5.  INTERSECTIONS & DIFFERENCES
# --------------------------------------------------------------------
def diff_and_common(brs_df, aws_ids):
    """Compare the issuer ids of a BRS table with a set of AWS issuer ids.

    Args:
        brs_df: table exposing an ``"issuer_id"`` column (duplicates allowed).
        aws_ids: set of AWS issuer ids.

    Returns:
        A 3-tuple of sets: ids present on both sides, ids only in ``aws_ids``,
        and ids only in ``brs_df``.
    """
    in_brs = set(brs_df["issuer_id"])
    shared = in_brs & aws_ids
    only_in_aws = aws_ids - in_brs
    only_in_brs = in_brs - aws_ids
    return shared, only_in_aws, only_in_brs

# Per-kind overlap between the filtered BRS positions and the AWS issuer sets.
common_ptf, aws_only_ptf, brs_only_ptf = diff_and_common(
    brs_positions["portfolio"], aws_ptf_ids
)
common_bmk, aws_only_bmk, brs_only_bmk = diff_and_common(
    brs_positions["benchmark"], aws_bmk_ids
)

# Pool the ids across both kinds, then find AWS ids that BRS holds on neither side.
all_brs_ids = set()
for _positions in brs_positions.values():
    all_brs_ids.update(_positions["issuer_id"])
all_aws_ids = aws_ptf_ids.union(aws_bmk_ids)
aws_missing_everywhere = all_aws_ids.difference(all_brs_ids)

# --------------------------------------------------------------------
# 6.  REPORT
# --------------------------------------------------------------------
def report(label, n, width=47):
    """Print one summary line: a left-aligned label followed by a count.

    Args:
        label: text shown on the left.
        n: integer count, printed right-aligned in 8 columns with thousands
            separators.
        width: columns reserved for the label.  The default of 47 fits the
            longest label used in this notebook (40 characters) and makes each
            line exactly as wide as the 55-character rule printed around the
            report, so the counts line up in one column.
    """
    print(f"{label:<{width}}{n:>8,}")

_RULE = "─" * 55

# Distinct issuer counts per source.
_source_counts = (
    ("Unique issuer_ids in BRS PTF (filtered):", brs_positions["portfolio"]["issuer_id"].nunique()),
    ("Unique issuer_ids in BRS BMK (filtered):", brs_positions["benchmark"]["issuer_id"].nunique()),
    ("Unique issuer_ids in BRS PTF (raw):", brs_raw["portfolio"]["issuer_id"].nunique()),
    ("Unique issuer_ids in BRS BMK (raw):", brs_raw["benchmark"]["issuer_id"].nunique()),
    ("Unique issuer_ids in AWS PTF:", len(aws_ptf_ids)),
    ("Unique issuer_ids in AWS BMK:", len(aws_bmk_ids)),
)

# Sizes of the overlap / difference sets.
_overlap_counts = (
    ("Common PTF ids:", len(common_ptf)),
    ("Common BMK ids:", len(common_bmk)),
    ("Only in AWS PTF:", len(aws_only_ptf)),
    ("Only in AWS BMK:", len(aws_only_bmk)),
    ("Only in BRS PTF:", len(brs_only_ptf)),
    ("Only in BRS BMK:", len(brs_only_bmk)),
    ("AWS ids missing from all BRS data:", len(aws_missing_everywhere)),
)

print(_RULE)
for _label, _count in _source_counts:
    report(_label, _count)
print(_RULE)
for _label, _count in _overlap_counts:
    report(_label, _count)
print(_RULE)

# --------------------------------------------------------------------
# 7.  SAVE DIFFERENCE SETS TO CSV
# --------------------------------------------------------------------
# Each output path is built once and used for both the write and the
# confirmation message, so the message always names the file actually written.
# The file names themselves are unchanged.
aws_not_in_brs_path = ROOT / f"{DATE_STAMP}_aws_issuer_ids_not_in_brs.csv"
brs_not_in_aws_path = ROOT / f"{DATE_STAMP}_brs_issuerids_not_in_aws.csv"

# (a) rows that exist in AWS but **not** in BRS
# This is the union of the per-kind differences (PTF vs PTF, BMK vs BMK), so it
# can contain ids that BRS does hold under the other kind; it is therefore a
# superset of `aws_missing_everywhere`.
aws_only_ids = aws_only_ptf | aws_only_bmk
df_aws_not_in_brs = (
    aws.loc[aws["issuer_id"].isin(aws_only_ids)]
       .drop_duplicates(subset=["issuer_id"])   # one row per issuer
       .sort_values("issuer_id")
)
df_aws_not_in_brs.to_csv(aws_not_in_brs_path, index=False)

# (b) rows that exist in BRS but **not** in AWS
brs_only_ids = brs_only_ptf | brs_only_bmk
df_brs_not_in_aws = (
    # Stack portfolio rows first, then benchmark rows; drop_duplicates keeps
    # the first occurrence of each issuer_id.
    pd.concat(list(brs_positions.values()), ignore_index=True)
      .loc[lambda d: d["issuer_id"].isin(brs_only_ids)]
      .drop_duplicates(subset=["issuer_id"])
      .sort_values("issuer_id")
)
df_brs_not_in_aws.to_csv(brs_not_in_aws_path, index=False)

print("CSV files written:")
print(" •", aws_not_in_brs_path)
print(" •", brs_not_in_aws_path)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


───────────────────────────────────────────────────────
Unique issuer_ids in BRS PTF (filtered):     806
Unique issuer_ids in BRS BMK (filtered):   1,152
Unique issuer_ids in BRS PTF (raw):   2,930
Unique issuer_ids in BRS BMK (raw):  13,503
Unique issuer_ids in AWS PTF:         1,128
Unique issuer_ids in AWS BMK:         7,736
───────────────────────────────────────────────────────
Common PTF ids:                         799
Common BMK ids:                       1,152
Only in AWS PTF:                        329
Only in AWS BMK:                      6,584
Only in BRS PTF:                          7
Only in BRS BMK:                          0
AWS ids missing from all BRS data:    6,666
───────────────────────────────────────────────────────
CSV files written:
 • C:\Users\n740789\Downloads\text_issuers_brs_dataplatform\20250604_aws_issuerids_not_in_brs.csv
 • C:\Users\n740789\Downloads\text_issuers_brs_dataplatform\20250604_brs_issuerids_not_in_aws.csv


In [9]:
# Preview the first rows of the AWS issuers that were not matched in BRS.
df_aws_not_in_brs.head()

Unnamed: 0,issuer_id,portfolio_ids,benchmark_ids
1,000375,,"[FIFSANEUR, MLGVSTEUR, SHELLSP43, SHELLSP27, M..."
2,00038A,,"[MLSPEEUR, SHELLSP04, MLDGMSEUR, MLGVSTEUR, FI..."
3,001625,"[PFS00358, FPG00024, FPG00008, FIG02787, FPG00...","[MLECOGVMS, IBMLSPEUR, ESIBSNPER, SHELLSP39, M..."
4,002800,[PFIT0011],"[COMP_8, SHELLPT11, SHELLSP43, MSMLCIEUR, SHEL..."
6,002824,,"[SHELLSP27, MLGVSTEUR, MLEMSCESN, COMP_8, SHEL..."


In [10]:
# Show up to ten BRS position rows whose issuer_id was not matched in AWS.
df_brs_not_in_aws.head(10)

Unnamed: 0,issuer_name,issuer_id,portfolio_id,benchmark_id
2234,NATURGY ENERGY GROUP SA,B74189,CPE03744,
2499,ATRESMEDIA CORPORACION DE MEDIOS DE COMUNICACI...,B98521,CPE03744,
2774,WACKER CHEMIE AG,C73441,CPE03744,
5582,BRENNTAG SE,R21934,CPE03744,
8717,CORPORACION ALIMENTARIA IBERICA SA,R76459,CPE03744,
8801,AMPER SA,R82983,CPE03744,
8809,CORPORACION EMPRESARIAL DE MATERIALES DE CONST...,R83166,CPE03744,


In [11]:
# (rows, columns) of the BRS-only table.
df_brs_not_in_aws.shape

(7, 4)

In [12]:
# (rows, columns) of the AWS-only table.
df_aws_not_in_brs.shape

(6847, 3)

In [2]:
# Fixed locations of the three inputs used by the cells below
# (AWS issuer extract, BRS workbook, strategy-map JSON).
issuers_aws_path = Path(
    r"C:\Users\n740789\Downloads\text_issuers_brs_dataplatform"
    r"\20250603_portoflio_bmk_positions_issuers_extract_aws.csv"
)
issuers_brs_path = Path(
    r"C:\Users\n740789\Documents\esg-sri-repos\clarity_data_quality_controls"
    r"\excel_books\aladdin_data\snt_word_issuer_data"
    r"\20250604_snt_world_sntcor_corp_shares.xlsx"
)
str_map_json_path = Path(
    r"C:\Users\n740789\Downloads\text_issuers_brs_dataplatform"
    r"\20250427_esg_map_str_ptf_bmk.json"
)

In [None]:
# Load both sheets with a single parse of the workbook: a list passed as
# sheet_name returns a {sheet_name: DataFrame} dict.  Three rows above the
# header are skipped and every column is read as text.
_brs_sheets = pd.read_excel(issuers_brs_path, sheet_name=["bmk", "ptf"], skiprows=3, dtype=str)
issuers_brs_bmk = _brs_sheets["bmk"]
issuers_brs_ptf = _brs_sheets["ptf"]

In [None]:
# Preview the first rows of the benchmark sheet as loaded.
issuers_brs_bmk.head()

In [None]:
# Preview the first rows of the portfolio sheet as loaded.
issuers_brs_ptf.head()

In [None]:
# Distinct sntcore_share_corps_cat values among rows flagged "TRUE":
# benchmark sheet first, then portfolio sheet.
for _sheet in (issuers_brs_bmk, issuers_brs_ptf):
    _is_flagged = _sheet["sntcore_share_corps_flag"].eq("TRUE")
    print(_sheet.loc[_is_flagged, "sntcore_share_corps_cat"].unique())

In [None]:
# Distinct sntcore_share_corps_cat values among rows whose flag is anything
# other than "TRUE": benchmark sheet first, then portfolio sheet.
for _sheet in (issuers_brs_bmk, issuers_brs_ptf):
    _not_flagged = _sheet["sntcore_share_corps_flag"].ne("TRUE")
    print(_sheet.loc[_not_flagged, "sntcore_share_corps_cat"].unique())

In [3]:
# Parse the strategy map (UTF-8 JSON) found at str_map_json_path.
str_map = json.loads(str_map_json_path.read_text(encoding="utf-8"))

# Top-level key -> singular label stored in the "kind" column.
_KIND_BY_GROUP = {"portfolios": "portfolio", "benchmarks": "benchmark"}

# Long format: one record per (strategy, kind, code).  Each top-level group is
# a dict mapping a strategy name to its list of codes.
records = [
    {"strategy": strat, "kind": kind_label, "code": code}
    for group, kind_label in _KIND_BY_GROUP.items()
    for strat, codes in str_map[group].items()
    for code in codes
]

df_map_str_bmk_ptf_id = pd.DataFrame(records)

In [None]:
# AWS extract: every column as text, issuer_id renamed to issuerid, rows
# without an issuerid dropped.  Chained, so re-running the cell is idempotent.
issuers_aws = (
    pd.read_csv(issuers_aws_path, dtype=str, low_memory=False)
      .rename(columns={"issuer_id": "issuerid"})
      .dropna(subset=["issuerid"])
)

# BRS workbook: both sheets from a single parse (a list passed as sheet_name
# returns a {sheet_name: DataFrame} dict).
# NOTE(review): another cell reads sheets named "ptf" / "bmk" from a workbook
# with the same file name — confirm which sheet names this file really has.
_brs_sheets = pd.read_excel(
    issuers_brs_path,
    sheet_name=["benchmarks", "portfolio_positions"],
    skiprows=3,
    dtype=str,
)
issuers_brs_bmk = _brs_sheets["benchmarks"]
issuers_brs_ptf = _brs_sheets["portfolio_positions"]

# Distinct issuerids that carry at least one portfolio / benchmark reference,
# in order of first appearance.
issuerid_unique_id_list_aws_ptf = (
    issuers_aws.loc[issuers_aws["portfolio_ids"].notna(), "issuerid"].unique().tolist()
)
issuerid_unique_id_list_aws_bmk = (
    issuers_aws.loc[issuers_aws["benchmark_ids"].notna(), "issuerid"].unique().tolist()
)

# issuerid has no missing values at this point, so the number of distinct ids
# is simply the length of each list.
num_unique_issuer_aws_ptf = len(issuerid_unique_id_list_aws_ptf)
num_unique_issuer_aws_bmk = len(issuerid_unique_id_list_aws_bmk)

In [None]:
brs_df_list = [issuers_brs_ptf, issuers_brs_bmk]

# Normalise both raw sheets IN PLACE (the list holds the same objects as the
# issuers_brs_* names used below): snake_case column names, a single `issuerid`
# column, no rows without an id, ids stored as str.
for df in brs_df_list:
    df.columns = df.columns.str.lower().str.replace(r"[ \-]", "_", regex=True)
    if "issuerid" not in df.columns:
        # The id column is called aladdin_id in some extracts and issuer_id in
        # others; aladdin_id wins when both are present.
        id_source = next(
            (col for col in ("aladdin_id", "issuer_id") if col in df.columns), None
        )
        if id_source is None:
            raise KeyError(
                "BRS sheet has no 'aladdin_id' or 'issuer_id' column after normalisation; "
                f"columns found: {list(df.columns)}"
            )
        df.rename(columns={id_source: "issuerid"}, inplace=True)
    df.dropna(subset=["issuerid"], inplace=True)
    df["issuerid"] = df["issuerid"].astype(str)


# Keep only rows flagged "TRUE", reduced to the columns used downstream.
issuers_brs_bmk_clean = issuers_brs_bmk.loc[
    issuers_brs_bmk["sntcore_share_corps_flag"] == "TRUE",
    ["issuer_name", "issuerid", "benchmark_id"],
].copy()
issuers_brs_ptf_clean = issuers_brs_ptf.loc[
    issuers_brs_ptf["sntcore_share_corps_flag"] == "TRUE",
    ["issuer_name", "issuerid", "portfolio_id"],
].copy()

# Portfolio / benchmark codes listed in the strategy map.
esg_ptf_id_list = df_map_str_bmk_ptf_id.loc[df_map_str_bmk_ptf_id["kind"] == "portfolio", "code"].tolist()
esg_bmk_id_list = df_map_str_bmk_ptf_id.loc[df_map_str_bmk_ptf_id["kind"] == "benchmark", "code"].tolist()

# Flagged rows restricted to the mapped codes.
brs_positions_ptf = issuers_brs_ptf_clean[issuers_brs_ptf_clean["portfolio_id"].isin(esg_ptf_id_list)].copy()
brs_positions_bmk = issuers_brs_bmk_clean[issuers_brs_bmk_clean["benchmark_id"].isin(esg_bmk_id_list)].copy()


brs_positions_df_list = [
    brs_positions_ptf,
    brs_positions_bmk,
]

# Distinct issuer counts after filtering (portfolio first, then benchmark) ...
for df in brs_positions_df_list:
    print(f"Number of unique issuerids in BRS DataFrame after filtering: {df['issuerid'].nunique()}")

print("\n")

# ... and before filtering, in the same order.
for df in brs_df_list:
    print(f"Number of unique issuerids in BRS DataFrame before filtering: {df['issuerid'].nunique()}")

print("\n")

print(f"Number of unique issuers in AWS portfolio: {num_unique_issuer_aws_ptf}")
print(f"Number of unique issuers in AWS benchmark: {num_unique_issuer_aws_bmk}")

In [None]:
_ptf_in_aws = brs_positions_ptf["issuerid"].isin(issuerid_unique_id_list_aws_ptf)
_bmk_in_aws = brs_positions_bmk["issuerid"].isin(issuerid_unique_id_list_aws_bmk)

# issuerids present in the BRS portfolio positions AND in the AWS portfolio list
common_issuerid_in_brs_positions_ptf_in_aws = brs_positions_ptf.loc[_ptf_in_aws, "issuerid"].unique().tolist()

# issuerids present in the BRS benchmark positions AND in the AWS benchmark list
common_issuerid_in_brs_positions_bmk_in_aws = brs_positions_bmk.loc[_bmk_in_aws, "issuerid"].unique().tolist()

# issuerids in the BRS portfolio positions that are NOT in the AWS portfolio list
missing_issuerid_in_brs_positions_ptf = brs_positions_ptf.loc[~_ptf_in_aws, "issuerid"].unique().tolist()

# issuerids in the BRS benchmark positions that are NOT in the AWS benchmark list
missing_issuerid_in_brs_positions_bmk = brs_positions_bmk.loc[~_bmk_in_aws, "issuerid"].unique().tolist()

# issuerids in the AWS portfolio list that are NOT in the BRS portfolio positions
missing_issuerid_in_brs_positions_ptf_in_aws = list(set(issuerid_unique_id_list_aws_ptf) - set(common_issuerid_in_brs_positions_ptf_in_aws))
# issuerids in the AWS benchmark list that are NOT in the BRS benchmark positions
missing_issuerid_in_brs_positions_bmk_in_aws = list(set(issuerid_unique_id_list_aws_bmk) - set(common_issuerid_in_brs_positions_bmk_in_aws))

# The portfolio "in AWS" list above must not be reassigned here: a second
# assignment to the same name used to replace it with the BRS-only ids, so the
# "... portfolio positions in AWS" count printed below reported the wrong set.

# every AWS issuerid (portfolio or benchmark) vs every filtered BRS issuerid
issuerid_unique_id_list_aws = list(set(issuerid_unique_id_list_aws_ptf + issuerid_unique_id_list_aws_bmk))
brs_positions_ptf_ids_list = brs_positions_ptf["issuerid"].unique().tolist()
brs_positions_bmk_ids_list = brs_positions_bmk["issuerid"].unique().tolist()
brs_positions_ids_list = list(set(brs_positions_ptf_ids_list + brs_positions_bmk_ids_list))

# AWS issuerids found in neither the BRS portfolio nor the BRS benchmark positions
missing_brs_positions_ids_list = list(set(issuerid_unique_id_list_aws) - set(brs_positions_ids_list))

# print len of every list
print(f"Number of common issuerids in BRS portfolio positions and AWS portfolio: {len(common_issuerid_in_brs_positions_ptf_in_aws)}")
print(f"Number of common issuerids in BRS benchmark positions and AWS benchmark: {len(common_issuerid_in_brs_positions_bmk_in_aws)}")
print(f"Number of missing issuerids in BRS portfolio positions: {len(missing_issuerid_in_brs_positions_ptf)}")
print(f"Number of missing issuerids in BRS benchmark positions: {len(missing_issuerid_in_brs_positions_bmk)}")
print(f"Number of missing issuerids in BRS portfolio positions in AWS: {len(missing_issuerid_in_brs_positions_ptf_in_aws)}")
print(f"Number of missing issuerids in BRS benchmark positions in AWS: {len(missing_issuerid_in_brs_positions_bmk_in_aws)}")
print(f"Number of AWS issuerids missing from all BRS positions: {len(missing_brs_positions_ids_list)}")