In [1]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np
import json

sys.path.append(os.path.abspath(os.path.join(os.getcwd(),"..")))


In [2]:
# --------------------------------------------------------------------
# 1.  LOAD FILES
# --------------------------------------------------------------------
# The input folder can be overridden with the ISSUERS_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local Downloads folder is used, so existing runs are unchanged.
ROOT = Path(
    os.environ.get(
        "ISSUERS_DATA_DIR",
        r"C:\Users\n740789\Downloads\text_issuers_brs_dataplatform",
    )
)

# NOTE(review): "portoflio" presumably mirrors the file name on disk - verify
# before correcting the spelling here.
issuers_aws_path = ROOT / "20250603_portoflio_bmk_positions_issuers_extract_aws.csv"
issuers_brs_path = ROOT / "20250603_snt_world_sntcor_corp_shares.xlsx"
str_map_json_path = ROOT / "20250427_esg_map_str_ptf_bmk.json"

# Strategy map: section 2 reads its top-level "portfolios" and "benchmarks"
# keys, each a mapping of strategy name -> list of codes.
with open(str_map_json_path, encoding="utf-8") as fh:
    str_map = json.load(fh)

# --------------------------------------------------------------------
# 2.  STRATEGY ↔ ID MAP   (long format)
# --------------------------------------------------------------------
def _iter_strategy_codes(mapping):
    """Yield one ``{strategy, kind, code}`` row per code listed in ``mapping``.

    ``mapping`` must expose the keys "portfolios" and "benchmarks", each a
    dict of strategy name -> iterable of codes. Portfolio rows come first.
    """
    for group in ("portfolios", "benchmarks"):
        kind = group[:-1]  # drop the plural "s": 'portfolio' | 'benchmark'
        for strategy, codes in mapping[group].items():
            for code in codes:
                yield {"strategy": strategy, "kind": kind, "code": code}


records = list(_iter_strategy_codes(str_map))
df_map = pd.DataFrame(records)

# --------------------------------------------------------------------
# 3.  AWS DATA
# --------------------------------------------------------------------
# Every column is read as text. The identifier is exposed as "issuerid" and
# trimmed; empty strings anywhere in the frame become missing values, and rows
# left without an issuerid are dropped.
aws = pd.read_csv(issuers_aws_path, dtype=str, low_memory=False)
aws = aws.rename(columns={"issuer_id": "issuerid"})
aws["issuerid"] = aws["issuerid"].str.strip()
aws = aws.replace({"": pd.NA}).dropna(subset=["issuerid"])

# An issuer belongs to the portfolio / benchmark set when the matching
# "*_ids" column is populated on its row.
# NOTE(review): unlike the BRS side (section 4), these sets are not restricted
# to the codes of the strategy map - confirm that this asymmetry is intended.
aws_ptf_ids = set(aws["issuerid"][aws["portfolio_ids"].notna()])
aws_bmk_ids = set(aws["issuerid"][aws["benchmark_ids"].notna()])

# --------------------------------------------------------------------
# 4.  BRS DATA
# --------------------------------------------------------------------
# Open the workbook once and parse both sheets from the same handle, instead
# of having pd.read_excel load the whole file once per sheet.
# skiprows=3 skips the first three rows of each sheet, so the header is taken
# from the fourth row; dtype=str keeps every column as text.
with pd.ExcelFile(issuers_brs_path) as brs_workbook:
    brs_raw = {
        "portfolio": pd.read_excel(brs_workbook, sheet_name="portfolio_positions",
                                   skiprows=3, dtype=str),
        "benchmark": pd.read_excel(brs_workbook, sheet_name="benchmarks",
                                   skiprows=3, dtype=str),
    }

# Normalise each BRS sheet in place: lower-case the headers and replace spaces
# and hyphens with "_", expose the Aladdin identifier under the shared name
# "issuerid", trim it, and drop rows that end up without an id.
for kind, df in brs_raw.items():
    df.columns = (
        df.columns.str.lower()
                   .str.replace(r"[ \-]", "_", regex=True)
    )
    # This check runs AFTER the header clean-up, so it tests the normalised name.
    if "aladdin_id" in df.columns:
        df.rename(columns={"aladdin_id": "issuerid"}, inplace=True)
    # Fail with a descriptive message instead of a bare KeyError('issuerid')
    # when the sheet layout is not the expected one.
    if "issuerid" not in df.columns:
        raise KeyError(
            f"BRS {kind} sheet has neither an 'aladdin_id' nor an 'issuerid' "
            f"column after header normalisation; found: {list(df.columns)}"
        )
    df["issuerid"] = df["issuerid"].str.strip()
    # Empty strings (including ids that were only whitespace) count as missing.
    df.replace({"": pd.NA}, inplace=True)
    df.dropna(subset=["issuerid"], inplace=True)

# Keep only rows flagged "TRUE" in sntcore_share_corps_flag, and only the
# columns needed below. Both sheets share the layout except for the id
# column, which is named "<kind>_id" (portfolio_id / benchmark_id).
brs = {
    kind: frame.loc[
        frame["sntcore_share_corps_flag"] == "TRUE",
        ["issuer_name", "issuerid", f"{kind}_id"],
    ]
    for kind, frame in brs_raw.items()
}

# Codes listed in the strategy map, split by kind.
ptf_allowed = set(df_map.loc[df_map["kind"] == "portfolio", "code"])
bmk_allowed = set(df_map.loc[df_map["kind"] == "benchmark", "code"])

# Restrict each sheet to the portfolios / benchmarks present in the map.
brs_positions = {
    kind: brs[kind][brs[kind][f"{kind}_id"].isin(allowed)]
    for kind, allowed in (("portfolio", ptf_allowed), ("benchmark", bmk_allowed))
}

# --------------------------------------------------------------------
# 5.  INTERSECTIONS & DIFFERENCES
# --------------------------------------------------------------------
def diff_and_common(brs_df, aws_ids):
    """Compare the issuer ids of a BRS frame with a set of AWS issuer ids.

    Parameters
    ----------
    brs_df : DataFrame with an "issuerid" column.
    aws_ids : set of issuer ids from AWS.

    Returns
    -------
    tuple of three sets: (ids on both sides, ids only in AWS, ids only in BRS).
    """
    brs_ids = set(brs_df["issuerid"])
    shared = brs_ids.intersection(aws_ids)
    only_aws = aws_ids.difference(brs_ids)
    only_brs = brs_ids.difference(aws_ids)
    return shared, only_aws, only_brs

# Per-kind comparison; each call returns
# (ids on both sides, ids only in AWS, ids only in BRS).
common_ptf, aws_only_ptf, brs_only_ptf = diff_and_common(
    brs_positions["portfolio"], aws_ptf_ids
)
common_bmk, aws_only_bmk, brs_only_bmk = diff_and_common(
    brs_positions["benchmark"], aws_bmk_ids
)

# Overall view: AWS issuer ids that appear in none of the filtered BRS frames.
all_brs_ids = {
    issuer_id
    for frame in brs_positions.values()
    for issuer_id in frame["issuerid"]
}
all_aws_ids = aws_ptf_ids.union(aws_bmk_ids)
aws_missing_everywhere = all_aws_ids.difference(all_brs_ids)

# --------------------------------------------------------------------
# 6.  REPORT
# --------------------------------------------------------------------
def report(label, n, width=47):
    """Print one row of the summary table: left-aligned label, right-aligned count.

    Parameters
    ----------
    label : str
        Row caption.
    n : int
        Count, printed with thousands separators in an 8-character column.
    width : int, default 47
        Width of the label column. The default fits the longest caption used
        in this notebook (39 characters) and, together with the 8-character
        number column, matches the 55-character separator rules. The previous
        fixed width of 35 was shorter than several captions, which pushed the
        numbers out of alignment.
    """
    print(f"{label:<{width}}{n:>8,}")

# Summary table: each section is printed under a horizontal rule, and a final
# rule closes the table. Rows are (caption, count) pairs passed to report().
separator = "─" * 55
summary_sections = [
    [
        ("Unique issuerids in BRS PTF (filtered):", brs_positions["portfolio"]["issuerid"].nunique()),
        ("Unique issuerids in BRS BMK (filtered):", brs_positions["benchmark"]["issuerid"].nunique()),
        ("Unique issuerids in BRS PTF (raw):", brs_raw["portfolio"]["issuerid"].nunique()),
        ("Unique issuerids in BRS BMK (raw):", brs_raw["benchmark"]["issuerid"].nunique()),
        ("Unique issuerids in AWS PTF:", len(aws_ptf_ids)),
        ("Unique issuerids in AWS BMK:", len(aws_bmk_ids)),
    ],
    [
        ("Common PTF ids:", len(common_ptf)),
        ("Common BMK ids:", len(common_bmk)),
        ("Only in AWS PTF:", len(aws_only_ptf)),
        ("Only in AWS BMK:", len(aws_only_bmk)),
        ("Only in BRS PTF:", len(brs_only_ptf)),
        ("Only in BRS BMK:", len(brs_only_bmk)),
        ("AWS ids missing from all BRS data:", len(aws_missing_everywhere)),
    ],
]

for section_rows in summary_sections:
    print(separator)
    for row_label, row_count in section_rows:
        report(row_label, row_count)
print(separator)


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


───────────────────────────────────────────────────────
Unique issuerids in BRS PTF (filtered):     808
Unique issuerids in BRS BMK (filtered):   1,152
Unique issuerids in BRS PTF (raw):    2,950
Unique issuerids in BRS BMK (raw):   13,503
Unique issuerids in AWS PTF:          1,129
Unique issuerids in AWS BMK:          6,760
───────────────────────────────────────────────────────
Common PTF ids:                         800
Common BMK ids:                       1,152
Only in AWS PTF:                        329
Only in AWS BMK:                      5,608
Only in BRS PTF:                          8
Only in BRS BMK:                          0
AWS ids missing from all BRS data:    5,690
───────────────────────────────────────────────────────


In [2]:
# Single place to point this part of the notebook at the input folder. Set the
# ISSUERS_DATA_DIR environment variable to run on another machine; when it is
# unset the original local Downloads folder is used, so the paths are unchanged.
DATA_DIR = Path(
    os.environ.get(
        "ISSUERS_DATA_DIR",
        r"C:\Users\n740789\Downloads\text_issuers_brs_dataplatform",
    )
)

# NOTE(review): "portoflio" presumably mirrors the file name on disk - verify
# before correcting the spelling here.
issuers_aws_path = DATA_DIR / "20250603_portoflio_bmk_positions_issuers_extract_aws.csv"
issuers_brs_path = DATA_DIR / "20250603_snt_world_sntcor_corp_shares.xlsx"
str_map_json_path = DATA_DIR / "20250427_esg_map_str_ptf_bmk.json"

In [3]:
# Load the strategy map from the JSON file at str_map_json_path.
with open(str_map_json_path, mode="r", encoding="utf-8") as map_file:
    str_map = json.load(map_file)

# Flatten the nested map into long format: one row per
# (strategy, kind, code), portfolios first, then benchmarks.
records = [
    {
        "strategy": strat,
        "kind": group[:-1],  # drop the plural "s": 'portfolio' | 'benchmark'
        "code": id_,
    }
    for group in ("portfolios", "benchmarks")   # top-level keys
    for strat, ids in str_map[group].items()    # strategy -> list of codes
    for id_ in ids                              # each code
]

df_map_str_bmk_ptf_id = pd.DataFrame(records)

In [None]:
# AWS extract, every column read as text: expose the identifier as "issuerid"
# and keep only the rows that carry one.
issuers_aws = (
    pd.read_csv(issuers_aws_path, dtype=str, low_memory=False)
      .rename(columns={"issuer_id": "issuerid"})
      .dropna(subset=["issuerid"])
)

# BRS workbook sheets, read as text; the first three rows of each sheet are skipped.
issuers_brs_bmk = pd.read_excel(issuers_brs_path, sheet_name="benchmarks", skiprows=3, dtype=str)
issuers_brs_ptf = pd.read_excel(issuers_brs_path, sheet_name="portfolio_positions", skiprows=3, dtype=str)

# issuerids of the rows whose portfolio_ids / benchmark_ids column is populated
aws_ptf_issuerids = issuers_aws.loc[issuers_aws["portfolio_ids"].notna(), "issuerid"]
aws_bmk_issuerids = issuers_aws.loc[issuers_aws["benchmark_ids"].notna(), "issuerid"]

num_unique_issuer_aws_ptf = aws_ptf_issuerids.nunique()
num_unique_issuer_aws_bmk = aws_bmk_issuerids.nunique()
issuerid_unique_id_list_aws_ptf = aws_ptf_issuerids.unique().tolist()
issuerid_unique_id_list_aws_bmk = aws_bmk_issuerids.unique().tolist()

In [None]:
# Both BRS sheets, portfolio first; the count lines printed below follow this order.
brs_df_list = [issuers_brs_ptf, issuers_brs_bmk]

# Normalise each sheet in place: lower-case headers with "_" in place of
# spaces and hyphens. When the sheet has an "aladdin_id" column it is renamed
# to "issuerid", rows without an id are dropped and the ids are cast to str.
for brs_sheet in brs_df_list:
    brs_sheet.columns = brs_sheet.columns.str.lower().str.replace(r"[ \-]", "_", regex=True)
    if "aladdin_id" not in brs_sheet.columns:
        continue
    brs_sheet.rename(columns={"aladdin_id": "issuerid"}, inplace=True)
    brs_sheet.dropna(subset=["issuerid"], inplace=True)
    brs_sheet["issuerid"] = brs_sheet["issuerid"].astype(str)


# Keep the rows flagged 'TRUE' in sntcore_share_corps_flag and the columns used below.
issuers_brs_bmk_clean = issuers_brs_bmk.loc[
    issuers_brs_bmk["sntcore_share_corps_flag"].eq("TRUE"),
    ["issuer_name", "issuerid", "benchmark_id"],
].copy()
issuers_brs_ptf_clean = issuers_brs_ptf.loc[
    issuers_brs_ptf["sntcore_share_corps_flag"].eq("TRUE"),
    ["issuer_name", "issuerid", "portfolio_id"],
].copy()

# Portfolio / benchmark codes listed in the strategy map.
esg_ptf_id_list = df_map_str_bmk_ptf_id.loc[
    df_map_str_bmk_ptf_id["kind"].eq("portfolio"), "code"
].tolist()
esg_bmk_id_list = df_map_str_bmk_ptf_id.loc[
    df_map_str_bmk_ptf_id["kind"].eq("benchmark"), "code"
].tolist()

# Restrict the BRS rows to the portfolios / benchmarks present in the map.
brs_positions_ptf = issuers_brs_ptf_clean.loc[
    issuers_brs_ptf_clean["portfolio_id"].isin(esg_ptf_id_list)
].copy()
brs_positions_bmk = issuers_brs_bmk_clean.loc[
    issuers_brs_bmk_clean["benchmark_id"].isin(esg_bmk_id_list)
].copy()


brs_positions_df_list = [brs_positions_ptf, brs_positions_bmk]

# Unique issuerid counts after filtering (portfolio line first, then benchmark).
for positions in brs_positions_df_list:
    filtered_count = positions["issuerid"].nunique()
    print(f"Number of unique issuerids in BRS DataFrame after filtering: {filtered_count}")

print("\n")

# Same counts on the normalised, unfiltered sheets (same order).
for brs_sheet in brs_df_list:
    unfiltered_count = brs_sheet["issuerid"].nunique()
    print(f"Number of unique issuerids in BRS DataFrame before filtering: {unfiltered_count}")

print("\n")

print(f"Number of unique issuers in AWS portfolio: {num_unique_issuer_aws_ptf}")
print(f"Number of unique issuers in AWS benchmark: {num_unique_issuer_aws_bmk}")

In [None]:
# Membership of every filtered BRS issuerid in the AWS id list of the same kind.
ptf_in_aws = brs_positions_ptf["issuerid"].isin(issuerid_unique_id_list_aws_ptf)
bmk_in_aws = brs_positions_bmk["issuerid"].isin(issuerid_unique_id_list_aws_bmk)

# issuerids found on both sides (order of first appearance in the BRS frame)
common_issuerid_in_brs_positions_ptf_in_aws = brs_positions_ptf.loc[ptf_in_aws, "issuerid"].unique().tolist()
common_issuerid_in_brs_positions_bmk_in_aws = brs_positions_bmk.loc[bmk_in_aws, "issuerid"].unique().tolist()

# issuerids present in the BRS positions but absent from the AWS list of the same kind
missing_issuerid_in_brs_positions_ptf = brs_positions_ptf.loc[~ptf_in_aws, "issuerid"].unique().tolist()
missing_issuerid_in_brs_positions_bmk = brs_positions_bmk.loc[~bmk_in_aws, "issuerid"].unique().tolist()

# issuerids present in the AWS list but absent from the BRS positions of the same kind
missing_issuerid_in_brs_positions_ptf_in_aws = list(
    set(issuerid_unique_id_list_aws_ptf) - set(common_issuerid_in_brs_positions_ptf_in_aws)
)
missing_issuerid_in_brs_positions_bmk_in_aws = list(
    set(issuerid_unique_id_list_aws_bmk) - set(common_issuerid_in_brs_positions_bmk_in_aws)
)
# The portfolio list above was previously reassigned further down to
# "BRS-only ids minus AWS portfolio ids". That merely reproduced
# missing_issuerid_in_brs_positions_ptf and made the
# "... BRS portfolio positions in AWS" line report the wrong number, so the
# reassignment has been removed.

# AWS issuerids (portfolio or benchmark) that appear in no filtered BRS positions at all
issuerid_unique_id_list_aws = list(set(issuerid_unique_id_list_aws_ptf + issuerid_unique_id_list_aws_bmk))
brs_positions_ptf_ids_list = brs_positions_ptf["issuerid"].unique().tolist()
brs_positions_bmk_ids_list = brs_positions_bmk["issuerid"].unique().tolist()
brs_positions_ids_list = list(set(brs_positions_ptf_ids_list + brs_positions_bmk_ids_list))

missing_brs_positions_ids_list = list(set(issuerid_unique_id_list_aws) - set(brs_positions_ids_list))

# print len of every list
print(f"Number of common issuerids in BRS portfolio positions and AWS portfolio: {len(common_issuerid_in_brs_positions_ptf_in_aws)}")
print(f"Number of common issuerids in BRS benchmark positions and AWS benchmark: {len(common_issuerid_in_brs_positions_bmk_in_aws)}")
print(f"Number of missing issuerids in BRS portfolio positions: {len(missing_issuerid_in_brs_positions_ptf)}")
print(f"Number of missing issuerids in BRS benchmark positions: {len(missing_issuerid_in_brs_positions_bmk)}")
print(f"Number of missing issuerids in BRS portfolio positions in AWS: {len(missing_issuerid_in_brs_positions_ptf_in_aws)}")
print(f"Number of missing issuerids in BRS benchmark positions in AWS: {len(missing_issuerid_in_brs_positions_bmk_in_aws)}")
# This overall figure was computed before but never reported.
print(f"Number of AWS issuerids missing from all BRS positions: {len(missing_brs_positions_ids_list)}")