In [1]:
import sys
from pathlib import Path
from datetime import datetime

# Get the parent directory of current notebook dir, which is the repo root
repo_root = Path.cwd().parent
sys.path.insert(0, str(repo_root))

# Verify the path is correct:
print(f"Added to sys.path: {repo_root}")

import pandas as pd


# Now import your module
from scripts.utils.config import get_config

Added to sys.path: c:\Users\n740789\Documents\clarity_data_quality_controls


In [2]:
config = get_config("explore-sustainalytics-nasdaq-meetrics", auto_date=False, fixed_date="202505")
logger = config["logger"]
sustain_dir = config["SUSTAINALYTICS_DATA_DIR"]
nasdaq_dir = config["NASDAQ_DATA_DIR"]
sutainalytics_path = config["paths"]["SUSTAINALYTICS_DATA_PATH"]
nasdaq_path = config["paths"]["NASDAQ_DATA_PATH"]

In [3]:
out_nasdaq_csv_file_path = nasdaq_dir / "202505_esg_nasdad_flag.csv"
out_sustain_csv_file_path = sustain_dir / "202505_esg_sustain_flag.csv"

In [4]:
def clean_cols(col_name:str)->str:
    """
    Cleans the column names by removing unwanted characters and spaces.
    """
    # Remove unwanted characters and spaces
    col_name = col_name.replace("(Wt Avg-PORT Delta NMV)","").replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_").lower()
    return col_name

def clean_df(df: pd.DataFrame, target_cols: list[str], how: str = "all") -> pd.DataFrame:
    """
    Drop rows based on NaN values in target columns.

    Parameters:
    - df: The input DataFrame.
    - target_cols: List of column names to check.
    - how: "all" (drop rows if *all* target columns are NaN),
           "any" (drop rows if *any* target column is NaN).
    """
    if how not in {"all", "any"}:
        raise ValueError("Parameter 'how' must be either 'all' or 'any'")

    # Create the mask
    if how == "all":
        mask = df[target_cols].isna().all(axis=1)
    else:  # how == "any"
        mask = df[target_cols].isna().any(axis=1)

    return df[~mask]

In [5]:
nasdaq = pd.read_excel(nasdaq_path, skiprows=3)
sutainalytics = pd.read_excel(sutainalytics_path, skiprows=3)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [6]:
nasdaq.columns = [clean_cols(col) for col in nasdaq.columns]
sutainalytics.columns = [clean_cols(col) for col in sutainalytics.columns]
nasdaq_clean = clean_df(nasdaq, ["cusip", "nasdaq_bnd"], how="any")
sutainalytics_clean = clean_df(sutainalytics, ["issuer_id", "security_description"], how="any")
sutainalytics_clean = clean_df(sutainalytics_clean, ["highest_controversy_level_answer_category", "overall_global_compact_compliance_status"], how="all")

In [7]:
# save cleaned df into csv files
#nasdaq_clean.to_csv(out_nasdaq_csv_file_path, index=False)
#sutainalytics_clean.to_csv(out_sustain_csv_file_path, index=False)

In [8]:
sutainalytics_clean.head()

Unnamed: 0,issuer_id,security_description,highest_controversy_level_answer_category,overall_global_compact_compliance_status
174,R87118,BANCO SANTANDER MEXICO SA INSTITUC,3.0,Compliant
184,007699,BANCO SANTANDER SA,3.0,Compliant
185,007699,BANCO SANTANDER SA,3.0,Compliant
186,007699,BANCO SANTANDER SA,3.0,Compliant
187,007699,BANCO SANTANDER SA,3.0,Compliant


In [9]:
nasdaq_clean.head()

Unnamed: 0,cusip,issuer_id,security_description,nasdaq_bnd
1591,BDE0J4TT3,R65824,KUNTARAHOITUS OYJ MTN RegS,Green Bond
1593,BDE0K2X29,R62038,KFW MTN RegS,Green Bond
1609,BDE0WDPV7,95753A,NRW BANK MTN RegS,Green Bond
1614,BDE0ZLN71,R62035,BASQUE AUTONOMOUS COMMUNITY OF RegS,Sust. Bond
1616,BDE1098Z9,833653,SOCIETE NATIONALE SNCF SA RegS,Green Bond


In [10]:
nasdaq_clean.nasdaq_bnd.unique()

array(['Green Bond', 'Sust. Bond', 'Social Bond', 'Linked Bond'],
      dtype=object)

In [11]:
sutainalytics_clean.overall_global_compact_compliance_status.unique()

array(['Compliant', 'Watchlist', 'Non-Compliant', nan], dtype=object)

In [17]:
# print columns datatype
for name, df in zip(["esg_fact_security_nasdaq", "esg_fact_issuer_sutainalytics"],[nasdaq_clean, sutainalytics_clean]):
    print(f"THIS IS A DATAFRAME NAME {name} and below its columns")
    for c in df.columns:
        print(f"'{c}': {df[c].dtype},")
    print("\n\n\n")

THIS IS A DATAFRAME NAME esg_fact_security_nasdaq and below its columns
'cusip': object,
'issuer_id': object,
'security_description': object,
'nasdaq_bnd': object,




THIS IS A DATAFRAME NAME esg_fact_issuer_sutainalytics and below its columns
'issuer_id': object,
'security_description': object,
'highest_controversy_level_answer_category': object,
'overall_global_compact_compliance_status': object,






In [16]:
# convert sutainalytics_clean["highest_controversy_level_answer_category"] to int
sutainalytics_clean["highest_controversy_level_answer_category"] = sutainalytics_clean["highest_controversy_level_answer_category"].apply(lambda x : int(x) if pd.notna(x) else pd.NA).copy()