In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Columns requested from each feed file: the "isin" identifier followed by the
# strategy columns, grouped here by name prefix.
target_columns = [
    "isin",
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec",
    "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec",
    "str_006_sec", "str_007_sect",
    "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30",
]

In [5]:
# Location of the 20250401 production equities feed file (absolute path on the
# author's machine).
path_df = Path(r"C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\raw_dataset\2025\20250401_Production\20250401_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv")

# NOTE(review): the output saved under this cell looks like the tail of a pandas
# warning raised by this call (presumably a mixed-type DtypeWarning - confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, at the cost of more memory while parsing.
df = pd.read_csv(path_df, low_memory=False)

  df = pd.read_csv(path_df)


In [8]:
# Show which column names in df contain "isin" (case-insensitive match).
print(df.columns[df.columns.str.contains("isin", case=False)])

Index(['isin'], dtype='object')


In [3]:
def read_file(path:str, target_columns:list=target_columns) -> pd.DataFrame:
    """
    Load selected columns of a CSV file with lower-cased column names.

    Requested names are matched against the file header exactly first and
    case-insensitively otherwise, so a header spelled ``ISIN`` satisfies a
    request for ``isin``.

    Parameters:
        path (str): Location of the CSV file. The file is opened twice
                    (header first, then data), so it must be re-readable.
        target_columns (list): Column names to keep. The returned frame has
                               these columns, lower-cased, in this order.

    Returns:
        pd.DataFrame: The requested columns only; the ``isin`` column, when
                      requested, is read as string.

    Raises:
        ValueError: If a requested column is not present in the file header.
    """
    # Read only the header so the requested names can be resolved to the
    # spelling actually used in the file before the real parse.
    header = list(pd.read_csv(path, nrows=0).columns)
    by_lower = {}
    for name in header:
        # Keep the first header name for each lower-cased spelling.
        by_lower.setdefault(name.lower(), name)

    resolved = []
    missing = []
    for col in target_columns:
        if col in header:
            # An exact match always wins, as in a plain usecols lookup.
            resolved.append(col)
        elif col.lower() in by_lower:
            resolved.append(by_lower[col.lower()])
        else:
            missing.append(col)
    if missing:
        raise ValueError(f"Columns not found in {path}: {missing}")

    # dtype keys must use the header's own spelling to take effect.
    isin_dtype = {name: str for name in resolved if name.lower() == "isin"}
    df = pd.read_csv(path, usecols=resolved, dtype=isin_dtype)
    df.columns = df.columns.str.lower()

    # usecols does not preserve the requested order, so reorder explicitly.
    return df[[col.lower() for col in target_columns]]

In [4]:
# Load the 20250215 feed file from the inheritance_analysis folder, keeping
# read_file's default column selection.
df_inherit = read_file(
    path=r"C:\Users\n740789\Documents\Projects_local\DataSets\inheritance_analysis\20250215_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv",
)

In [5]:
# Load the 20250301 feed file from the inheritance_analysis folder, keeping
# read_file's default column selection. This rebinds the name `df`.
df = read_file(
    path=r"C:\Users\n740789\Documents\Projects_local\DataSets\inheritance_analysis\20250301_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv",
)

In [6]:
# Left join on "isin": every row of df is kept. Overlapping columns coming from
# df receive the "_oihr" suffix and those coming from df_inherit receive
# "_nihr"; rows of df with no matching isin get NaN in the "_nihr" columns.
# NOTE(review): confirm each suffix is attached to the intended snapshot.
final = df.merge(
    df_inherit,
    on="isin",
    how="left",
    suffixes=("_oihr", "_nihr"),
)

In [12]:
# Write the first 50 rows of the merged frame to CSV, without the row index.
final.iloc[:50].to_csv(
    r"C:\Users\n740789\Downloads\inheritance_analysis_security_50.csv",
    index=False,
)

In [8]:
def calculate_strategy_impact(df, strategies):
    """
    Calculate the percentage of transitions between states for a list of strategies.

    For every strategy and every source state (OK, EXCLUDED, FLAG), the rows whose
    old value equals that state form the denominator; the numerator is the number
    of those rows whose new value equals the destination state.

    Parameters:
        df (pd.DataFrame): The dataframe containing twin columns for each strategy.
                           For each strategy, the old value column should be named <strategy>_oihr
                           and the new value column should be named <strategy>_nihr.
        strategies (list): List of strategy base names (without suffixes).

    Returns:
        pd.DataFrame: One row per strategy with the columns 'strategy',
                      'OK_to_EXCLUDED', 'OK_to_FLAG', 'EXCLUDED_to_OK', 'EXCLUDED_to_FLAG',
                      'FLAG_to_OK', 'FLAG_to_EXCLUDED'. Each transition cell is a string
                      such as '12.34%', or None when no row has the source state in the
                      old column. The columns are present even when `strategies` is empty.

    Raises:
        KeyError: If a <strategy>_oihr or <strategy>_nihr column is missing from df.
    """
    # Source state -> destination states, in output column order.
    transitions = {
        "OK": ("EXCLUDED", "FLAG"),
        "EXCLUDED": ("OK", "FLAG"),
        "FLAG": ("OK", "EXCLUDED"),
    }
    columns = ["strategy"] + [
        f"{source}_to_{target}"
        for source, targets in transitions.items()
        for target in targets
    ]

    results = []
    for strat in strategies:
        old_values = df[f"{strat}_oihr"]
        new_values = df[f"{strat}_nihr"]

        impact = {"strategy": strat}
        for source, targets in transitions.items():
            # New values of the rows that were in `source` before.
            new_from_source = new_values[old_values == source]
            total = len(new_from_source)

            for target in targets:
                key = f"{source}_to_{target}"
                if total == 0:
                    # No row starts in this state: the share is undefined.
                    impact[key] = None
                    continue
                moved = int((new_from_source == target).sum())
                # Format straight from the exact ratio; rounding to 3 decimals
                # first could shift the second decimal (double rounding).
                impact[key] = f"{moved / total * 100:.2f}%"

        results.append(impact)

    # Fixed column list keeps the layout stable, including for empty input.
    return pd.DataFrame(results, columns=columns)


In [9]:
# Every column in target_columns other than the "isin" key is a strategy.
# Deriving the list (order preserved) keeps the two definitions from drifting
# apart when a strategy is added or renamed.
strategies = [col for col in target_columns if col != "isin"]

In [10]:
# Build the per-strategy transition table from the merged frame.
impact_df = calculate_strategy_impact(df=final, strategies=strategies)


In [11]:
# NOTE(review): the output saved under this cell shows plain numeric values and
# no FLAG_to_* columns, which does not match what calculate_strategy_impact
# returns in this notebook - re-run the cell to refresh it.
impact_df

Unnamed: 0,strategy,OK_to_EXCLUDED,OK_to_FLAG,EXCLUDED_to_OK,EXCLUDED_to_FLAG
0,str_001_s,1.606228,0.0,0.0,0.0
1,str_002_ec,1.612586,0.0,0.0,0.0
2,str_003_ec,1.534986,0.0,0.0,0.0
3,str_003b_ec,1.529578,0.0,0.0,0.0
4,str_004_asec,2.185751,0.0,0.0,0.0
5,str_005_ec,1.744707,0.0,0.0,0.0
6,cs_001_sec,0.356235,0.0,0.0,0.0
7,cs_003_sec,1.988005,0.0,0.0,0.0
8,cs_002_ec,1.923016,0.0,0.0,0.0
9,str_006_sec,1.833057,0.0,0.0,0.0
