In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Columns requested from the CSV when process_file is called with its default
# argument. The identifier header is spelled "permId" here.
target_columns = [
    "permId", "issuer_name",
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec",
    "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec",
    "str_006_sec", "str_007_sect",
    "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30",
]

# Same selection for the override datafeed: identical except that the
# identifier header is all lower-case ("permid").
target_col_ovr = ["permid"] + target_columns[1:]

In [3]:
def process_file(path:str, target_columns:list=target_columns) -> pd.DataFrame:
    """
    Read a subset of columns from a CSV file and normalise the header casing.

    Parameters:
        path (str): Location of the CSV file, passed straight to pd.read_csv.
        target_columns (list): Header names to load, spelled exactly as they
                               appear in the file.

    Returns:
        pd.DataFrame: The selected columns with lower-cased names, ordered as
                      in `target_columns`. A column named "permId" or "permid"
                      is read as str.
    """
    frame = pd.read_csv(
        path,
        usecols=target_columns,
        # Both spellings are listed so either header casing is read as text.
        dtype={"permId": str, "permid": str},
    )

    # Lower-case every header, then select the columns in the requested order.
    frame.columns = frame.columns.str.lower()
    ordered_names = [name.lower() for name in target_columns]
    return frame[ordered_names]

In [4]:
# Load the feed stored under the inheritance_analysis folder with the default
# column list; process_file lower-cases the headers, so the identifier column
# is "permid" from here on.
# NOTE(review): absolute, user-specific Windows path — as written the notebook
# only runs where this file exists at exactly this location.
df_inherit = process_file(r"C:\Users\n740789\Documents\Projects_local\DataSets\inheritance_analysis\20250215_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv")

In [5]:
# Load the datafeed-with-overrides file; its header uses the lower-case
# "permid" spelling, hence the alternate column list.
# NOTE(review): absolute, user-specific Windows path — as written the notebook
# only runs where this file exists at exactly this location.
ovr = process_file(r"C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\20250301_datafeed_with_ovr.csv", target_columns=target_col_ovr)

In [6]:
# Keep a single row per permid in each frame (the first occurrence wins).
df_inherit = df_inherit.drop_duplicates(subset="permid", keep="first")
ovr = ovr.drop_duplicates(subset="permid", keep="first")

In [7]:
# Left-join df_inherit onto ovr: every row of `ovr` is kept, and columns that
# exist in both frames get the suffix "_ovr" (from ovr) or "_nihr" (from df_inherit).
# NOTE(review): issuer_name is part of the join key, so a permid whose issuer
# name differs between the two files will not match and all of its *_nihr
# columns will be NaN — confirm this is intended.
final = pd.merge(ovr, df_inherit, how="left", on=["permid","issuer_name"], suffixes=("_ovr","_nihr"))

In [8]:
# save first 50 rows
#final.head(50).to_csv(r"C:\Users\n740789\Downloads\inheritance_analusis_50.csv", index=False)

In [9]:
def calculate_strategy_impact(df, strategies):
    """
    Calculate the percentage of transitions between states for a list of strategies.

    For each strategy the rows are grouped by their value in ``<strategy>_ovr``
    ("OK", "EXCLUDED" or "FLAG"). Within each group, the share of rows whose
    ``<strategy>_nihr`` value equals one of the other two states is reported.
    Rows whose new value is missing, unchanged, or any other label stay in the
    denominator but are not counted as a transition.

    Parameters:
        df (pd.DataFrame): The dataframe containing twin columns for each strategy.
                           For each strategy, the old value column should be named <strategy>_ovr
                           and the new value column should be named <strategy>_nihr.
        strategies (list): List of strategy base names (without suffixes).

    Returns:
        pd.DataFrame: One row per strategy with the columns 'strategy',
                      'OK_to_EXCLUDED', 'OK_to_FLAG', 'EXCLUDED_to_OK',
                      'EXCLUDED_to_FLAG', 'FLAG_to_OK', 'FLAG_to_EXCLUDED'.
                      Each transition value is a string such as "12.34%", or
                      None when no row has the source state in the old column.
                      An empty `strategies` list yields an empty frame that
                      still carries these columns.

    Raises:
        KeyError: If ``<strategy>_ovr`` or ``<strategy>_nihr`` is not a column of `df`.
    """
    states = ("OK", "EXCLUDED", "FLAG")
    # Fixed output layout: for every source state, the two other states in order.
    columns = ["strategy"] + [
        f"{src}_to_{dst}" for src in states for dst in states if dst != src
    ]

    results = []

    for strat in strategies:
        old_values = df[f"{strat}_ovr"]
        new_values = df[f"{strat}_nihr"]

        impact = {"strategy": strat}

        for src in states:
            # New-column values of the rows that were `src` in the old column.
            new_for_src = new_values[old_values == src]
            total = len(new_for_src)

            for dst in states:
                if dst == src:
                    continue
                key = f"{src}_to_{dst}"

                if total == 0:
                    # No row starts in this state, so the share is undefined.
                    impact[key] = None
                    continue

                pct = (new_for_src == dst).sum() / total * 100
                # Format each value exactly once, when it is produced. The
                # round-to-3 followed by two-decimal formatting is kept so the
                # rendered strings stay the same as before.
                impact[key] = f"{round(pct, 3):.2f}%"

        results.append(impact)

    return pd.DataFrame(results, columns=columns)


In [10]:
# Base names (no "_ovr" / "_nihr" suffix) of the strategy columns to analyse.
strategies = [
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec",
    "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec",
    "str_006_sec", "str_007_sect",
    "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30",
]

In [11]:
# Per-strategy transition percentages between the "_ovr" and "_nihr" columns
# of the merged frame.
impact_df = calculate_strategy_impact(final, strategies)


In [12]:
# Display the transition table (values are pre-formatted percentage strings).
impact_df

Unnamed: 0,strategy,OK_to_EXCLUDED,OK_to_FLAG,EXCLUDED_to_OK,EXCLUDED_to_FLAG,FLAG_to_OK,FLAG_to_EXCLUDED
0,str_001_s,0.40%,0.06%,1.23%,0.09%,38.51%,6.53%
1,str_002_ec,0.33%,0.00%,27.22%,0.00%,0.00%,0.00%
2,str_003_ec,0.19%,0.03%,26.11%,1.33%,35.15%,2.38%
3,str_003b_ec,0.23%,0.00%,23.84%,0.00%,50.00%,2.38%
4,str_004_asec,1.27%,0.06%,0.84%,0.03%,38.17%,8.33%
5,str_005_ec,0.36%,0.00%,26.17%,0.00%,22.00%,0.00%
6,cs_001_sec,0.75%,0.01%,0.78%,0.01%,56.02%,7.47%
7,cs_003_sec,2.10%,0.07%,0.31%,0.01%,19.96%,28.20%
8,cs_002_ec,0.61%,0.00%,13.84%,0.04%,73.60%,5.62%
9,str_006_sec,0.65%,0.06%,1.74%,0.06%,37.93%,7.36%


In [19]:
def compare_strategies(df, strategies):
    """
    Summarise how the label mix of each strategy differs between its old and new column.

    For every strategy, the share of rows equal to 'OK', 'EXCLUDED' and 'FLAG'
    is computed for the old column (<strategy>_ovr) and for the new column
    (<strategy>_nihr), together with the difference new - old. All shares use
    the total number of rows in `df` as the denominator, so rows holding a
    missing value or any other label count towards the total but towards
    none of the three labels.

    Parameters:
        df (pd.DataFrame): Frame holding the <strategy>_ovr / <strategy>_nihr column pairs.
        strategies (list): Strategy base names (without suffixes).

    Returns a DataFrame with columns:
        strategy,
        OK_ovr, OK_new, OK_diff,
        EXCL_ovr, EXCL_new, EXCL_diff,
        FLAG_ovr, FLAG_new, FLAG_diff
    where every numeric column is a percentage rounded to three decimals.
    """
    # (label as stored in the data, prefix used in the output column names)
    label_prefixes = (("OK", "OK"), ("EXCLUDED", "EXCL"), ("FLAG", "FLAG"))
    n_rows = len(df)

    records = []
    for name in strategies:
        before = df[f"{name}_ovr"]
        after = df[f"{name}_nihr"]

        record = {"strategy": name}
        for label, prefix in label_prefixes:
            pct_before = (before == label).sum() / n_rows * 100
            pct_after = (after == label).sum() / n_rows * 100
            record[f"{prefix}_ovr"] = pct_before
            record[f"{prefix}_new"] = pct_after
            # The difference is taken on the unrounded shares; rounding happens last.
            record[f"{prefix}_diff"] = pct_after - pct_before
        records.append(record)

    summary = pd.DataFrame(records)

    # Round every percentage column to three decimals.
    pct_columns = [
        f"{prefix}_{kind}"
        for _, prefix in label_prefixes
        for kind in ("ovr", "new", "diff")
    ]
    summary[pct_columns] = summary[pct_columns].round(3)

    return summary


In [20]:
# Per-strategy share of OK / EXCLUDED / FLAG rows in the "_ovr" and "_nihr"
# columns of the merged frame, plus the new - old difference.
result_df = compare_strategies(final, strategies)

In [21]:
# Display the label-share comparison table.
result_df

Unnamed: 0,strategy,OK_ovr,OK_new,OK_diff,EXCL_ovr,EXCL_new,EXCL_diff,FLAG_ovr,FLAG_new,FLAG_diff
0,str_001_s,78.178,78.326,0.149,21.181,21.258,0.077,0.641,0.416,-0.225
1,str_002_ec,95.527,96.413,0.886,4.417,3.531,-0.886,0.056,0.056,0.0
2,str_003_ec,97.447,97.953,0.507,1.946,1.612,-0.333,0.608,0.434,-0.173
3,str_003b_ec,98.825,98.91,0.085,1.054,1.032,-0.022,0.121,0.058,-0.064
4,str_004_asec,56.322,56.142,-0.18,43.141,43.522,0.381,0.537,0.336,-0.201
5,str_005_ec,94.827,95.839,1.012,5.101,4.105,-0.996,0.072,0.056,-0.016
6,cs_001_sec,58.469,58.544,0.075,41.183,41.32,0.137,0.348,0.136,-0.212
7,cs_003_sec,39.46,38.921,-0.538,59.875,60.697,0.823,0.665,0.381,-0.284
8,cs_002_ec,95.935,96.064,0.128,3.808,3.881,0.074,0.257,0.055,-0.202
9,str_006_sec,63.15,63.571,0.421,36.222,36.023,-0.199,0.628,0.406,-0.222


In [22]:
# Export both summary tables and the full merged frame as CSV files, without
# the row index.
# NOTE(review): absolute, user-specific Downloads paths — these only work on
# the author's machine as written.
result_df.to_csv(r"C:\Users\n740789\Downloads\result_df_with_ovr.csv", index=False)
impact_df.to_csv(r"C:\Users\n740789\Downloads\impact_df_with_ovr.csv", index=False)
final.to_csv(r"C:\Users\n740789\Downloads\final.csv", index=False)