In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
# Columns requested from the equities feed files: the issuer identifier and
# name, followed by one column per strategy.
target_columns = [
    "permId", "issuer_name",
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec", "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec", "str_006_sec", "str_007_sect", "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30",
]
# Same selection for the override datafeed; only the identifier header differs
# (all-lowercase "permid" instead of "permId").
target_col_ovr = ["permid"] + target_columns[1:]

In [4]:
def process_file(path:str, target_columns:list=target_columns) -> pd.DataFrame:
    """
    Read selected columns of a CSV file and return them with lower-cased names.

    Parameters:
        path (str): Location of the CSV file to read.
        target_columns (list): Header names to load, spelled exactly as they
                               appear in the file. Defaults to the module-level
                               ``target_columns`` list.

    Returns:
        pd.DataFrame: The requested columns, renamed to lower case and ordered
                      as in ``target_columns``. A column headed "permId" or
                      "permid" is read as ``str``.
    """
    # Both spellings are listed so the identifier is parsed as text whichever
    # header the file uses.
    identifier_dtypes = {"permId": str, "permid": str}
    raw = pd.read_csv(path, usecols=target_columns, dtype=identifier_dtypes)

    lowered = raw.rename(columns=str.lower)
    ordered_names = list(map(str.lower, target_columns))
    return lowered[ordered_names]

In [6]:
# Equities feed file dated 20250201 (per its file name), read with the default
# column selection of process_file.
df_inherit = process_file(
    path=r"C:\Users\n740789\Documents\Projects_local\DataSets\inheritance_analysis\20250201_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv"
)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\n740789\\Documents\\Projects_local\\DataSets\\inheritance_analysis\\20250201_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv'

In [1]:
# Equities feed file dated 20250301 (per its file name), read with the default
# column selection of process_file.
df = process_file(
    path=r"C:\Users\n740789\Documents\Projects_local\DataSets\inheritance_analysis\20250301_Equities_feed_new_strategies_filtered_old_names_iso_permId.csv"
)

NameError: name 'process_file' is not defined

In [6]:
# Datafeed-with-overrides file dated 20250301 (per its file name); its
# identifier header is lowercase, hence the separate column list.
ovr = process_file(
    path=r"C:\Users\n740789\Documents\Projects_local\DataSets\DATAFEED\datafeeds_with_ovr\20250301_datafeed_with_ovr.csv",
    target_columns=target_col_ovr,
)

In [7]:
# remove duplicate on permid column
# Each frame keeps only the first row seen for a given permid, so the merges
# that follow have at most one row per identifier on either side.
df, df_inherit, ovr = (
    frame.drop_duplicates(subset=["permid"])
    for frame in (df, df_inherit, ovr)
)

In [8]:
# Left join: every row of `df` is kept. Strategy columns coming from `df` get
# the "_oihr" suffix and those from `df_inherit` get "_nihr"; rows of `df`
# with no (permid, issuer_name) match in `df_inherit` have NaN in "_nihr".
# NOTE(review): `df` is loaded from the 20250301 file and `df_inherit` from the
# 20250201 file, while later cells describe "_oihr" as the old value and
# "_nihr" as the new one -- confirm this assignment is the intended one.
final = df.merge(
    df_inherit,
    how="left",
    on=["permid", "issuer_name"],
    suffixes=("_oihr", "_nihr"),
)

In [9]:
# Merge final with ovr
# Every override column except the two join keys gets an "_ovr" suffix so it
# can sit next to the "_oihr"/"_nihr" columns already present in `final`.
# NOTE: this cell rebinds both `ovr` and `final`; running it a second time in
# the same session would suffix the override columns again and add a second,
# doubly suffixed set of override columns to `final`.
over_suffixes = dict(
    (col, f"{col}_ovr")
    for col in ovr.columns
    if col not in ["permid", "issuer_name"]
)
ovr = ovr.rename(columns=over_suffixes)
final = final.merge(
    ovr,
    how="left",
    on=["permid", "issuer_name"],
)

In [10]:
# Sort columns
# Strategy base names, written once and reused for both the column ordering
# and the per-strategy sheets below.
strategy_names = [
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec", "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec", "str_006_sec", "str_007_sect", "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30"
]

# Keys first, then for each strategy its three variants side by side.
sorted_columns = ["permid", "issuer_name"] + [
    column
    for strategy in strategy_names
    for column in (f"{strategy}_oihr", f"{strategy}_nihr", f"{strategy}_ovr")
]

final = final[sorted_columns]

# Run date, used as the prefix of the output file name.
DATE = datetime.now().strftime("%Y%m%d")

# Save to Excel
output_file = rf"C:\Users\n740789\Downloads\{DATE}_inheritance_analysis_with_ovr.xlsx"
with pd.ExcelWriter(output_file) as writer:
    # Sheet "summary": the full, re-ordered table.
    final.to_excel(writer, sheet_name="summary", index=False)

    # One further sheet per strategy, holding the keys and that strategy's
    # three columns only.
    for strategy in strategy_names:
        columns_to_save = ["permid", "issuer_name", f"{strategy}_oihr", f"{strategy}_nihr", f"{strategy}_ovr"]
        final[columns_to_save].to_excel(writer, sheet_name=strategy, index=False)

print(f"Excel file saved as {output_file}")


Excel file saved as C:\Users\n740789\Downloads\20250228_inheritance_analusis_with_ovr.xlsx


In [None]:
# NOTE(review): `stop` is not defined in any cell shown here, so evaluating it
# presumably raises NameError and halts a "Run All" at this point -- confirm
# this is intentional before relying on the cells below being skipped.
stop

In [8]:
# save first 50 rows
#final.head(50).to_csv(r"C:\Users\n740789\Downloads\inheritance_analusis_50.csv", index=False)

In [9]:
def calculate_strategy_impact(df, strategies):
    """
    Calculate the percentage of transitions between states for a list of strategies.

    For each strategy the rows are grouped by their old state ("OK",
    "EXCLUDED" or "FLAG"). Within each group, the share of rows whose new
    state is one of the two *other* states is reported.

    Parameters:
        df (pd.DataFrame): The dataframe containing twin columns for each strategy.
                           For each strategy, the old value column should be named <strategy>_oihr
                           and the new value column should be named <strategy>_nihr.
        strategies (list): List of strategy base names (without suffixes).

    Returns:
        pd.DataFrame: One row per strategy with the columns
                      'strategy', 'OK_to_EXCLUDED', 'OK_to_FLAG',
                      'EXCLUDED_to_OK', 'EXCLUDED_to_FLAG',
                      'FLAG_to_OK', 'FLAG_to_EXCLUDED'.
                      Each transition value is a string such as "0.16%"
                      (percentage of the rows that were in the old state), or
                      None when no row has that old state. An empty
                      ``strategies`` list gives an empty DataFrame.

    Raises:
        KeyError: If a <strategy>_oihr or <strategy>_nihr column is missing from ``df``.
    """
    # (old state, new states) in the order the output columns must appear.
    transitions = (
        ("OK", ("EXCLUDED", "FLAG")),
        ("EXCLUDED", ("OK", "FLAG")),
        ("FLAG", ("OK", "EXCLUDED")),
    )

    results = []

    for strat in strategies:
        old_col = f"{strat}_oihr"
        new_col = f"{strat}_nihr"

        impact = {"strategy": strat}

        for old_state, new_states in transitions:
            # Rows that were in `old_state`; they are the denominator.
            rows_from = df[df[old_col] == old_state]
            total = len(rows_from)

            for new_state in new_states:
                key = f"{old_state}_to_{new_state}"
                if total > 0:
                    moved = (rows_from[new_col] == new_state).sum()
                    pct = (moved / total) * 100
                    # Formatted once, here, instead of re-scanning the whole
                    # results list on every strategy. The value is rounded to
                    # three decimals and then rendered with two, as before.
                    impact[key] = f"{round(pct, 3):.2f}%"
                else:
                    # No row in this old state: the percentage is undefined.
                    impact[key] = None

        results.append(impact)

    return pd.DataFrame(results)


In [10]:
# Base names of the strategies to analyse; the "_oihr"/"_nihr" suffixes are
# appended by the analysis functions.
strategies = [
    "str_001_s", "str_002_ec", "str_003_ec", "str_003b_ec", "str_004_asec", "str_005_ec",
    "cs_001_sec", "cs_003_sec", "cs_002_ec", "str_006_sec", "str_007_sect", "art_8_basicos",
    "gp_esccp", "gp_esccp_22", "gp_esccp_25", "gp_esccp_30",
]

In [11]:
# Per-strategy transition percentages between the "_oihr" and "_nihr" columns
# of `final`.
impact_df = calculate_strategy_impact(df=final, strategies=strategies)


In [12]:
# Show the transition table as the cell's output.
impact_df

Unnamed: 0,strategy,OK_to_EXCLUDED,OK_to_FLAG,EXCLUDED_to_OK,EXCLUDED_to_FLAG,FLAG_to_OK,FLAG_to_EXCLUDED
0,str_001_s,0.16%,0.00%,0.00%,0.00%,0.00%,0.00%
1,str_002_ec,0.30%,0.00%,0.00%,0.00%,0.00%,0.00%
2,str_003_ec,0.13%,0.00%,0.00%,0.00%,0.00%,0.00%
3,str_003b_ec,0.13%,0.00%,0.00%,0.00%,0.00%,0.00%
4,str_004_asec,0.74%,0.00%,0.00%,0.00%,0.00%,0.85%
5,str_005_ec,0.37%,0.00%,0.00%,0.00%,0.00%,0.00%
6,cs_001_sec,0.34%,0.00%,0.00%,0.00%,0.00%,0.00%
7,cs_003_sec,0.20%,0.00%,0.00%,0.00%,0.00%,0.00%
8,cs_002_ec,0.59%,0.00%,0.00%,0.00%,0.00%,0.00%
9,str_006_sec,0.40%,0.00%,0.00%,0.00%,0.00%,0.71%


In [13]:
def compare_strategies(df, strategies):
    """
    For each strategy, compute the percentage of rows labeled as 'OK', 'EXCLUDED', or 'FLAG'
    in the old (oihr) and new (nihr) columns. Also compute the difference in percentages
    between old and new (new - old).

    Percentages use the total number of rows in ``df`` as the denominator, so
    rows holding any other value (or a missing value) count towards the total
    but towards none of the three labels.

    Parameters:
        df (pd.DataFrame): Frame with a <strategy>_oihr and a <strategy>_nihr
                           column for every entry of ``strategies``.
        strategies (list): Strategy base names (without suffixes).

    Returns a DataFrame with columns:
        strategy,
        OK_old, OK_new, OK_diff,
        EXCL_old, EXCL_new, EXCL_diff,
        FLAG_old, FLAG_new, FLAG_diff
    Numeric columns are rounded to three decimals. An empty ``strategies``
    list gives an empty DataFrame that still has these columns; an empty
    ``df`` gives NaN in every numeric column.

    Raises:
        KeyError: If a <strategy>_oihr or <strategy>_nihr column is missing from ``df``.
    """
    # (prefix used in the output column names, state value looked up in the data)
    states = (("OK", "OK"), ("EXCL", "EXCLUDED"), ("FLAG", "FLAG"))
    value_columns = [
        f"{prefix}_{kind}"
        for prefix, _ in states
        for kind in ("old", "new", "diff")
    ]

    total_rows = len(df)

    def _pct(column, state):
        # Share of all rows whose `column` equals `state`, in percent.
        count = (df[column] == state).sum()
        if total_rows == 0:
            # Explicit NaN rather than relying on a 0/0 division warning.
            return float("nan")
        return count / total_rows * 100

    results = []

    for strat in strategies:
        old_col = f"{strat}_oihr"
        new_col = f"{strat}_nihr"

        row = {"strategy": strat}
        for prefix, state in states:
            old_pct = _pct(old_col, state)
            new_pct = _pct(new_col, state)
            row[f"{prefix}_old"] = old_pct
            row[f"{prefix}_new"] = new_pct
            # Differences (new - old)
            row[f"{prefix}_diff"] = new_pct - old_pct
        results.append(row)

    # Passing the column list explicitly keeps the schema stable even when
    # `results` is empty; otherwise the rounding step below would fail with a
    # KeyError on the missing columns.
    df_out = pd.DataFrame(results, columns=["strategy", *value_columns])

    # Round numeric columns to three decimals
    df_out[value_columns] = df_out[value_columns].round(3)

    return df_out


In [14]:
# Share of rows per state in the "_oihr" and "_nihr" columns of `final`, with
# the difference between the two, for every strategy.
result_df = compare_strategies(df=final, strategies=strategies)

In [15]:
# Show the comparison table as the cell's output.
result_df

Unnamed: 0,strategy,OK_old,OK_new,OK_diff,EXCL_old,EXCL_new,EXCL_diff,FLAG_old,FLAG_new,FLAG_diff
0,str_001_s,78.455,78.326,-0.128,21.129,21.258,0.128,0.416,0.416,0.0
1,str_002_ec,96.705,96.413,-0.292,3.239,3.531,0.292,0.056,0.056,0.0
2,str_003_ec,98.082,97.953,-0.128,1.484,1.612,0.128,0.434,0.434,0.0
3,str_003b_ec,99.039,98.91,-0.128,0.904,1.032,0.128,0.058,0.058,0.0
4,str_004_asec,56.559,56.142,-0.417,43.102,43.522,0.42,0.339,0.336,-0.003
5,str_005_ec,96.198,95.839,-0.359,3.746,4.105,0.359,0.056,0.056,0.0
6,cs_001_sec,58.746,58.544,-0.202,41.118,41.32,0.202,0.136,0.136,0.0
7,cs_003_sec,39.001,38.921,-0.079,60.618,60.697,0.079,0.381,0.381,0.0
8,cs_002_ec,96.63,96.064,-0.566,3.316,3.881,0.566,0.055,0.055,0.0
9,str_006_sec,63.828,63.571,-0.257,35.763,36.023,0.26,0.408,0.406,-0.003


In [16]:
# Export result_df, impact_df and final to the Downloads folder as CSV files,
# without the row index.
result_df.to_csv(
    path_or_buf=r"C:\Users\n740789\Downloads\result_df.csv",
    index=False,
)
impact_df.to_csv(
    path_or_buf=r"C:\Users\n740789\Downloads\impact_df.csv",
    index=False,
)
final.to_csv(
    path_or_buf=r"C:\Users\n740789\Downloads\final.csv",
    index=False,
)