In [1]:
# Load the features parquet file
import pandas as pd

features_df = pd.read_parquet('../artifacts/features_long.parquet')

# Overview: frame dimensions followed by the full list of column names.
print(f"✅ Loaded features: {features_df.shape}")
column_names = list(features_df.columns)
print(f"Columns: {column_names}")

# Min/max of the 'date' column and the number of distinct 'symbol' values.
first_date, last_date = features_df['date'].min(), features_df['date'].max()
print(f"Date range: {first_date} to {last_date}")
symbol_count = features_df['symbol'].nunique()
print(f"Unique symbols: {symbol_count}")

# Peek at the last rows of the frame.
print("Sample data:")
print(features_df.tail())

✅ Loaded features: (2904927, 147)
Columns: ['high', 'open', 'adjclose', 'volume', 'close', 'low', 'ret', 'ma_10', 'pct_slope_ma_10', 'sign_ma_10', 'ma_20', 'pct_slope_ma_20', 'sign_ma_20', 'ma_30', 'pct_slope_ma_30', 'sign_ma_30', 'ma_50', 'pct_slope_ma_50', 'sign_ma_50', 'ma_75', 'pct_slope_ma_75', 'sign_ma_75', 'ma_100', 'pct_slope_ma_100', 'sign_ma_100', 'ma_150', 'pct_slope_ma_150', 'sign_ma_150', 'ma_200', 'pct_slope_ma_200', 'sign_ma_200', 'trend_score_granular', 'trend_score_sign', 'trend_score_slope', 'trend_persist_ema', 'trend_alignment', 'rv_10', 'rv_20', 'rv_60', 'rv_100', 'rv_ratio_10_60', 'rv_ratio_20_100', 'vol_regime', 'vol_regime_ema10', 'rv_z_60', 'vol_of_vol_20d', 'rv60_slope_norm', 'rv100_slope_norm', 'quiet_trend', 'hurst_ret_64', 'hurst_ret_128', 'hurst_ret_64_emaHL5', 'pct_dist_ma_20', 'pct_dist_ma_20_z', 'pct_dist_ma_50', 'pct_dist_ma_50_z', 'pct_dist_ma_100', 'pct_dist_ma_100_z', 'pct_dist_ma_200', 'pct_dist_ma_200_z', 'min_pct_dist_ma', 'relative_dist_20_50', 

In [2]:
import numpy as np
import pandas as pd

# Per-column data-quality audit of features_df: count missing (NaN) and
# infinite entries, express both as a percentage of the row count, then show
# the worst columns and any column that contains an infinity.

# np.isinf is applied to numeric columns only; every other column keeps the
# default Inf count of 0 set below.
numeric_cols = features_df.select_dtypes(include=[np.number]).columns

nan_counts = features_df.isna().sum()
inf_counts = pd.Series(0, index=features_df.columns)
inf_counts[numeric_cols] = np.isinf(features_df[numeric_cols].to_numpy()).sum(axis=0)

# One row per feature column; Total_rows is the same scalar on every row so
# the percentage columns can be derived directly from this frame.
nan_inf_summary = pd.DataFrame({
    "NaN_count": nan_counts,
    "Inf_count": inf_counts,
    "Total_rows": len(features_df)
})

nan_inf_summary["NaN_pct"] = nan_inf_summary["NaN_count"] / nan_inf_summary["Total_rows"] * 100
nan_inf_summary["Inf_pct"] = nan_inf_summary["Inf_count"] / nan_inf_summary["Total_rows"] * 100

# Rank columns by the larger of their NaN and Inf percentages.
top_nan_inf = (
    nan_inf_summary
    .assign(Max_pct=lambda df: df[["NaN_pct", "Inf_pct"]].max(axis=1))
    .sort_values("Max_pct", ascending=False)
)

# Show top 20 offenders
display(top_nan_inf.head(20))

# Filter to columns with any inf values. This reuses the summary built above
# rather than recomputing the counts, which would yield identical numbers.
inf_only = nan_inf_summary.query("Inf_count > 0")

print(f"Columns with Infs ({len(inf_only)} found):")
display(inf_only.sort_values("Inf_count", ascending=False))

Unnamed: 0,NaN_count,Inf_count,Total_rows,NaN_pct,Inf_pct,Max_pct
alpha_mom_combo_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_mom_sector_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_resid_sector,544993,0,2904927,18.760988,0.0,18.760988
alpha_mom_sector_ema10,542875,0,2904927,18.688077,0.0,18.688077
alpha_mom_combo_ema10,542875,0,2904927,18.688077,0.0,18.688077
pct_dist_ma_200_z,471154,0,2904927,16.219134,0.0,16.219134


Columns with Infs (0 found):


Unnamed: 0,NaN_count,Inf_count,Total_rows,NaN_pct,Inf_pct
