In [None]:
# --- Distribution audit: loads train, test and orig, then compares their category/target distributions ---
import pandas as pd
import numpy as np

# Read the competition splits from the Kaggle input directory.
# (Comment these reads out if train/test/orig are already loaded in this session.)
train = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')

# The original (simulated) data ships as three files of different sizes;
# stack them into one frame with a fresh 0..n-1 index.
orig = pd.concat(
    [
        pd.read_csv(path)
        for path in (
            '/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_100k.csv',
            '/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_10k.csv',
            '/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_2k.csv',
        )
    ],
    ignore_index=True,
)

# Categorical/boolean feature columns summarised by the audit below.
CATS = [
    "road_type",
    "time_of_day",
    "weather",
    "lighting",
    "holiday",
    "school_season",
]
# True when the training frame carries the target column.
HAS_TARGET = "accident_risk" in train.columns

def summarize_feature(df: pd.DataFrame, col: str, with_target: bool) -> pd.DataFrame:
    """Summarise one feature column: per-level row count, share and target stats.

    Parameters
    ----------
    df : frame containing ``col`` (and optionally ``"accident_risk"``).
    col : name of the column to group by.
    with_target : when True *and* ``"accident_risk"`` is present in ``df``,
        the per-level mean and std of ``"accident_risk"`` are reported;
        otherwise those two columns are filled with NaN.

    Returns
    -------
    DataFrame with columns ``[col, "rows", "share", "mean", "std"]`` sorted by
    ``rows`` descending. ``share`` is ``rows / len(df)`` rounded to 4 decimals.

    Notes
    -----
    ``rows`` is the group size in both branches, so it (and ``share``) does not
    shrink when some target values are missing. Rows whose ``col`` value is
    missing are excluded by ``groupby``'s default, so shares may sum to < 1.
    """
    groups = df.groupby(col)
    # Group size, not a non-null count of the target: "rows" must mean the
    # same thing whether or not target statistics are requested.
    out = groups.size().rename("rows").to_frame()

    if with_target and "accident_risk" in df.columns:
        target = groups["accident_risk"]
        # Both results are indexed by the group keys, so they align with `out`.
        out["mean"] = target.mean()
        out["std"] = target.std()
    else:
        out["mean"] = np.nan
        out["std"] = np.nan

    out = out.reset_index().sort_values("rows", ascending=False)
    out["share"] = (out["rows"] / len(df)).round(4)
    return out[[col, "rows", "share", "mean", "std"]]

def binned_accidents(df: pd.DataFrame, q: int = 10):
    """Quantile-bin the ``num_reported_accidents`` column of ``df``.

    Returns the result of ``pd.qcut`` with ``q`` requested quantiles, where
    duplicate bin edges are dropped (so fewer than ``q`` bins may come back),
    or ``None`` when ``df`` has no ``num_reported_accidents`` column.
    """
    column = "num_reported_accidents"
    if column in df.columns:
        return pd.qcut(df[column], q=q, duplicates="drop")
    return None

print("=== Shapes ===")
print(f"train: {train.shape} | test: {test.shape} | orig: {orig.shape}\n")

# Overall target stats (train only)
if HAS_TARGET:
    print("=== Train target stats ===")
    print(train["accident_risk"].agg(["count","mean","std","min","max"]).to_string(), "\n")

# Every summary table is kept here, keyed "<dataset>_<column>".
results = {}

# (label, frame, report target stats?) for each dataset. Target stats are never
# requested for test; for train/orig they are requested only when the column exists.
audit_sources = (
    ("train", train, HAS_TARGET),
    ("test", test, False),
    ("orig", orig, "accident_risk" in orig.columns),
)

# Per-column summaries
for col in CATS:
    for name, frame, use_target in audit_sources:
        print(f"=== {col} ({name}) ===")
        summary = summarize_feature(frame, col, with_target=use_target)
        display(summary.head(20))
        results[f"{name}_{col}"] = summary
    print()

# Binned accident history
train_bins = binned_accidents(train, q=10)
# The per-bin statistics read the target column, so skip the section when train
# has no target (same guard as the overall target stats above) instead of
# failing with a KeyError.
if train_bins is not None and HAS_TARGET:
    dfb = train.assign(num_acc_bin=train_bins)
    print("=== num_reported_accidents (binned) — train ===")
    # num_acc_bin is categorical (pd.qcut output); pass `observed` explicitly so
    # the result does not depend on pandas' changing default for this argument.
    summ = (dfb.groupby("num_acc_bin", observed=False)["accident_risk"]
              .agg(rows="count", mean="mean", std="std")
              .reset_index()
              .sort_values("rows", ascending=False))
    summ["share"] = (summ["rows"]/len(train)).round(4)
    display(summ)
    results["train_num_acc_bins"] = summ
    print()

# Composite key coverage to spot rare combos (great for Group/StratifiedGroupKFold)
def composite_key(df: pd.DataFrame) -> pd.Series:
    """Build a ``road_type|time_of_day|weather`` string key for every row.

    Each of the three columns is cast to ``str`` and the pieces are joined with
    ``"|"``. The returned Series shares ``df``'s index and has no name.

    The concatenation is done column-wise (vectorised) rather than with a
    row-wise ``agg(axis=1)``: that avoids one Python-level call per row on
    large frames and always yields a Series, including for an empty ``df``.
    """
    cols = ["road_type","time_of_day","weather"]  # adjust if needed
    parts = [df[c].astype(str) for c in cols]
    key = parts[0]
    for part in parts[1:]:
        key = key + "|" + part
    # Do not inherit the first column's name; the key is not any single column.
    key.name = None
    return key

print("=== Composite key coverage (road_type|time_of_day|weather) ===")
# Attach the composite key to each frame (adds a "ckey" column in place).
train["ckey"] = composite_key(train)
test["ckey"]  = composite_key(test)
orig["ckey"]  = composite_key(orig)

# Fraction of each dataset's rows that falls under every distinct key.
train_ck = train["ckey"].value_counts().div(len(train)).rename("share_train")
test_ck  = test["ckey"].value_counts().div(len(test)).rename("share_test")
orig_ck  = orig["ckey"].value_counts().div(len(orig)).rename("share_orig")

# Align the three share columns on the key; a key absent from a dataset gets share 0.
ck = pd.concat([train_ck, test_ck, orig_ck], axis=1)
ck = ck.fillna(0)
ck = ck.sort_values("share_test", ascending=False)

# Absolute share differences against test, rounded to 4 decimals.
ck["abs_gap_train_vs_test"] = ck["share_train"].sub(ck["share_test"]).abs().round(4)
ck["abs_gap_orig_vs_test"]  = ck["share_orig"].sub(ck["share_test"]).abs().round(4)
display(ck.head(30))

# Flag rare combos in train that might cause fold sparsity
rare_threshold = 0.002  # 0.2% of rows — tweak as needed
is_rare = ck["share_train"].lt(rare_threshold)
rare_train = ck.loc[is_rare].sort_values("share_train")
print(f"Rare combos in train (<{rare_threshold*100:.1f}% of rows): {len(rare_train)}")
display(rare_train.head(30))

print("\n--- Use this to pick stratification/group keys for your new CV splits. ---")


=== Shapes ===
train: (517754, 14) | test: (172585, 13) | orig: (112000, 13)

=== Train target stats ===
count    517754.000000
mean          0.352377
std           0.166417
min           0.000000
max           1.000000 

=== road_type (train) ===


Unnamed: 0,road_type,rows,share,mean,std
0,highway,173672,0.3354,0.349734,0.165922
1,rural,172719,0.3336,0.349997,0.167185
2,urban,171363,0.331,0.357456,0.166027


=== road_type (test) ===


Unnamed: 0,road_type,rows,share,mean,std
0,highway,58080,0.3365,,
1,rural,57409,0.3326,,
2,urban,57096,0.3308,,


=== road_type (orig) ===


Unnamed: 0,road_type,rows,share,mean,std
0,highway,37467,0.3345,0.382364,0.17897
2,urban,37455,0.3344,0.383942,0.178828
1,rural,37078,0.3311,0.382619,0.179274



=== time_of_day (train) ===


Unnamed: 0,time_of_day,rows,share,mean,std
2,morning,173410,0.3349,0.350966,0.167221
1,evening,172837,0.3338,0.354736,0.164505
0,afternoon,171507,0.3313,0.351428,0.167491


=== time_of_day (test) ===


Unnamed: 0,time_of_day,rows,share,mean,std
2,morning,57747,0.3346,,
1,evening,57629,0.3339,,
0,afternoon,57209,0.3315,,


=== time_of_day (orig) ===


Unnamed: 0,time_of_day,rows,share,mean,std
2,morning,37366,0.3336,0.381874,0.179188
0,afternoon,37326,0.3333,0.383335,0.178996
1,evening,37308,0.3331,0.38372,0.178883



=== weather (train) ===


Unnamed: 0,weather,rows,share,mean,std
1,foggy,181463,0.3505,0.386305,0.167578
0,clear,179306,0.3463,0.31006,0.164891
2,rainy,156985,0.3032,0.361494,0.156094


=== weather (test) ===


Unnamed: 0,weather,rows,share,mean,std
1,foggy,60236,0.349,,
0,clear,59982,0.3476,,
2,rainy,52367,0.3034,,


=== weather (orig) ===


Unnamed: 0,weather,rows,share,mean,std
0,clear,37587,0.3356,0.316538,0.171384
1,foggy,37262,0.3327,0.41597,0.173314
2,rainy,37151,0.3317,0.4171,0.173361



=== lighting (train) ===


Unnamed: 0,lighting,rows,share,mean,std
1,dim,183826,0.355,0.300109,0.141979
0,daylight,178015,0.3438,0.302923,0.142827
2,night,155913,0.3011,0.470467,0.15796


=== lighting (test) ===


Unnamed: 0,lighting,rows,share,mean,std
1,dim,61143,0.3543,,
0,daylight,59397,0.3442,,
2,night,52045,0.3016,,


=== lighting (orig) ===


Unnamed: 0,lighting,rows,share,mean,std
1,dim,37537,0.3352,0.316692,0.152372
2,night,37375,0.3337,0.514757,0.153487
0,daylight,37088,0.3311,0.317262,0.152577



=== holiday (train) ===


Unnamed: 0,holiday,rows,share,mean,std
1,True,260688,0.5035,0.360827,0.167821
0,False,257066,0.4965,0.343809,0.164539


=== holiday (test) ===


Unnamed: 0,holiday,rows,share,mean,std
1,True,87125,0.5048,,
0,False,85460,0.4952,,


=== holiday (orig) ===


Unnamed: 0,holiday,rows,share,mean,std
1,True,56344,0.5031,0.382695,0.178881
0,False,55656,0.4969,0.38326,0.179168



=== school_season (train) ===


Unnamed: 0,school_season,rows,share,mean,std
0,False,260164,0.5025,0.352539,0.167225
1,True,257590,0.4975,0.352214,0.165597


=== school_season (test) ===


Unnamed: 0,school_season,rows,share,mean,std
0,False,86546,0.5015,,
1,True,86039,0.4985,,


=== school_season (orig) ===


Unnamed: 0,school_season,rows,share,mean,std
0,False,56066,0.5006,0.383138,0.179494
1,True,55934,0.4994,0.382813,0.178551



=== num_reported_accidents (binned) — train ===


Unnamed: 0,num_acc_bin,rows,mean,std,share
0,"(-0.001, 1.0]",334943,0.333714,0.155347,0.6469
1,"(1.0, 2.0]",145965,0.342967,0.158062,0.2819
2,"(2.0, 7.0]",36846,0.559311,0.156446,0.0712



=== Composite key coverage (road_type|time_of_day|weather) ===


Unnamed: 0_level_0,share_train,share_test,share_orig,abs_gap_train_vs_test,abs_gap_orig_vs_test
ckey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
urban|evening|foggy,0.040963,0.041267,0.036946,0.0003,0.0043
highway|morning|clear,0.040228,0.040774,0.03808,0.0005,0.0027
highway|evening|foggy,0.040092,0.039951,0.037607,0.0001,0.0023
rural|evening|foggy,0.04028,0.039893,0.036134,0.0004,0.0038
highway|afternoon|clear,0.039691,0.039789,0.036973,0.0001,0.0028
rural|afternoon|clear,0.039109,0.039459,0.037848,0.0003,0.0016
rural|morning|clear,0.039322,0.039395,0.036964,0.0001,0.0024
highway|afternoon|foggy,0.03836,0.039239,0.036991,0.0009,0.0022
highway|evening|clear,0.038694,0.038833,0.037884,0.0001,0.0009
rural|afternoon|foggy,0.038503,0.038526,0.036304,0.0,0.0022


Rare combos in train (<0.2% of rows): 0


Unnamed: 0_level_0,share_train,share_test,share_orig,abs_gap_train_vs_test,abs_gap_orig_vs_test
ckey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1



--- Use this to pick stratification/group keys for your new CV splits. ---
