In [17]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [18]:
# Input locations and the folder that receives every CSV written by this notebook.
# All three are relative paths, so they resolve against the current working directory.
CSV_PATH = "final_data_new_labels.csv"
IMAGES_ROOT = "processed_data"

OUT_DIR = Path("report_dataset_stats")
# parents=True keeps this working if OUT_DIR is later pointed at a nested path.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Fail fast with explicit exceptions instead of `assert`: assertions are removed
# when Python runs with -O, which would silently skip these input checks.
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")
if not os.path.isdir(IMAGES_ROOT):
    raise FileNotFoundError(f"Image folder not found: {IMAGES_ROOT}")

print("CSV_PATH:", CSV_PATH)
print("IMAGES_ROOT:", IMAGES_ROOT)
print("OUT_DIR:", OUT_DIR.resolve())

CSV_PATH: final_data_new_labels.csv
IMAGES_ROOT: processed_data
OUT_DIR: /Users/lisawang/Cornell/25Fall/AML/final/Vision-Based-Safety-Assessment-for-Pedestrian-Street-Crossing/4-dataset_and_dataloader/report_dataset_stats


In [20]:
# Load the label table; every statistic in this notebook is computed from `df`.
df = pd.read_csv(CSV_PATH)

print("shape:", df.shape)
print("columns:", df.columns.tolist())

# Basic presence check for the columns this notebook relies on, so a missing one
# is noticed early. The result is only printed; nothing is raised here.
required_cols = [
    "new_filename",
    "safe_to_walk",
    "roadway_width",
    "crosswalk",
    "crosswalk_signal",
    "traffic_light",
    "car",
    "scooter",
    "bike",
    "other_obstacles",
    "no_obstacle_in_crosswalk",
    "weather",
    "subset",
]
missing_required = [name for name in required_cols if name not in df.columns]
print("missing_required_cols:", missing_required)

shape: (1148, 18)
columns: ['Unnamed: 0', 'path', 'filename', 'split_filename', 'original_filename', 'new_filename', 'safe_to_walk', 'roadway_width', 'crosswalk', 'crosswalk_signal', 'traffic_light', 'car', 'scooter', 'bike', 'other_obstacles', 'no_obstacle_in_crosswalk', 'weather', 'subset']
missing_required_cols: []


In [21]:
# Rows per value of the "subset" column (NaN kept as its own bucket), and the
# same distribution expressed as a percentage of all rows, rounded to 2 decimals.
subset_series = df["subset"]
subset_counts = subset_series.value_counts(dropna=False)
subset_share = subset_series.value_counts(normalize=True, dropna=False)
subset_pct = (subset_share * 100).round(2)

# One print call with a newline separator; the emitted text matches four separate prints.
print("Subset counts:", subset_counts, "\nSubset %:", subset_pct, sep="\n")

# Persist both tables next to the other report artifacts.
for table, csv_name in (
    (subset_counts, "subset_counts.csv"),
    (subset_pct, "subset_percent.csv"),
):
    table.to_csv(OUT_DIR / csv_name)

Subset counts:
subset
train       956
reserved     78
val          57
test         57
Name: count, dtype: int64

Subset %:
subset
train       83.28
reserved     6.79
val          4.97
test         4.97
Name: proportion, dtype: float64


In [22]:
safe_col = "safe_to_walk"

# Overall distribution of the safe_to_walk label, ordered by label value.
safe_labels = df[safe_col]
safe_counts = safe_labels.value_counts(dropna=False).sort_index()
safe_share = safe_labels.value_counts(normalize=True, dropna=False).sort_index()
safe_pct = (safe_share * 100).round(2)

print("Overall safe_to_walk counts:")
print(safe_counts)
print("\nOverall safe_to_walk %:")
print(safe_pct)

# Same label broken down per subset: raw counts, then row-normalised percentages
# (each subset's row sums to 100).
safe_by_subset_counts = pd.crosstab(df["subset"], safe_labels, dropna=False)
safe_by_subset_share = pd.crosstab(
    df["subset"], safe_labels, normalize="index", dropna=False
)
safe_by_subset_pct = (safe_by_subset_share * 100).round(2)

print("\nSafe_to_walk by subset (counts):")
print(safe_by_subset_counts)
print("\nSafe_to_walk by subset (% within subset):")
print(safe_by_subset_pct)

# Write the four tables in the same order they were printed.
safe_counts.to_csv(OUT_DIR / "safe_to_walk_counts_overall.csv")
safe_pct.to_csv(OUT_DIR / "safe_to_walk_percent_overall.csv")
safe_by_subset_counts.to_csv(OUT_DIR / "safe_to_walk_counts_by_subset.csv")
safe_by_subset_pct.to_csv(OUT_DIR / "safe_to_walk_percent_by_subset.csv")

Overall safe_to_walk counts:
safe_to_walk
0    613
1    535
Name: count, dtype: int64

Overall safe_to_walk %:
safe_to_walk
0    53.4
1    46.6
Name: proportion, dtype: float64

Safe_to_walk by subset (counts):
safe_to_walk    0    1
subset                
reserved       78    0
test           34   23
train         473  483
val            28   29

Safe_to_walk by subset (% within subset):
safe_to_walk       0      1
subset                     
reserved      100.00   0.00
test           59.65  40.35
train          49.48  50.52
val            49.12  50.88


In [23]:
weather_col = "weather"

# Overall distribution of the raw weather codes, ordered by code.
weather_counts = df[weather_col].value_counts(dropna=False).sort_index()
weather_pct = (
    df[weather_col].value_counts(normalize=True, dropna=False).sort_index() * 100
).round(2)

print("Overall weather counts:", weather_counts, sep="\n")
print("\nOverall weather %:", weather_pct, sep="\n")

# Shared crosstab arguments: subsets as rows, weather codes as columns.
subset_vs_weather = dict(index=df["subset"], columns=df[weather_col], dropna=False)
weather_by_subset_counts = pd.crosstab(**subset_vs_weather)
# normalize="index" makes each subset's row sum to 1 before scaling to percent.
weather_by_subset_pct = (
    pd.crosstab(normalize="index", **subset_vs_weather) * 100
).round(2)

print("\nWeather by subset (counts):", weather_by_subset_counts, sep="\n")
print("\nWeather by subset (% within subset):", weather_by_subset_pct, sep="\n")

weather_counts.to_csv(OUT_DIR / "weather_counts_overall.csv")
weather_pct.to_csv(OUT_DIR / "weather_percent_overall.csv")
weather_by_subset_counts.to_csv(OUT_DIR / "weather_counts_by_subset.csv")
weather_by_subset_pct.to_csv(OUT_DIR / "weather_percent_by_subset.csv")

Overall weather counts:
weather
0    741
1    367
2     40
Name: count, dtype: int64

Overall weather %:
weather
0    64.55
1    31.97
2     3.48
Name: proportion, dtype: float64

Weather by subset (counts):
weather     0    1   2
subset                
reserved   55   19   4
test       38   17   2
train     605  317  34
val        43   14   0

Weather by subset (% within subset):
weather       0      1     2
subset                      
reserved  70.51  24.36  5.13
test      66.67  29.82  3.51
train     63.28  33.16  3.56
val       75.44  24.56  0.00


In [24]:
# TODO: replace these three placeholder strings with the real meanings,
# for example "day", "rain", "night".
weather_map = {0: "TBD_weather0", 1: "TBD_weather1", 2: "TBD_weather2"}

# Codes missing from weather_map become NaN; dropna=False keeps them visible in the tallies.
mapped = df[weather_col].map(weather_map)
mapped_counts = mapped.value_counts(dropna=False)
mapped_pct = (mapped.value_counts(normalize=True, dropna=False) * 100).round(2)

print(mapped_counts)
print(mapped_pct)

mapped_counts.to_csv(OUT_DIR / "weather_semantic_counts_overall.csv")
mapped_pct.to_csv(OUT_DIR / "weather_semantic_percent_overall.csv")

weather
TBD_weather0    741
TBD_weather1    367
TBD_weather2     40
Name: count, dtype: int64
weather
TBD_weather0    64.55
TBD_weather1    31.97
TBD_weather2     3.48
Name: proportion, dtype: float64


In [25]:
# Draw up to 5 files per weather code and list their names, so the images can be
# opened from processed_data/ and checked by eye.
weather_codes = sorted(df[weather_col].dropna().unique())
for code in weather_codes:
    in_class = df[weather_col] == code
    # Never request more rows than the class contains.
    n_draw = min(5, in_class.sum())
    # Fixed random_state so the same files are listed on every run.
    sample_files = df["new_filename"][in_class].sample(n=n_draw, random_state=0).tolist()
    print(f"\nweather={code} sample files:")
    for name in sample_files:
        print(" ", name)


weather=0 sample files:
  000320.jpg
  000299.jpg
  000024.jpg
  000052.jpg
  000640.jpg

weather=1 sample files:
  000309.jpg
  000760.jpg
  000128.jpg
  000077.jpg
  000221.jpg

weather=2 sample files:
  000494.jpg
  000469.jpg
  000619.jpg
  000115.jpg
  000332.jpg


In [26]:
# Columns treated as binary flags; any that are absent from df are dropped.
binary_candidates = (
    "crosswalk",
    "car",
    "scooter",
    "bike",
    "other_obstacles",
    "no_obstacle_in_crosswalk",
)
binary_cols = [name for name in binary_candidates if name in df.columns]

print("Binary columns:", binary_cols)

# The column mean scaled by 100 is reported as the positive rate in percent.
overall_rate = df[binary_cols].mean(numeric_only=True) * 100
binary_pos_pct_overall = overall_rate.round(2).sort_values(ascending=False)

per_subset_rate = df.groupby("subset")[binary_cols].mean(numeric_only=True) * 100
binary_pos_pct_by_subset = per_subset_rate.round(2)

print("\nOverall positive rate (%):", binary_pos_pct_overall, sep="\n")
print("\nPositive rate by subset (%):", binary_pos_pct_by_subset, sep="\n")

binary_pos_pct_overall.to_csv(OUT_DIR / "binary_positive_rate_overall_percent.csv")
binary_pos_pct_by_subset.to_csv(OUT_DIR / "binary_positive_rate_by_subset_percent.csv")

Binary columns: ['crosswalk', 'car', 'scooter', 'bike', 'other_obstacles', 'no_obstacle_in_crosswalk']

Overall positive rate (%):
crosswalk                   99.91
no_obstacle_in_crosswalk    71.78
car                         20.73
scooter                      8.54
bike                         3.83
other_obstacles              0.70
dtype: float64

Positive rate by subset (%):
          crosswalk    car  scooter  bike  other_obstacles  \
subset                                                       
reserved      100.0  35.90    21.79  2.56             1.28   
test          100.0  33.33     7.02  7.02             1.75   
train          99.9  18.83     7.64  3.87             0.63   
val           100.0  19.30     7.02  1.75             0.00   

          no_obstacle_in_crosswalk  
subset                              
reserved                     43.59  
test                         59.65  
train                        74.69  
val                          73.68  


In [27]:
# Multi-valued label columns; only those present in df are kept.
multiclass_cols = [
    name for name in ("traffic_light", "crosswalk_signal") if name in df.columns
]

# For each column: overall value counts and a subset-by-value count table,
# printed and then written to per-column CSV files.
for col in multiclass_cols:
    print(f"\n=== {col}: overall counts ===")
    overall_counts = df[col].value_counts(dropna=False).sort_index()
    print(overall_counts)

    print(f"\n=== {col}: by subset counts ===")
    by_subset_counts = pd.crosstab(index=df["subset"], columns=df[col], dropna=False)
    print(by_subset_counts)

    for table, suffix in (
        (overall_counts, "counts_overall"),
        (by_subset_counts, "counts_by_subset"),
    ):
        table.to_csv(OUT_DIR / f"{col}_{suffix}.csv")


=== traffic_light: overall counts ===
traffic_light
0    240
1    319
2    589
Name: count, dtype: int64

=== traffic_light: by subset counts ===
traffic_light    0    1    2
subset                      
reserved        18   19   41
test            12   16   29
train          204  267  485
val              6   17   34

=== crosswalk_signal: overall counts ===
crosswalk_signal
0    372
1    623
2    153
Name: count, dtype: int64

=== crosswalk_signal: by subset counts ===
crosswalk_signal    0    1    2
subset                         
reserved           37   25   16
test               21   30    6
train             300  536  120
val                14   32   11


In [28]:
width_col = "roadway_width"

# Summary statistics, computed once and reused for both printing and export.
width_describe = df[width_col].describe()
print(width_describe)

# The 30 most frequent width values (NaN counted as a value if present).
top_widths = df[width_col].value_counts(dropna=False).head(30)
print("\nTop roadway_width values:")
print(top_widths)

width_describe.to_csv(OUT_DIR / "roadway_width_describe.csv")
top_widths.to_csv(OUT_DIR / "roadway_width_top_values.csv")

count    1148.000000
mean       19.716376
std         9.842450
min         4.400000
25%        11.100000
50%        18.500000
75%        27.200000
max        59.100000
Name: roadway_width, dtype: float64

Top roadway_width values:
roadway_width
23.0    38
20.0    36
16.7    25
18.5    24
31.2    23
9.2     23
36.3    20
16.6    19
14.1    19
32.2    18
11.8    17
33.4    16
32.6    16
10.3    15
7.8     15
14.2    14
31.6    14
31.9    14
8.8     14
26.5    13
14.4    13
6.7     13
6.5     13
7.2     12
6.4     12
30.8    12
6.6     12
25.8    12
34.3    12
11.5    12
Name: count, dtype: int64


In [29]:
cols_to_summarize = [*binary_cols, *multiclass_cols, weather_col, safe_col, width_col]

# One record per label column that exists in df:
#   n_unique             - number of distinct values, counting NaN as a value
#   unique_vals_first_20 - the first 20 sorted non-null distinct values
summary_rows = [
    {
        "column": name,
        "n_unique": df[name].nunique(dropna=False),
        "unique_vals_first_20": sorted(df[name].dropna().unique().tolist())[:20],
    }
    for name in cols_to_summarize
    if name in df.columns
]

summary_df = pd.DataFrame(summary_rows)
print(summary_df)

summary_df.to_csv(OUT_DIR / "label_type_summary.csv", index=False)

                      column  n_unique  \
0                  crosswalk         2   
1                        car         2   
2                    scooter         2   
3                       bike         2   
4            other_obstacles         2   
5   no_obstacle_in_crosswalk         2   
6              traffic_light         3   
7           crosswalk_signal         3   
8                    weather         3   
9               safe_to_walk         2   
10             roadway_width       157   

                                 unique_vals_first_20  
0                                              [0, 1]  
1                                              [0, 1]  
2                                              [0, 1]  
3                                              [0, 1]  
4                                              [0, 1]  
5                                              [0, 1]  
6                                           [0, 1, 2]  
7                                           [0,

In [30]:
# Percentage of missing (NaN) entries per column, highest first.
null_fraction = df.isna().mean()
missing_rate = (null_fraction * 100).sort_values(ascending=False)

# Show at most the 30 highest rates, rounded for display; the CSV keeps full precision.
print(missing_rate.head(30).round(2))

missing_rate.to_csv(OUT_DIR / "missing_rate_percent.csv")

Unnamed: 0                  0.0
path                        0.0
weather                     0.0
no_obstacle_in_crosswalk    0.0
other_obstacles             0.0
bike                        0.0
scooter                     0.0
car                         0.0
traffic_light               0.0
crosswalk_signal            0.0
crosswalk                   0.0
roadway_width               0.0
safe_to_walk                0.0
new_filename                0.0
original_filename           0.0
split_filename              0.0
filename                    0.0
subset                      0.0
dtype: float64


In [31]:
# Every new_filename entry for which no regular file exists at IMAGES_ROOT/<name>.
missing_files = [
    name
    for name in df["new_filename"].astype(str)
    if not os.path.isfile(os.path.join(IMAGES_ROOT, name))
]

print("missing image files:", len(missing_files))
print("first 10 missing:", missing_files[:10])

# Written without the index; the file holds one missing filename per row.
pd.Series(missing_files).to_csv(OUT_DIR / "missing_files_list.csv", index=False)

missing image files: 0
first 10 missing: []


In [32]:
# Headline numbers collected into one dict from results computed in earlier cells.
# Keys are inserted in a fixed order: totals, subset, safe_to_walk, weather,
# binary rates, multiclass counts, roadway width, missing files.
report_stats = {
    "n_images_total": int(df.shape[0]),
    "n_cols": int(df.shape[1]),
    # subset sizes
    "subset_counts": subset_counts.to_dict(),
    "subset_percent": subset_pct.to_dict(),
    # safe distribution
    "safe_counts_overall": safe_counts.to_dict(),
    "safe_percent_overall": safe_pct.to_dict(),
    # weather distribution (codes)
    "weather_counts_overall": weather_counts.to_dict(),
    "weather_percent_overall": weather_pct.to_dict(),
    # binary positive rates
    "binary_positive_rate_overall_percent": binary_pos_pct_overall.to_dict(),
}

# multiclass overall counts, one entry per multiclass column
report_stats.update(
    {
        f"{col}_counts_overall": df[col].value_counts(dropna=False).sort_index().to_dict()
        for col in multiclass_cols
    }
)

# roadway width stats, then the missing-file count
width_values = df[width_col]
report_stats.update(
    {
        "roadway_width_min": float(width_values.min()),
        "roadway_width_median": float(width_values.median()),
        "roadway_width_mean": float(width_values.mean()),
        "roadway_width_max": float(width_values.max()),
        "missing_image_files_count": int(len(missing_files)),
    }
)

# Bare last expression so the notebook displays the dict.
report_stats

{'n_images_total': 1148,
 'n_cols': 18,
 'subset_counts': {'train': 956, 'reserved': 78, 'val': 57, 'test': 57},
 'subset_percent': {'train': 83.28,
  'reserved': 6.79,
  'val': 4.97,
  'test': 4.97},
 'safe_counts_overall': {0: 613, 1: 535},
 'safe_percent_overall': {0: 53.4, 1: 46.6},
 'weather_counts_overall': {0: 741, 1: 367, 2: 40},
 'weather_percent_overall': {0: 64.55, 1: 31.97, 2: 3.48},
 'binary_positive_rate_overall_percent': {'crosswalk': 99.91,
  'no_obstacle_in_crosswalk': 71.78,
  'car': 20.73,
  'scooter': 8.54,
  'bike': 3.83,
  'other_obstacles': 0.7},
 'traffic_light_counts_overall': {0: 240, 1: 319, 2: 589},
 'crosswalk_signal_counts_overall': {0: 372, 1: 623, 2: 153},
 'roadway_width_min': 4.4,
 'roadway_width_median': 18.5,
 'roadway_width_mean': 19.71637630662021,
 'roadway_width_max': 59.1,
 'missing_image_files_count': 0}