In [1]:
import pandas as pd

df = pd.read_csv("final_data_new_labels.csv")
df.shape, df.columns.tolist()


((1148, 18),
 ['Unnamed: 0',
  'path',
  'filename',
  'split_filename',
  'original_filename',
  'new_filename',
  'safe_to_walk',
  'roadway_width',
  'crosswalk',
  'crosswalk_signal',
  'traffic_light',
  'car',
  'scooter',
  'bike',
  'other_obstacles',
  'no_obstacle_in_crosswalk',
  'weather',
  'subset'])

In [2]:
df["weather"].value_counts(dropna=False).sort_index()
(df["weather"].value_counts(normalize=True, dropna=False).sort_index() * 100).round(2)


weather
0    64.55
1    31.97
2     3.48
Name: proportion, dtype: float64

In [3]:
pd.crosstab(df["subset"], df["weather"], dropna=False)
(pd.crosstab(df["subset"], df["weather"], normalize="index", dropna=False) * 100).round(2)


weather,0,1,2
subset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reserved,70.51,24.36,5.13
test,66.67,29.82,3.51
train,63.28,33.16,3.56
val,75.44,24.56,0.0


In [4]:
safe_col = "safe_to_cross" if "safe_to_cross" in df.columns else "safe_to_walk"
df[safe_col].value_counts(dropna=False).sort_index()
(df[safe_col].value_counts(normalize=True, dropna=False).sort_index() * 100).round(2)


safe_to_walk
0    53.4
1    46.6
Name: proportion, dtype: float64

In [5]:
pd.crosstab(df["subset"], df[safe_col], dropna=False)
(pd.crosstab(df["subset"], df[safe_col], normalize="index", dropna=False) * 100).round(2)


safe_to_walk,0,1
subset,Unnamed: 1_level_1,Unnamed: 2_level_1
reserved,100.0,0.0
test,59.65,40.35
train,49.48,50.52
val,49.12,50.88


In [6]:
candidate_labels = [
    "crosswalk", "pedestrian_signal", "traffic_light",
    "car", "scooter", "bike", "other_obstacles"
]
labels = [c for c in candidate_labels if c in df.columns]
labels


['crosswalk', 'traffic_light', 'car', 'scooter', 'bike', 'other_obstacles']

In [7]:
pos_rate = df[labels].mean(numeric_only=True).sort_values(ascending=False)
pos_rate
(pos_rate * 100).round(2)


traffic_light      130.40
crosswalk           99.91
car                 20.73
scooter              8.54
bike                 3.83
other_obstacles      0.70
dtype: float64

In [8]:
df.groupby("subset")[labels].mean(numeric_only=True).round(4)
(df.groupby("subset")[labels].mean(numeric_only=True) * 100).round(2)


Unnamed: 0_level_0,crosswalk,traffic_light,car,scooter,bike,other_obstacles
subset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
reserved,100.0,129.49,35.9,21.79,2.56,1.28
test,100.0,129.82,33.33,7.02,7.02,1.75
train,99.9,129.39,18.83,7.64,3.87,0.63
val,100.0,149.12,19.3,7.02,1.75,0.0


In [9]:
summary = {}
for c in labels + ["weather", safe_col, "roadway_width", "roadway_width_bin"]:
    if c in df.columns:
        summary[c] = {
            "n_unique": df[c].nunique(dropna=False),
            "unique_vals": sorted(df[c].dropna().unique().tolist())[:20]
        }
summary


{'crosswalk': {'n_unique': 2, 'unique_vals': [0, 1]},
 'traffic_light': {'n_unique': 3, 'unique_vals': [0, 1, 2]},
 'car': {'n_unique': 2, 'unique_vals': [0, 1]},
 'scooter': {'n_unique': 2, 'unique_vals': [0, 1]},
 'bike': {'n_unique': 2, 'unique_vals': [0, 1]},
 'other_obstacles': {'n_unique': 2, 'unique_vals': [0, 1]},
 'weather': {'n_unique': 3, 'unique_vals': [0, 1, 2]},
 'safe_to_walk': {'n_unique': 2, 'unique_vals': [0, 1]},
 'roadway_width': {'n_unique': 157,
  'unique_vals': [4.4,
   4.9,
   5.4,
   5.5,
   5.8,
   6.1,
   6.2,
   6.3,
   6.4,
   6.5,
   6.6,
   6.7,
   6.8,
   6.9,
   7.0,
   7.1,
   7.2,
   7.3,
   7.5,
   7.7]}}

In [10]:
width_col = "roadway_width" if "roadway_width" in df.columns else None
if width_col:
    print(df[width_col].describe())
    print(df[width_col].value_counts(dropna=False).head(20))


count    1148.000000
mean       19.716376
std         9.842450
min         4.400000
25%        11.100000
50%        18.500000
75%        27.200000
max        59.100000
Name: roadway_width, dtype: float64
roadway_width
23.0    38
20.0    36
16.7    25
18.5    24
31.2    23
9.2     23
36.3    20
16.6    19
14.1    19
32.2    18
11.8    17
33.4    16
32.6    16
10.3    15
7.8     15
14.2    14
31.6    14
31.9    14
8.8     14
26.5    13
Name: count, dtype: int64


In [11]:
if "roadway_width_bin" in df.columns:
    print(df["roadway_width_bin"].value_counts(dropna=False).sort_index())


In [12]:
df["subset"].value_counts()
(df["subset"].value_counts(normalize=True) * 100).round(2)


subset
train       83.28
reserved     6.79
val          4.97
test         4.97
Name: proportion, dtype: float64

In [14]:
missing_rate = (df.isna().mean() * 100).sort_values(ascending=False)
missing_rate.head(20).round(2)


Unnamed: 0                  0.0
path                        0.0
weather                     0.0
no_obstacle_in_crosswalk    0.0
other_obstacles             0.0
bike                        0.0
scooter                     0.0
car                         0.0
traffic_light               0.0
crosswalk_signal            0.0
crosswalk                   0.0
roadway_width               0.0
safe_to_walk                0.0
new_filename                0.0
original_filename           0.0
split_filename              0.0
filename                    0.0
subset                      0.0
dtype: float64

In [15]:
import os

file_col = "new_filename" if "new_filename" in df.columns else "filename"
missing_files = df[~df[file_col].apply(lambda f: os.path.isfile(os.path.join("processed_data", str(f))))]

len(missing_files), missing_files.head()


(0,
 Empty DataFrame
 Columns: [Unnamed: 0, path, filename, split_filename, original_filename, new_filename, safe_to_walk, roadway_width, crosswalk, crosswalk_signal, traffic_light, car, scooter, bike, other_obstacles, no_obstacle_in_crosswalk, weather, subset]
 Index: [])