In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files under /content/drive/MyDrive/... can be read and written.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Dataset source: [Pedestrian / No Pedestrian](https://www.kaggle.com/datasets/tejasvdante/pedestrian-no-pedestrian) (Kaggle).

## ***Dataset Collection***

In [None]:
# Install the `unrar` command-line tool; it is needed to unpack the dataset's .rar archive.
# `-y` answers the apt confirmation prompt automatically so the cell runs unattended.
!apt-get install -y unrar

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unrar is already the newest version (1:6.1.5-1ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
# Unpack the dataset archive stored on Drive.
# The archive already contains a top-level `data/` folder, so it is extracted into the
# project folder itself: this yields Project/data/train and Project/data/validation, which
# is the layout the label-CSV cells read from. (Extracting into Project/data/ would nest
# everything one level deeper, under Project/data/data/.)
# `-o-` skips files that already exist, so re-running the cell does not stop on
# "replace existing file?" prompts or re-extract the whole archive.
!unrar x -o- "/content/drive/MyDrive/AAI3001/Project/data.rar" "/content/drive/MyDrive/AAI3001/Project/"


UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from /content/drive/MyDrive/AAI3001/Project/data.rar

Creating    /content/drive/MyDrive/AAI3001/Project/data               OK
Creating    /content/drive/MyDrive/AAI3001/Project/data/data          OK
Creating    /content/drive/MyDrive/AAI3001/Project/data/data/train    OK
Creating    /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestrian  OK
Extracting  /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestrian/train (1).jpg       0%  OK 
Extracting  /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestrian/train (10).jpg       0%  OK 
Extracting  /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestrian/train (100).jpg       0%  OK 
Extracting  /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestrian/train (101).jpg       0%  OK 
Extracting  /content/drive/MyDrive/AAI3001/Project/data/data/train/no pedestri

In [None]:
# Build CSV for training
#
# Scans the class folders under BASE for image files and writes BASE/labels.csv
# with one row per image: `image` (file name only) and `label` (normalized class name).
import re
from pathlib import Path
import pandas as pd

BASE = Path("/content/drive/MyDrive/AAI3001/Project/data/train")
CSV_PATH = BASE / "labels.csv"

# We will accept either the correct names or the common misspellings, and normalize labels.
LABEL_DIR_ALIASES = {
    "pedestrian": "pedestrian",
    "pedestrain": "pedestrian",         # alias (misspelling)
    "no pedestrian": "no pedestrian",
    "no pedestrain": "no pedestrian",   # alias (misspelling)
}

IMG_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}

def natural_key(s: str):
    """Return a sort key that orders embedded numbers numerically (img2 < img10).

    The string is split into alternating text / digit-run tokens; digit runs are
    compared as integers and text is compared case-insensitively.
    """
    # `isdecimal()` (not `isdigit()`): `\d` only captures decimal digits, while
    # `isdigit()` is also true for characters such as '²' that `int()` rejects,
    # which would raise ValueError on a name like 'img1²2.jpg'.
    return [int(t) if t.isdecimal() else t.lower() for t in re.split(r"(\d+)", s)]

assert BASE.exists(), f"Base path not found: {BASE}"

present_dirs = {p.name for p in BASE.iterdir() if p.is_dir()}
print("Found subfolders:", present_dirs)

rows = []
for dirname, normalized_label in LABEL_DIR_ALIASES.items():
    d = BASE / dirname
    if not d.is_dir():
        # skip aliases that aren't present
        continue

    # Recursively gather images (handles nested folders too)
    files = [p for p in d.rglob("*") if p.is_file() and p.suffix.lower() in IMG_EXT]
    files.sort(key=lambda p: natural_key(str(p.relative_to(BASE))))  # deterministic

    print(f"{dirname} → label '{normalized_label}': {len(files)} image(s)")
    for p in files:
        # Keep the relative path in `rows` as well, in case it is needed later;
        # only the file name and label are written to the CSV.
        rows.append({
            "image": p.name,
            "label": normalized_label,
            "rel_path": str(p.relative_to(BASE))  # e.g., 'pedestrian/img001.jpg'
        })

# Fail with a clear message instead of an opaque KeyError from the column
# selection below when nothing was found (wrong BASE, unexpected folder names, ...).
if not rows:
    raise FileNotFoundError(
        f"No images found under {BASE}. Expected one of the folders "
        f"{sorted(LABEL_DIR_ALIASES)}, found subfolders: {sorted(present_dirs)}"
    )

# Build DataFrame (only keep the two columns you need)
df = pd.DataFrame(rows)[["image", "label"]]

# The scan is recursive and spans several class folders, so a bare file name
# is not guaranteed to be unique; surface that instead of hiding it.
dup_count = int(df["image"].duplicated().sum())
if dup_count:
    print(f"⚠️ {dup_count} file name(s) occur more than once; "
          "the 'image' column alone does not identify those files uniquely.")

df.to_csv(CSV_PATH, index=False)
print(f"✅ Saved {len(df)} rows to {CSV_PATH}")

# Quick sanity check: rich display when IPython is available, plain text otherwise.
# Only a missing IPython triggers the fallback; any other error is left visible.
try:
    from IPython.display import display
except ImportError:
    print(df.head(10))
else:
    display(df.head(10))
print(df["label"].value_counts())

Found subfolders: {'no pedestrian', 'pedestrian'}
pedestrian → label 'pedestrian': 631 image(s)
no pedestrian → label 'no pedestrian': 631 image(s)
✅ Saved 1262 rows to /content/drive/MyDrive/AAI3001/Project/data/train/labels.csv


Unnamed: 0,image,label
0,pic1 (1).jpg,pedestrian
1,pic1 (2).jpg,pedestrian
2,pic1 (3).jpg,pedestrian
3,pic1 (4).jpg,pedestrian
4,pic1 (5).jpg,pedestrian
5,pic1 (6).jpg,pedestrian
6,pic1 (7).jpg,pedestrian
7,pic1 (8).jpg,pedestrian
8,pic1 (9).jpg,pedestrian
9,pic1 (10).jpg,pedestrian


label
pedestrian       631
no pedestrian    631
Name: count, dtype: int64


In [None]:
# Build CSV for validation
#
# Scans the class folders under BASE for image files and writes BASE/labels.csv
# with one row per image: `image` (file name only) and `label` (normalized class name).
import re
from pathlib import Path
import pandas as pd

BASE = Path("/content/drive/MyDrive/AAI3001/Project/data/validation")
CSV_PATH = BASE / "labels.csv"

# We will accept either the correct names or the common misspellings, and normalize labels.
LABEL_DIR_ALIASES = {
    "pedestrian": "pedestrian",
    "pedestrain": "pedestrian",         # alias (misspelling)
    "no pedestrian": "no pedestrian",
    "no pedestrain": "no pedestrian",   # alias (misspelling)
}

IMG_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}

def natural_key(s: str):
    """Return a sort key that orders embedded numbers numerically (img2 < img10).

    The string is split into alternating text / digit-run tokens; digit runs are
    compared as integers and text is compared case-insensitively.
    """
    # `isdecimal()` (not `isdigit()`): `\d` only captures decimal digits, while
    # `isdigit()` is also true for characters such as '²' that `int()` rejects,
    # which would raise ValueError on a name like 'img1²2.jpg'.
    return [int(t) if t.isdecimal() else t.lower() for t in re.split(r"(\d+)", s)]

assert BASE.exists(), f"Base path not found: {BASE}"

present_dirs = {p.name for p in BASE.iterdir() if p.is_dir()}
print("Found subfolders:", present_dirs)

rows = []
for dirname, normalized_label in LABEL_DIR_ALIASES.items():
    d = BASE / dirname
    if not d.is_dir():
        # skip aliases that aren't present
        continue

    # Recursively gather images (handles nested folders too)
    files = [p for p in d.rglob("*") if p.is_file() and p.suffix.lower() in IMG_EXT]
    files.sort(key=lambda p: natural_key(str(p.relative_to(BASE))))  # deterministic

    print(f"{dirname} → label '{normalized_label}': {len(files)} image(s)")
    for p in files:
        # Keep the relative path in `rows` as well, in case it is needed later;
        # only the file name and label are written to the CSV.
        rows.append({
            "image": p.name,
            "label": normalized_label,
            "rel_path": str(p.relative_to(BASE))  # e.g., 'pedestrian/img001.jpg'
        })

# Fail with a clear message instead of an opaque KeyError from the column
# selection below when nothing was found (wrong BASE, unexpected folder names, ...).
if not rows:
    raise FileNotFoundError(
        f"No images found under {BASE}. Expected one of the folders "
        f"{sorted(LABEL_DIR_ALIASES)}, found subfolders: {sorted(present_dirs)}"
    )

# Build DataFrame (only keep the two columns you need)
df = pd.DataFrame(rows)[["image", "label"]]

# The scan is recursive and spans several class folders, so a bare file name
# is not guaranteed to be unique; surface that instead of hiding it.
dup_count = int(df["image"].duplicated().sum())
if dup_count:
    print(f"⚠️ {dup_count} file name(s) occur more than once; "
          "the 'image' column alone does not identify those files uniquely.")

df.to_csv(CSV_PATH, index=False)
print(f"✅ Saved {len(df)} rows to {CSV_PATH}")

# Quick sanity check: rich display when IPython is available, plain text otherwise.
# Only a missing IPython triggers the fallback; any other error is left visible.
try:
    from IPython.display import display
except ImportError:
    print(df.head(10))
else:
    display(df.head(10))
print(df["label"].value_counts())

Found subfolders: {'no pedestrian', 'pedestrian'}
pedestrian → label 'pedestrian': 177 image(s)
no pedestrian → label 'no pedestrian': 177 image(s)
✅ Saved 354 rows to /content/drive/MyDrive/AAI3001/Project/data/validation/labels.csv


Unnamed: 0,image,label
0,val (1).jpg,pedestrian
1,val (2).jpg,pedestrian
2,val (3).jpg,pedestrian
3,val (4).jpg,pedestrian
4,val (5).jpg,pedestrian
5,val (6).jpg,pedestrian
6,val (7).jpg,pedestrian
7,val (8).jpg,pedestrian
8,val (9).jpg,pedestrian
9,val (10).jpg,pedestrian


label
pedestrian       177
no pedestrian    177
Name: count, dtype: int64
