In [2]:
import os, shutil, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

ROOT="/content/drive/MyDrive/food-10/food-10"
IMAGES_DIR= os.path.join(ROOT, "images")
TRAIN_TXT = os.path.join(ROOT, "train.txt")
TEST_TXT = os.path.join(ROOT, "test.txt")
OUT_DIR = os.path.join(ROOT, "prepared_splits")
os.makedirs(OUT_DIR, exist_ok=True)

print("ROOT:", ROOT)
print("Images folder exists:", os.path.exists(IMAGES_DIR))
print("train.txt exists:", os.path.exists(TRAIN_TXT))
print("test.txt exists:", os.path.exists(TEST_TXT))
print("Outputs will be saved to:", OUT_DIR)

Mounted at /content/drive
ROOT: /content/drive/MyDrive/food-10/food-10
Images folder exists: True
train.txt exists: True
test.txt exists: True
Outputs will be saved to: /content/drive/MyDrive/food-10/food-10/prepared_splits


In [3]:
#Cell 2— Here we read the 'train.txt' and create stratified 80/20 train/val split
#1) Read lines from train.txt
with open(TRAIN_TXT, "r") as f:
    lines = [l.strip() for l in f if l.strip()]
print("Total lines in original train.txt:", len(lines))
print("Sample lines:", lines[:6])

#2) Normalize paths and infer class if present in path
rows = []
for ln in lines:
    if "/" in ln:
        cls = ln.split("/")[0]
        relpath = ln
    else:
        cls = None
        relpath = ln
    rows.append({"path": relpath, "class": cls})

df = pd.DataFrame(rows)

# 3) If class is None, try to infer by scanning images folder (works if filenames unique)
if df['class'].isnull().any():
    mapping = {}
    if os.path.exists(IMAGES_DIR):
        for c in os.listdir(IMAGES_DIR):
            cdir = os.path.join(IMAGES_DIR, c)
            if not os.path.isdir(cdir):
                continue
            for fname in os.listdir(cdir):
                # if duplicate filenames across classes exist, last one will overwrite — we'll check later
                mapping[fname] = c
    def infer_class(row):
        if pd.isna(row['class']):
            fn = os.path.basename(row['path'])
            return mapping.get(fn, None)
        return row['class']
    df['class'] = df.apply(infer_class, axis=1)

# 4) If any classes still unknown, print a short sample to inspect
if df['class'].isnull().any():
    print("Warning: some entries could not infer class automatically. Inspect these (sample):")
    display(df[df['class'].isnull()].head(30))
    # You can manually fix these entries in train.txt or move images/filenames to be unique.

#5) Stratified split 80/20 (deterministic seed)
df_clean = df.dropna(subset=['class']).reset_index(drop=True)
seed = 42
train_df, val_df = train_test_split(df_clean, test_size=0.20, random_state=seed, stratify=df_clean['class'])

print(f"After cleaning: total={len(df_clean)}; train={len(train_df)}; val={len(val_df)}")
print("Example train class counts:")
print(train_df['class'].value_counts().head())
print("Example val class counts:")
print(val_df['class'].value_counts().head())

#6) Save outputs (both split TXT and CSV)
train_txt_out = os.path.join(OUT_DIR, "train_split.txt")
val_txt_out   = os.path.join(OUT_DIR, "val_split.txt")
train_csv_out = os.path.join(OUT_DIR, "train_split.csv")
val_csv_out   = os.path.join(OUT_DIR, "val_split.csv")

train_df['path'].to_csv(train_txt_out, index=False, header=False)
val_df['path'].to_csv(val_txt_out, index=False, header=False)
train_df.to_csv(train_csv_out, index=False)
val_df.to_csv(val_csv_out, index=False)

print("Saved split files to:", OUT_DIR)

Total lines in original train.txt: 7500
Sample lines: ['cannoli/1007970', 'cannoli/1008331', 'cannoli/1015289', 'cannoli/1019714', 'cannoli/1021048', 'cannoli/1029395']
After cleaning: total=7500; train=6000; val=1500
Example train class counts:
class
onion_rings      600
frozen_yogurt    600
ravioli          600
gnocchi          600
crab_cakes       600
Name: count, dtype: int64
Example val class counts:
class
ceviche                150
spaghetti_bolognese    150
cannoli                150
crab_cakes             150
onion_rings            150
Name: count, dtype: int64
Saved split files to: /content/drive/MyDrive/food-10/food-10/prepared_splits


In [4]:
#Cell 3— Here we build 'df_all' by scanning images/ and create label mapping
#Helper: read txt (some txt may have lines like class/image.jpg or just filenames)
def read_list_file(path):
    with open(path,'r') as f:
        lines = [l.strip() for l in f if l.strip()]
    return lines

# Read original train/test lists (for debug/info only)
orig_train_list = read_list_file(TRAIN_TXT) if os.path.exists(TRAIN_TXT) else []
orig_test_list  = read_list_file(TEST_TXT)  if os.path.exists(TEST_TXT) else []

print("Original train.txt sample:", orig_train_list[:5])
print("Original test.txt sample :", orig_test_list[:5])

# Create dataframe of all images by walking images folder
rows = []
if os.path.exists(IMAGES_DIR):
    for cls in sorted(os.listdir(IMAGES_DIR)):
        cls_dir = os.path.join(IMAGES_DIR, cls)
        if not os.path.isdir(cls_dir):
            continue
        for fname in os.listdir(cls_dir):
            if fname.lower().endswith(('.jpg','.jpeg','.png')):
                rel = os.path.join(cls, fname)  # relative path from images
                rows.append({"path": rel, "class": cls, "fullpath": os.path.join(cls_dir, fname)})
else:
    raise FileNotFoundError(f"images folder not found at {IMAGES_DIR}")

df_all = pd.DataFrame(rows)
print("Total images found (by scanning images/):", len(df_all))
display(df_all.head())

# Create label->index mapping
classes = sorted(df_all['class'].unique())
cls2idx = {c:i for i,c in enumerate(classes)}
df_all['label'] = df_all['class'].map(cls2idx)

# Sanity check: ensure all paths in splits exist in df_all
split_csv_dir = OUT_DIR
missing_in_images = []
for p in pd.concat([train_df['path'], val_df['path']]).unique():
    if p not in set(df_all['path']):
        missing_in_images.append(p)
if missing_in_images:
    print("Warning: the following split entries were NOT found inside images/ (sample up to 10):")
    print(missing_in_images[:10])
else:
    print("All split paths found inside images/✅")

#(Optional) Also save final train.csv/val.csv that include fullpath & labels for convenience
train_full = df_all[df_all['path'].isin(train_df['path'])].reset_index(drop=True)
val_full   = df_all[df_all['path'].isin(val_df['path'])].reset_index(drop=True)
train_full.to_csv(os.path.join(OUT_DIR, "train.csv"), index=False)
val_full.to_csv(os.path.join(OUT_DIR, "val.csv"), index=False)
print("Saved train.csv and val.csv (with fullpath & label) to", OUT_DIR)

Original train.txt sample: ['cannoli/1007970', 'cannoli/1008331', 'cannoli/1015289', 'cannoli/1019714', 'cannoli/1021048']
Original test.txt sample : ['cannoli/1087676', 'cannoli/109727', 'cannoli/1102569', 'cannoli/1118168', 'cannoli/1127495']
Total images found (by scanning images/): 20015


Unnamed: 0,path,class,fullpath
0,beef_tartare/903105.jpg,beef_tartare,/content/drive/MyDrive/food-10/food-10/images/...
1,beef_tartare/750123.jpg,beef_tartare,/content/drive/MyDrive/food-10/food-10/images/...
2,beef_tartare/911248.jpg,beef_tartare,/content/drive/MyDrive/food-10/food-10/images/...
3,beef_tartare/721039.jpg,beef_tartare,/content/drive/MyDrive/food-10/food-10/images/...
4,beef_tartare/516546.jpg,beef_tartare,/content/drive/MyDrive/food-10/food-10/images/...


['onion_rings/2032387', 'frozen_yogurt/2409853', 'frozen_yogurt/1772388', 'ravioli/1875488', 'gnocchi/2155707', 'gnocchi/724766', 'frozen_yogurt/1881868', 'frozen_yogurt/1973425', 'crab_cakes/1262425', 'frozen_yogurt/1795439']
Saved train.csv and val.csv (with fullpath & label) to /content/drive/MyDrive/food-10/food-10/prepared_splits


In [5]:
#Repair block for train.csv / val.csv by matching split entries to scanned images even when split lacks extensions.
import os, pandas as pd
ROOT="/content/drive/MyDrive/food-10/food-10"
IMAGES_DIR=os.path.join(ROOT, "images")
SPLIT_DIR=os.path.join(ROOT, "prepared_splits")
#load what we have
train_split=pd.read_csv(os.path.join(SPLIT_DIR,"train_split.csv"),dtype=str,header=0)
val_split=pd.read_csv(os.path.join(SPLIT_DIR,"val_split.csv"),dtype=str,header=0)

#df_all from scanned images (recreate here to be safe)
rows=[]
for cls in sorted(os.listdir(IMAGES_DIR)):
    cls_dir = os.path.join(IMAGES_DIR, cls)
    if not os.path.isdir(cls_dir): continue
    for fname in os.listdir(cls_dir):
        if fname.lower().endswith(('.jpg','.jpeg','.png')):
            rel = os.path.join(cls, fname)
            rows.append({"path": rel, "class": cls, "fullpath": os.path.join(cls_dir, fname)})
df_all = pd.DataFrame(rows)

# create a no-extension key for df_all: "class/basename_without_ext"
df_all['no_ext_key'] = df_all['class'] + "/" + df_all['path'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])

# also normalize train_split values to strings and strip whitespace
train_split['path'] = train_split['path'].astype(str).str.strip()
val_split['path']   = val_split['path'].astype(str).str.strip()

# For each split row, find matching rows in df_all by comparing no_ext_key == split_path
def build_full_df(split_df, df_all):
    matched_rows = []
    missing = []
    df_all_index = df_all.set_index('no_ext_key')
    for p in split_df['path'].tolist():
        if p in df_all_index.index:
            matched_rows.append(df_all_index.loc[p])
        else:
            # try tolerant match: maybe split included extension by mistake, or extra prefix
            possible = df_all[df_all['path'].str.startswith(p)]
            if len(possible) == 1:
                matched_rows.append(possible.iloc[0])
            elif len(possible) > 1:
                # ambiguous: pick the first but log it
                matched_rows.append(possible.iloc[0])
            else:
                missing.append(p)
    if matched_rows:
        # matched_rows may be a list of Series; concat into DataFrame
        full_df = pd.DataFrame(matched_rows).reset_index(drop=True)
    else:
        full_df = pd.DataFrame(columns=df_all.columns)
    return full_df, missing

train_full, missing_train = build_full_df(train_split, df_all)
val_full,   missing_val   = build_full_df(val_split, df_all)

print("Train matches:", len(train_full), "Val matches:", len(val_full))
print("Missing in train (sample up to 20):", missing_train[:20])
print("Missing in val (sample up to 20):", missing_val[:20])

# Save repaired CSVs
OUT_DIR = SPLIT_DIR  # reuse prepared_splits
train_full.to_csv(os.path.join(OUT_DIR, "train.csv"), index=False)
val_full.to_csv(os.path.join(OUT_DIR, "val.csv"), index=False)
print("Saved repaired train.csv and val.csv to", OUT_DIR)

# Spot check a few sample existence checks using fullpath
import os
for df_name, df in [("train", train_full), ("val", val_full)]:
    if len(df) > 0:
        fp = df['fullpath'].iloc[0]
        print(f"{df_name} sample fullpath exists? ->", fp, os.path.exists(fp))
    else:
        print(f"{df_name} is empty after repair.")

Train matches: 6000 Val matches: 1500
Missing in train (sample up to 20): []
Missing in val (sample up to 20): []
Saved repaired train.csv and val.csv to /content/drive/MyDrive/food-10/food-10/prepared_splits
train sample fullpath exists? -> /content/drive/MyDrive/food-10/food-10/images/onion_rings/2032387.jpg True
val sample fullpath exists? -> /content/drive/MyDrive/food-10/food-10/images/ceviche/1124336.jpg True


In [6]:
#Adding this fix: Adding a label column to train.csv and val.csv after repair cell.
import pandas as pd
import os
ROOT="/content/drive/MyDrive/food-10/food-10"
CSV_DIR=os.path.join(ROOT,"prepared_splits")
train_full=pd.read_csv(os.path.join(CSV_DIR,"train.csv"))
val_full=pd.read_csv(os.path.join(CSV_DIR,"val.csv"))
#get sorted class list
classes=sorted(train_full['class'].unique())
cls2idx={c:i for i,c in enumerate(classes)}
#create label column
train_full['label']=train_full['class'].map(cls2idx)
val_full['label']=val_full['class'].map(cls2idx)
#save back
train_full.to_csv(os.path.join(CSV_DIR,"train.csv"),index=False)
val_full.to_csv(os.path.join(CSV_DIR,"val.csv"),index=False)
print("Label column added to train.csv and val.csv")
print(train_full.head())

Label column added to train.csv and val.csv
                        path          class  \
0    onion_rings/2032387.jpg    onion_rings   
1  frozen_yogurt/2409853.jpg  frozen_yogurt   
2  frozen_yogurt/1772388.jpg  frozen_yogurt   
3        ravioli/1875488.jpg        ravioli   
4        gnocchi/2155707.jpg        gnocchi   

                                            fullpath  label  
0  /content/drive/MyDrive/food-10/food-10/images/...      6  
1  /content/drive/MyDrive/food-10/food-10/images/...      3  
2  /content/drive/MyDrive/food-10/food-10/images/...      3  
3  /content/drive/MyDrive/food-10/food-10/images/...      8  
4  /content/drive/MyDrive/food-10/food-10/images/...      4  
