# Data Processing V3

### Build and Run Dataset Downloader

### This preprocessing pipeline uses createDerivation.py

1. Build the "paralells" tool and run it to download and convert the dataset:

In [1]:
!cd sub/paralells ; cargo build --release ; ./target/release/paralells --inputfolder "input (new data)" --outputfile result.csv

[0m [0m[0m[1m[38;5;12m--> [0m[0msrc/main.rs:2:5[0m
[0m  [0m[0m[1m[38;5;12m|[0m
[0m[1m[38;5;12m2[0m[0m [0m[0m[1m[38;5;12m|[0m[0m [0m[0muse csv::StringRecord;[0m
[0m  [0m[0m[1m[38;5;12m|[0m[0m     [0m[0m[1m[33m^^^^^^^^^^^^^^^^^[0m
[0m  [0m[0m[1m[38;5;12m|[0m
[0m  [0m[0m[1m[38;5;12m= [0m[0m[1mnote[0m[0m: `#[warn(unused_imports)]` on by default[0m

[0m [0m[0m[1m[38;5;12m--> [0m[0msrc/main.rs:3:5[0m
[0m  [0m[0m[1m[38;5;12m|[0m
[0m[1m[38;5;12m3[0m[0m [0m[0m[1m[38;5;12m|[0m[0m [0m[0muse http::Request;[0m
[0m  [0m[0m[1m[38;5;12m|[0m[0m     [0m[0m[1m[33m^^^^^^^^^^^^^[0m

[0m [0m[0m[1m[38;5;12m--> [0m[0msrc/main.rs:6:25[0m
[0m  [0m[0m[1m[38;5;12m|[0m
[0m[1m[38;5;12m6[0m[0m [0m[0m[1m[38;5;12m|[0m[0m [0m[0muse reqwest::blocking::{self, Client};[0m
[0m  [0m[0m[1m[38;5;12m|[0m[0m                         [0m[0m[1m[33m^^^^[0m

[0m  [0m[0m[1m[38;5;12m--> [0m[0m

Paralells is a high-speed, Rust-based processing tool for parallelizing tasks such as downloading and converting datasets.
More specifically, it downloads every image listed in the CSV input and converts the images to PNG format in parallel.

2. Use YOLOv11-seg to detect apples (used in place of avocados) and bananas in the dataset, and crop the images so that they contain only the avocados and bananas.
3. Apply unsharp mask to the cropped images to enhance the edges.
4. Save the modified images as a new derivation using createDerivation.py.



### Use YOLOv11-seg to Segment and Crop Images

In [2]:
# Run createDerivation.py so the names it defines become available in this notebook.
# The code further down asserts that `create_dataset_variation` exists and then calls it.
%run createDerivation.py

import cv2
import os
import numpy as np
import pandas as pd
from ultralytics import YOLO

# NOTE(review): INPUT_FOLDER and OUTPUT_FOLDER are not referenced by the code visible
# in this cell; presumably other cells or scripts use them - confirm before removing.
INPUT_FOLDER = "./sub/paralells/steps/convertedtemp/"
OUTPUT_FOLDER = "processed_images"
# Class names a detection must have to be considered when choosing the crop.
# NOTE(review): the markdown above says apples are used in place of avocados; confirm
# that the model's class names actually include 'avocado', otherwise it never matches.
TARGET_CLASSES = ['banana', 'apple', 'avocado']
# Target size handed to cv2.resize for every cropped image.
FINAL_SIZE = (720, 720)
# Weights file passed to YOLO(...) below.
MODEL_NAME = 'yolo11x-seg.pt'

# Read the CSV written by the paralells run (--outputfile result.csv);
# the detection loop below reads its "file_path" column.
all_df = pd.read_csv("./sub/paralells/result.csv")

def keep_all(path):
    """Pass-through filter predicate: accept every path unconditionally."""
    return True

# Work on every row of the paralells CSV; no pre-filtering happens at this stage.
target_df = all_df

# Ensure required globals exist
assert 'MODEL_NAME' in globals(), "MODEL_NAME not defined (from earlier YOLO cell)."
assert 'FINAL_SIZE' in globals(), "FINAL_SIZE not defined (from earlier YOLO cell)."
assert 'TARGET_CLASSES' in globals(), "TARGET_CLASSES not defined (from earlier YOLO cell)."
assert 'create_dataset_variation' in globals(), "create_dataset_variation not available."
# The detection loop reads row["file_path"]; check the column before loading the model
# so a schema problem surfaces immediately rather than as a KeyError mid-run.
assert 'file_path' in target_df.columns, "result CSV has no 'file_path' column."

print(f"Loading YOLO model: {MODEL_NAME} ...")

try:
    _yolo_model = YOLO(MODEL_NAME)
except Exception as e:
    # Chain explicitly so the loader's original error is preserved as __cause__.
    raise RuntimeError(f"Failed to load YOLO model '{MODEL_NAME}': {e}") from e

# A set gives constant-time membership checks for each detection's class name.
_target_classes = set(TARGET_CLASSES)

def _best_square_crop_from_results(image, results, names):
    """Pick the most confident target-class detection and build a square crop around it.

    Every box in ``results`` whose class name (looked up in ``names``, falling
    back to the stringified class id) is in ``_target_classes`` is a candidate.
    The candidate with the highest confidence wins; on ties the first one seen
    is kept.

    Returns:
        ``(x1, y1, x2, y2)`` of the largest square centred on the winning box's
        centre that still fits inside ``image``, or ``None`` when no candidate
        exists or the centre leaves no room for a square.
    """
    height, width = image.shape[:2]

    winner = None
    winner_score = -1.0
    # Flatten the per-result box collections, skipping results without boxes.
    detections = (
        det
        for res in results
        if res.boxes is not None
        for det in res.boxes
    )
    for det in detections:
        label_id = int(det.cls[0])
        if names.get(label_id, str(label_id)) not in _target_classes:
            continue
        score = float(det.conf[0])
        if score > winner_score:
            winner_score = score
            left, top, right, bottom = det.xyxy[0].cpu().numpy().tolist()
            winner = (left, top, right, bottom)

    if winner is None:
        return None

    left, top, right, bottom = winner
    center_x = int((left + right) / 2)
    center_y = int((top + bottom) / 2)

    # The half-side is capped by the distance from the centre to the nearest image edge.
    edge_gaps = (center_y, height - center_y, center_x, width - center_x)
    radius = int(min(edge_gaps))
    if radius <= 0:
        return None

    crop_left = max(0, center_x - radius)
    crop_top = max(0, center_y - radius)
    crop_right = min(width, center_x + radius)
    crop_bottom = min(height, center_y + radius)
    return (crop_left, crop_top, crop_right, crop_bottom)

print("PHASE 1: YOLO Detection and Crop Region Precomputation")
# Precompute crop regions for all valid images in target_df
print(f"Scanning {len(target_df)} images for target objects: {sorted(_target_classes)}")
_crop_index = {}   # file path -> (x1, y1, x2, y2) square crop box
_skipped = 0       # rows without a readable image, or whose detection raised
_failures = []     # (path, error repr) for rows whose detection raised
# Only the file_path column is used, so iterate it directly instead of building
# a Series for every row with iterrows().
for p in target_df["file_path"]:
    if not isinstance(p, str) or not os.path.isfile(p):
        _skipped += 1
        continue
    img = cv2.imread(p)
    if img is None:
        _skipped += 1
        continue
    try:
        results = _yolo_model(img, verbose=False)
        crop_box = _best_square_crop_from_results(img, results, _yolo_model.names)
    except Exception as exc:
        # Still best-effort (one bad image must not abort the scan), but keep
        # the error so failures are visible instead of silently counted.
        _skipped += 1
        _failures.append((p, repr(exc)))
        continue
    if crop_box is not None:
        _crop_index[p] = crop_box

print(f"Found {len(_crop_index)} images with target detections. Skipped: {_skipped}")
if _failures:
    print(f"{len(_failures)} image(s) raised during detection; first few:")
    for failed_path, err in _failures[:5]:
        print(f"  {failed_path}: {err}")

def yolo_filter_fn(path: str) -> bool:
    """Filter predicate: keep only paths that have an entry in ``_crop_index``."""
    has_crop_box = path in _crop_index
    return has_crop_box

def yolo_crop_map_fn(path: str) -> bytes:
    """Crop an image to its stored box, resize it to FINAL_SIZE, and return PNG bytes.

    Args:
        path: Image file to read with ``cv2.imread``.

    Returns:
        PNG-encoded bytes of the cropped, resized image.

    Raises:
        ValueError: if the image cannot be read.
        RuntimeError: if PNG encoding fails.
    """
    img = cv2.imread(path)
    if img is None:
        raise ValueError(f"Could not read image: {path}")
    img_h, img_w = img.shape[:2]
    # Paths without a stored crop box fall back to the whole frame.
    x1, y1, x2, y2 = _crop_index.get(path, (0, 0, img_w, img_h))
    cropped = img[y1:y2, x1:x2]
    if cropped.size == 0:
        cropped = img  # fallback to original if something went wrong
    src_h, src_w = cropped.shape[:2]
    dst_w, dst_h = FINAL_SIZE  # cv2.resize takes dsize as (width, height)
    # INTER_AREA is intended for shrinking; for enlarging, OpenCV's documentation
    # recommends INTER_CUBIC or INTER_LINEAR. Small crops (object close to an image
    # edge) are enlarged to FINAL_SIZE, so choose the method by resize direction.
    # NOTE(review): pixels of enlarged crops will differ from variations generated
    # with the previous always-INTER_AREA behaviour; regenerate them if that matters.
    shrinking = dst_w <= src_w and dst_h <= src_h
    interpolation = cv2.INTER_AREA if shrinking else cv2.INTER_CUBIC
    resized = cv2.resize(cropped, FINAL_SIZE, interpolation=interpolation)
    ok, buf = cv2.imencode(".png", resized)
    if not ok:
        raise RuntimeError("Failed to encode cropped image.")
    return buf.tobytes()

# Tag for this variation, built from the model name and the output resolution.
variation_tag = f"yolo_crop_{MODEL_NAME}_to_{FINAL_SIZE[0]}x{FINAL_SIZE[1]}"
step1_path = create_dataset_variation(target_df, yolo_filter_fn, yolo_crop_map_fn, variation_tag=variation_tag)
print("New variation CSV:", step1_path)
# Record the CSV path in step1.txt; `with` closes the file even if the write raises.
with open("step1.txt", "w") as f:
    f.write(step1_path)



Loading YOLO model: yolo11x-seg.pt ...
PHASE 1: YOLO Detection and Crop Region Precomputation
Scanning 5431 images for target objects: ['apple', 'avocado', 'banana']
Found 3805 images with target detections. Skipped: 0
Wrote 3826 rows to /home/fadhlan/Normal2/DeepLearningRepo/steps/variations/var_8d36ba17d2f23fc8/8d36ba17d2f23fc8.csv (variation 8d36ba17d2f23fc8)
New variation CSV: /home/fadhlan/Normal2/DeepLearningRepo/steps/variations/var_8d36ba17d2f23fc8/8d36ba17d2f23fc8.csv
