# Search through the original OLM CSV data to look for examples of under-represented classes

In [72]:
import pandas as pd
import os

# Read the spreadsheet; the same file is overwritten at the end of this cell
excel_path = '~/Desktop/litter-imagery-brand-recognition/data/final_output_with_images.xlsx'
df = pd.read_excel(excel_path)

# Every row starts out flagged as "not in the training set"
df['training_data'] = False

# Base names (text before the first dot) of the files in the training image folder.
# NOTE: train_images_path is not assigned in this cell; it must already exist in the session.
train_files = set()
for entry in os.listdir(train_images_path):
    train_files.add(entry.split('.')[0])

# Flag each row whose S3 key (extension dropped, '/' replaced by '_') names a training file
url_prefix = 'https://olm-s3.s3.eu-west-1.amazonaws.com/'
for row_label, url in zip(df.index, df['image_url']):
    s3_key = url.split(url_prefix)[-1]
    flattened_key = s3_key.split('.')[0].replace('/', '_')
    if flattened_key in train_files:
        df.at[row_label, 'training_data'] = True

# Write the flagged table back over the source spreadsheet
df.to_excel(excel_path, index=False)

# Using OLM CSV data & text mining results, determine new photos to add to dataset

In [60]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords

# Load ./google_api.csv; each row carries an s3:// location in its 's3_path' column
google_texts = pd.read_csv('./google_api.csv')

# Rewrite the s3:// bucket prefix to the public HTTPS prefix so rows can be joined on 'image_url'
s3_bucket_prefix = 's3://olm-pics-s3/'
https_bucket_prefix = 'https://olm-s3.s3.eu-west-1.amazonaws.com/'
google_texts['image_url'] = google_texts['s3_path'].str.replace(
    s3_bucket_prefix, https_bucket_prefix, regex=False
)

# Helper that turns raw strings into a set of comparable word tokens
def clean_text(texts):
    """Tokenise every string in *texts* and return the distinct tokens.

    For each string: a space is inserted at lower/digit-to-upper boundaries
    (so "bodyArmor" becomes "body Armor"), the text is lower-cased, every
    non-letter character becomes a space, and the result is split on
    whitespace. Tokens that are in the module-level ``stop_words`` set or
    that are a single character long are dropped.

    Args:
        texts: Iterable of strings.

    Returns:
        set: The surviving tokens from all strings combined, which allows
        subset checks against other token sets.
    """
    tokens = set()
    for raw in texts:
        camel_split = re.sub('([a-z0-9])([A-Z])', r'\1 \2', raw)
        letters_only = re.sub('[^a-zA-Z]', ' ', camel_split.lower())
        tokens.update(
            token
            for token in letters_only.split()
            if token.lower() not in stop_words and len(token) > 1
        )
    return tokens

import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords

# Read the spreadsheet and attach the google_texts columns to each row by image URL
df = pd.read_excel(
    '~/Desktop/litter-imagery-brand-recognition/data/final_output_with_images.xlsx'
).merge(google_texts, on='image_url', how='left')

# Folder holding the images that already belong to the training set
train_images_path = '/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition_Final.v2i.yolov8/train/images'

# Base names (text before the first dot) of every file in that folder
train_files = set()
for entry in os.listdir(train_images_path):
    train_files.add(entry.split('.')[0])

# Brand classes to look for
brand_names = [
    "sprite", "lays", "fanta", "dr_pepper", "dunkin", "nestle_pure_life",
    "niagara", "philip_morris", "cheetos", "miller_light", "heinz",
    "aquafina", "fireball", "amstel_light", "burger_king", "dasani",
    "mountain_dew", "white_claw", "great_value", "powerade", "reeses",
    "canada_dry", "milwaukees_best", "snickers", "busch_light", "mikes_hard",
    "fritos", "dos_equis", "hill_country_fare", "wendys", "natural_ice",
    "brisk", "funyuns", "koolaid", "tropicana", "wawa", "blue_moon",
    "panda_express", "popeyes", "ruffles", "nos", "starburst", "big_gulp",
    "bodyarmor", "caprisun", "cheezit", "kitkat", "ozarka", "poland_spring",
    "solo", "truly", "jarritos", "quick_trip", "target", "welchs"
]

# Fetch the NLTK English stop-word list and keep it as a set for fast lookups
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# One token set per brand, produced by the same cleaner that is applied to detected text
classes = []
for brand in brand_names:
    classes.append(clean_text([brand]))

# Both flag columns start as False for every row
for flag_column in ('training_data', 'is_in_classes'):
    df[flag_column] = False

# Mark rows whose S3 key (extension dropped, '/' replaced by '_') names a training file
url_prefix = 'https://olm-s3.s3.eu-west-1.amazonaws.com/'
for row_label, url in zip(df.index, df['image_url']):
    flattened_key = url.split(url_prefix)[-1].split('.')[0].replace('/', '_')
    if flattened_key in train_files:
        df.at[row_label, 'training_data'] = True

# Update 'is_in_classes': a row qualifies when it is not already training data and
# every token of at least one brand class appears among the tokens detected for it.

# Concatenated spelling of each brand ("burgerking" for "burger_king") so tags or
# detected text written without a separator still match. Tokens are joined in the
# order they occur in the brand name: joining the token *set* directly yields an
# arbitrary order (e.g. "kingburger") that can change from one run to the next.
compact_classes = []
for brand, brand_tokens in zip(brand_names, classes):
    spaced = re.sub('([a-z0-9])([A-Z])', r'\1 \2', brand).lower()
    ordered_tokens = [
        token for token in re.sub('[^a-zA-Z]', ' ', spaced).split()
        if token in brand_tokens
    ]
    compact_classes.append({''.join(ordered_tokens)})

# Either spelling counts as a match; built once instead of once per row
match_sets = classes + compact_classes

# Positional slice assumed to hold the per-brand columns -- TODO confirm against the sheet layout
brand_columns = list(df.columns[125:229])
text_columns = ['custom_tag_1', 'custom_tag_2', 'custom_tag_3', 'texts']

for index, row in df.iterrows():
    # Rows already in the training set can never be flagged, so skip the token work
    if row['training_data']:
        continue

    detected_brands = set()
    for col in brand_columns:
        if row[col] > 0:
            detected_brands.update(clean_text([col]))
    for tag in text_columns:
        if pd.notna(row[tag]):
            # str() keeps numeric cells (e.g. a tag typed as a number) from breaking re.sub
            detected_brands.update(clean_text([str(row[tag])]))

    # Check if any brand class (split or concatenated) is a subset of the detected tokens
    if any(brand_set.issubset(detected_brands) for brand_set in match_sets):
        df.at[index, 'is_in_classes'] = True
        print(f"Matching Brands: {detected_brands}, Image URL: {row['image_url']}")

# Filter and output relevant images
relevant_mask = df['is_in_classes'] & ~df['training_data']
relevant_images = df.loc[relevant_mask, 'image_url'].tolist()
print(relevant_images)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickjohnson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Matching Brands: {'body', 'raspberry', 'blue', 'armor', 'bodyarmor', 'brand'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2022/06/08/PGE8AsgVdBek8nFRF6chQNgB7QfwqzBsdEikKq4z.jpg
Matching Brands: {'runs', 'com', 'beverage', 'dunkin', 'ca', 'donuts', 'extremely', 'hot'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2018/11/10/ejSEpc9T61mo6Iv45VLhhN06TJKNuykr6nfeLBoT.jpeg
Matching Brands: {'ng', 'eed', 'ies', 'burgerking'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2019/02/04/y5hnxzSo193EzkmeSg0e79yWBiKGUHjBl7SOBp1t.jpeg
Matching Brands: {'cardboard', 'paper', 'caprisun', 'packaging'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2023/03/05/IvNHhxS62ik8tn3vldNGjWDgwiMLvvHhFtCMNN2R.jpg
Matching Brands: {'jarritos', 'brand'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2023/03/05/ZASE6hlok465o6JAErnQwvXbSwko7E3q2ZPoMws2.jpg
Matching Brands: {'heinz', 'kraft', 'brand'}, Image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2023/03/15/pZsGK5XL0Tvpy

# Download & Crop Images

In [62]:
import os
import requests
from PIL import Image
import imageio.v3 as iio

def download_image(url, download_path, timeout=30):
    """Download an image from a URL to a specified path.

    Args:
        url: Address of the image to fetch.
        download_path: File path the response body is written to.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        bool: True when the server answered with status 200 and the body was
        written; False for any other status, in which case no file is written.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    with open(download_path, 'wb') as f:
        f.write(response.content)
    return True

def convert_heic_to_jpeg(heic_path, target_path):
    """Convert a HEIC file to a JPEG file, removing the original.

    Args:
        heic_path: Path of the HEIC file to read.
        target_path: Path the JPEG is written to.

    Returns:
        The ``target_path`` that was written.
    """
    img = iio.imread(heic_path)
    image = Image.fromarray(img)
    # JPEG has no alpha channel; saving e.g. an RGBA image would raise, so
    # anything outside the modes JPEG can store is converted to RGB first.
    if image.mode not in ('RGB', 'L', 'CMYK'):
        image = image.convert('RGB')
    image.save(target_path, format="JPEG")
    # Clean up the original HEIC file, but never delete the file just written
    # when both paths refer to the same location.
    if os.path.abspath(heic_path) != os.path.abspath(target_path):
        os.remove(heic_path)
    return target_path

def resize_image(image_path, width=640):
    """Scale the image at *image_path* to *width* pixels wide, in place.

    The height is scaled by the same factor (truncated to an integer) so the
    aspect ratio is kept, and the result is saved over the original file.

    Args:
        image_path: Path of the image to resize and overwrite.
        width: Target width in pixels.
    """
    with Image.open(image_path) as source:
        original_width, original_height = source.size
        scale = width / float(original_width)
        target_height = int(float(original_height) * float(scale))
        resized = source.resize((width, target_height), Image.Resampling.LANCZOS)
        # Overwrite the file with the resized version
        resized.save(image_path)

def extract_s3_key(url):
    """Build a flat file-name stem from the path part of an OLM S3 URL.

    The URL is split on '/', and every segment after the bucket host is
    joined with underscores.

    Args:
        url: URL containing 'olm-s3.s3.eu-west-1.amazonaws.com' as one of its
            '/'-separated segments.

    Returns:
        str: The segments after the host joined by '_' (empty if there are none).

    Raises:
        ValueError: If the bucket host is not one of the URL's segments.
    """
    bucket_host = 'olm-s3.s3.eu-west-1.amazonaws.com'
    segments = url.split('/')
    key_segments = segments[segments.index(bucket_host) + 1:]
    return '_'.join(key_segments)

# Download and process each image
output_dir = './new_photos'
# Create the target folder up front; without it every download fails when the file is opened
os.makedirs(output_dir, exist_ok=True)

for url in relevant_images:
    try:
        # Key extraction sits inside the try so one malformed URL is reported
        # and skipped instead of aborting the remaining downloads.
        s3_key = extract_s3_key(url)
        extension = url.rsplit('.', 1)[-1].lower()
        file_name = f"{s3_key.split('.')[0]}.{extension}"
        download_path = os.path.join(output_dir, file_name)  # Construct full path with new filename

        download_image(url, download_path)

        # HEIC downloads are converted to JPEG; only the extension is swapped
        if download_path.lower().endswith('.heic'):
            jpeg_path = os.path.splitext(download_path)[0] + '.jpg'
            download_path = convert_heic_to_jpeg(download_path, jpeg_path)

        # Resize the image
        resize_image(download_path)
    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next URL
        print(f"Error processing image {url}: {e}")

# Split into train, val, test

In [237]:
import os
import numpy as np
import yaml
from skmultilearn.model_selection import iterative_train_test_split

# Load YAML configuration
with open('/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8/data.yaml', 'r') as yamlfile:
    cfg = yaml.safe_load(yamlfile)

# Extract class names and paths
classes = cfg['names']
train_images_path = '/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8/train/images'
label_dir = '/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8/train/labels'

# Initialize label matrix: one row per .jpg image, one column per class
image_files = [f for f in os.listdir(train_images_path) if f.endswith('.jpg')]
Y = np.zeros((len(image_files), len(classes)), dtype=int)
image_features = []

# Populate Y matrix from labels: Y[i, c] = 1 when image i has at least one label line for class c
for idx, image in enumerate(image_files):
    # Swap only the extension; a blanket '.jpg' replace would also rewrite
    # any earlier '.jpg' occurring inside the file name.
    label_path = os.path.join(label_dir, os.path.splitext(image)[0] + '.txt')
    image_features.append(image)
    # An image without a label file simply keeps an all-zero row
    if os.path.exists(label_path):
        with open(label_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                # The first field of a label line is the class id
                Y[idx, int(fields[0])] = 1

# Split dataset into training, validation, and test sets (80 / 10 / 10 by test_size)
X_filtered = np.array(image_features)
X_train, Y_train, X_temp, Y_temp = iterative_train_test_split(X_filtered.reshape(-1, 1), Y, test_size=0.2)
X_val, Y_val, X_test, Y_test = iterative_train_test_split(X_temp, Y_temp, test_size=0.5)

# Minimum number of test images wanted for every class
required_annotations = 10
test_class_counts = np.sum(Y_test, axis=0)

print("Initial test class counts:", test_class_counts)

# Function to dynamically calculate scores for each image
def get_image_scores(Y_data, test_class_counts, min_required=None):
    """Score each image by how useful moving it into the test set would be.

    For every class present in an image (label value equal to 1) the score
    gains 1 when the test set still has fewer than ``min_required`` images of
    that class, and loses 0.75 when the test set already has enough. Classes
    absent from the image contribute nothing.

    Args:
        Y_data: Iterable of per-image label rows (one 0/1 entry per class).
        test_class_counts: Current number of test images per class, indexed
            like the label rows.
        min_required: Threshold a class must reach in the test set. Defaults
            to the module-level ``required_annotations`` when omitted.

    Returns:
        list: One numeric score per row of ``Y_data``, in the same order.
    """
    if min_required is None:
        min_required = required_annotations

    scores = []
    for labels in Y_data:
        score = 0
        for class_idx in range(len(labels)):
            if labels[class_idx] != 1:
                continue
            # Reward classes the test set still lacks, penalise ones it already covers
            score += 1 if test_class_counts[class_idx] < min_required else -0.75
        scores.append(score)
    return scores

# Dynamic optimization to minimize changes: move images from train/val into test,
# one at a time, until every class has at least required_annotations test images.

def _best_helpful_candidate(Y_split, scores, needed):
    """Return (index, score) of the best-scoring image that has a still-needed class.

    Only images carrying at least one class flagged in ``needed`` are eligible,
    so an image that cannot help (e.g. one with no labels and score 0) never
    outranks a useful one. Ties keep the lowest index. Returns
    ``(-1, -inf)`` when the split has no eligible image.
    """
    best_idx, best_score = -1, float('-inf')
    for idx, score in enumerate(scores):
        if score > best_score and np.any((Y_split[idx] == 1) & needed):
            best_idx, best_score = idx, score
    return best_idx, best_score


image_moved_count = 0
while any(test_class_counts < required_annotations):
    # Boolean mask of the classes the test set is still short of
    needed = test_class_counts < required_annotations

    # Update scores for train and val
    train_scores = get_image_scores(Y_train, test_class_counts)
    val_scores = get_image_scores(Y_val, test_class_counts)

    # Find best useful candidate from train and val
    train_idx, train_best = _best_helpful_candidate(Y_train, train_scores, needed)
    val_idx, val_best = _best_helpful_candidate(Y_val, val_scores, needed)

    # Nothing left that could raise a deficient class: stop instead of looping forever
    if train_idx == -1 and val_idx == -1:
        print("No remaining train/val image contains an under-represented class; stopping.")
        break

    # Ties go to train, as an empty/ineligible split scores -inf it is never chosen
    selected_train = train_best >= val_best
    selected_idx = train_idx if selected_train else val_idx

    # Move the selected image to test
    if selected_train:
        Y_test = np.vstack([Y_test, Y_train[selected_idx]])
        X_test = np.vstack([X_test, X_train[selected_idx]])
        Y_train = np.delete(Y_train, selected_idx, axis=0)
        X_train = np.delete(X_train, selected_idx, axis=0)
    else:
        Y_test = np.vstack([Y_test, Y_val[selected_idx]])
        X_test = np.vstack([X_test, X_val[selected_idx]])
        Y_val = np.delete(Y_val, selected_idx, axis=0)
        X_val = np.delete(X_val, selected_idx, axis=0)

    image_moved_count += 1
    # Recalculate test class counts and scores
    test_class_counts = np.sum(Y_test, axis=0)
    print(f"Iteration {image_moved_count}, Moved image: {'train' if selected_train else 'val'}, Updated test class counts: {test_class_counts}")

    if all(test_class_counts >= required_annotations):
        print("All classes have met the minimum required annotations.")
        break

# Flatten arrays for consistency (the splits are column vectors of file names)
X_train = X_train.ravel()
X_val = X_val.ravel()
X_test = X_test.ravel()

Initial test class counts: [ 2  4 12  4  4  3 18  6 22  3  7  7  3  4  4  2 10  2  2 21  4  6  4  7
  3  9  4  3  7  8  2  4  3  3  4  9  3 10  1  7  2  5  7  4  1]
Iteration 1, Moved image: train, Updated test class counts: [ 2  4 13  4  4  3 19  6 22  3  7  7  3  4  4  2 10  2  2 21  4  6  4  8
  4  9  4  3  7  9  3  4  3  3  4  9  3 10  1  7  2  6  7  5  1]
Iteration 2, Moved image: val, Updated test class counts: [ 2  4 13  4  4  3 19  6 22  3  8  7  3  4  4  2 11  2  2 21  4  6  4  8
  5  9  4  3  7 10  3  4  4  3  4  9  3 10  1  7  2  6  7  5  2]
Iteration 3, Moved image: train, Updated test class counts: [ 2  4 13  4  4  3 19  6 22  3  8  7  3  4  4  2 11  2  2 21  4  7  4  8
  5 10  4  3  7 10  3  4  4  3  4 10  3 10  1  7  2  6  8  5  2]
Iteration 4, Moved image: val, Updated test class counts: [ 2  4 13  4  4  3 19  6 22  3  8  7  3  4  4  2 11  2  2 21  4  7  4  8
  6 10  4  3  7 10  3  4  4  3  4 10  4 10  1  7  2  6  9  6  2]
Iteration 5, Moved image: train, Updated test c

In [239]:
X_val.shape

(160,)

In [240]:
X_test.shape

(316,)

In [241]:
import os
import shutil

# Define base paths: all images and labels currently live under train/
base_dir = '/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8'
train_images_path = os.path.join(base_dir, 'train/images')
train_labels_path = os.path.join(base_dir, 'train/labels')

# Define new structure paths (the train destination is the same folder as the source)
new_train_images_path = os.path.join(base_dir, 'train/images')
new_train_labels_path = os.path.join(base_dir, 'train/labels')
new_val_images_path = os.path.join(base_dir, 'val/images')
new_val_labels_path = os.path.join(base_dir, 'val/labels')
new_test_images_path = os.path.join(base_dir, 'test/images')
new_test_labels_path = os.path.join(base_dir, 'test/labels')

# Create new directories if they do not exist; exist_ok avoids the
# check-then-create gap and makes re-running the cell harmless
for path in [new_train_images_path, new_train_labels_path, new_val_images_path, new_val_labels_path, new_test_images_path, new_test_labels_path]:
    os.makedirs(path, exist_ok=True)

# Helper that relocates a batch of files between two folders
def move_files(file_list, src_dir, dest_dir):
    """Move every file named in *file_list* from *src_dir* into *dest_dir*.

    Each file keeps its name; the move is done with ``shutil.move``.

    Args:
        file_list: Iterable of file names (not full paths).
        src_dir: Folder the files are currently in.
        dest_dir: Folder the files are moved to.
    """
    for name in file_list:
        shutil.move(os.path.join(src_dir, name), os.path.join(dest_dir, name))

def _existing_label_names(image_names):
    """Return the label file names for *image_names* that exist in train_labels_path.

    The split step treats a label file as optional, so an image may have none;
    asking move_files to move a missing label would raise and leave the
    dataset half reorganized.
    """
    candidates = (name.replace('.jpg', '.txt') for name in image_names)
    return [
        label for label in candidates
        if os.path.exists(os.path.join(train_labels_path, label))
    ]


# Move training files
move_files(X_train, train_images_path, new_train_images_path)
move_files(_existing_label_names(X_train), train_labels_path, new_train_labels_path)

# Move validation files
move_files(X_val, train_images_path, new_val_images_path)
move_files(_existing_label_names(X_val), train_labels_path, new_val_labels_path)

# Move test files
move_files(X_test, train_images_path, new_test_images_path)
move_files(_existing_label_names(X_test), train_labels_path, new_test_labels_path)

print("Files have been reorganized successfully.")

Files have been reorganized successfully.


In [1]:
import albumentations
import cv2
import csv
import numpy as np
import random

# Image and Label Path
img_name = '112_Gq5m3KeQJUFNOCzkltNk9t92ucUgueWt9dVVTxUk_jpg.rf.24dd075073cdcefce324a6468c515818'
image_path = '/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8/train/images/'+img_name+'.jpg'
image = cv2.imread(image_path)
# cv2.imread reports a missing or unreadable file by returning None rather than
# raising, which would otherwise surface as an opaque AttributeError on .shape
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# Image dimensions in pixels, used to scale the normalised label coordinates
img_height, img_width, _ = image.shape

def get_bb_params(img_name, labels_dir='/Users/nickjohnson/downloads/Capstone_OLM_Logo_Recognition.v4i.yolov8/train/labels/', width=None, height=None):
    """Read an image's label file and return its boxes as pixel corner coordinates.

    Each non-blank line of ``<labels_dir>/<img_name>.txt`` must hold five
    whitespace-separated numbers: class id, box centre x, box centre y, box
    width and box height, the last four as fractions of the image size. They
    are converted to left/top/right/bottom pixel values.

    Args:
        img_name: Label file name without the '.txt' extension.
        labels_dir: Folder containing the label files.
        width: Image width in pixels; defaults to the module-level ``img_width``.
        height: Image height in pixels; defaults to the module-level ``img_height``.

    Returns:
        list: One ``[left, top, right, bottom, class_id]`` list of floats per
        label line, in file order.

    Raises:
        ValueError: If a line does not contain exactly five numeric fields.
    """
    import os  # local import: this cell does not import os at the top

    if width is None:
        width = img_width
    if height is None:
        height = img_height

    bb_output = []
    with open(os.path.join(labels_dir, img_name + '.txt'), 'r') as fd:
        for line in fd:
            # split() copes with repeated spaces/tabs; blank lines are skipped
            fields = line.split()
            if not fields:
                continue
            class_id, center_x, center_y, bbox_width, bbox_height = (float(x) for x in fields)

            # Centre/size fractions -> corner positions in pixels
            l = (center_x - bbox_width / 2) * width
            r = (center_x + bbox_width / 2) * width
            t = (center_y - bbox_height / 2) * height
            b = (center_y + bbox_height / 2) * height

            bb_output.append([l, t, r, b, class_id])
    return bb_output

def apply_augmentation(transformation, image, bb_params):
    """Run *transformation* on an image and its boxes, then draw the resulting boxes.

    Args:
        transformation: Callable invoked with ``image``, ``bboxes`` and
            ``class_labels`` keyword arguments; its result must provide
            'image' and 'bboxes' entries.
        image: Image passed through to the transformation.
        bb_params: Boxes as five-element sequences whose last element is the
            class id.

    Returns:
        The transformed image with a 2-pixel rectangle of colour (255, 0, 0)
        drawn on it for every transformed box.
    """
    labels = [int(params[-1]) for params in bb_params]
    result = transformation(image=image, bboxes=bb_params, class_labels=labels)
    canvas = result['image']

    # Each transformed box is unpacked as four corner values plus the class id
    for left, top, right, bottom, _class_id in result['bboxes']:
        top_left = (int(left), int(top))
        bottom_right = (int(right), int(bottom))
        cv2.rectangle(canvas, top_left, bottom_right, (255, 0, 0), 2)

    return canvas

# Build the augmentation pipeline. One angle is drawn up front and passed as both
# ends of Rotate's limit range, so the rotation range collapses to that single value.
rotation_degrees = random.randint(45, 270)
augmentation_steps = [
    albumentations.HorizontalFlip(p=0.5),
    albumentations.RandomBrightnessContrast(p=0.5),
    albumentations.Rotate(limit=(rotation_degrees, rotation_degrees), p=1),
]
bbox_settings = albumentations.BboxParams(format='pascal_voc', label_fields=['class_labels'])
transform = albumentations.Compose(augmentation_steps, bbox_params=bbox_settings)

# Read the boxes for the chosen image from its label file
bounding_box_parameters = get_bb_params(img_name)

# Augment the image and draw the transformed boxes on it
augmented_image = apply_augmentation(transform, image, bounding_box_parameters)

# Show the result in a window and wait for a key press before closing it
cv2.imshow('Augmented Image', augmented_image)
cv2.waitKey(0)
cv2.destroyAllWindows()