## Load Images: Load all images from the specified folder.

In [1]:
import os
from PIL import Image
import random
import itertools
from collections import defaultdict
from typing import List, Tuple, Dict, Literal
import torch
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from pathlib import Path

def load_images_from_folder(folder_path):
    images = []
    for filename in os.listdir(folder_path):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(folder_path, filename)
            img = Image.open(img_path).convert("RGBA")
            images.append((filename, img))
    return images


image_folder = "/home/temesgen_gebreabzgi/semantic_image_and_text_alignment/data/Challenge_Data/Assets/0792c911c288bd241c1e2a2b64bce488"
images = load_images_from_folder(image_folder)
images

[('game_4.png', <PIL.Image.Image image mode=RGBA size=571x355>),
 ('game_2.png', <PIL.Image.Image image mode=RGBA size=299x94>),
 ('game_6.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('end video.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('Layer 2.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('engagement_animation_1.png',
  <PIL.Image.Image image mode=RGBA size=142x173>),
 ('move.png', <PIL.Image.Image image mode=RGBA size=597x900>),
 ('tap.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('landing_1.jpg', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('game_5.png', <PIL.Image.Image image mode=RGBA size=647x476>),
 ('black.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('cesar - video -end.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('ingredients-2.png', <PIL.Image.Image image mode=RGBA size=319x83>),
 ('_preview.png', <PIL.Image.Image image mode=RGBA size=600x900>),
 ('cta.png', <PIL.Image.Image image mode=RGBA size=5

### Object detection

In [2]:
# Object Detection model
def load_detection_model():
    model = fasterrcnn_resnet50_fpn(pretrained=True)
    model.eval()
    return model

def detect_objects(image: Image.Image, model) -> List[Dict[str, Tuple]]:
    transform = T.Compose([T.ToTensor()])
    img_tensor = transform(image)
    predictions = model([img_tensor])[0]
    return predictions

def label_objects(predictions) -> Dict[str, Tuple]:
    # Assuming we have a function that maps detection labels to our categories
    labels = predictions['labels']
    boxes = predictions['boxes']
    labeled_boxes = {}
    for label, box in zip(labels, boxes):
        category = map_label_to_category(label)
        labeled_boxes[category] = box
    return labeled_boxes

In [None]:
"""def detect_objects(image):
    # Placeholder for object detection logic
    detected_objects = ["logo", "text", "background_image", "CTA_button", "end_frame"]
    return detected_objects

# Detect objects in each image
image_objects = [(filename, detect_objects(img)) for filename, img in images]

# Assuming images is a list of tuples (filename, img)
image_objects = [(filename, detect_objects(img, model)) for filename, img in images]
"""

In [3]:
def map_label_to_category(label) -> str:
    # Map object detection label to our categories
    label_map = {1: "Logo", 2: "CTA Button", 3: "Icon", 4: "Product Image", 5: "Text Elements", 6:"Background", 7:"End frame"}
    return label_map.get(label.item(), "Unknown")


In [4]:

# Load detection model
model = load_detection_model()

# Detect objects and label them
labeled_images = []
for filename, image in images:
    predictions = detect_objects(image, model)
    labeled_boxes = label_objects(predictions)
    labeled_images.append((filename, image, labeled_boxes))

print("Labeled images: ", labeled_images)




RuntimeError: The size of tensor a (4) must match the size of tensor b (3) at non-singleton dimension 0

In [None]:
def create_ad_frame(images, frame_size=(320, 480)):
    # Create a blank frame
    frame = Image.new("RGBA", frame_size, (255, 255, 255, 0))
    
    for filename, img, category in images:
        if category == "Background":
            frame.paste(img.resize(frame_size), (0, 0))
        elif category == "Logo":
            frame.paste(img.resize((80, 80)), (10, 10), img)
        elif category == "CTA Button":
            frame.paste(img.resize((100, 50)), (110, 400), img)
        elif category == "Product Image":
            frame.paste(img.resize((100, 100)), (110, 150), img)
        elif category == "Text Elements":
            frame.paste(img.resize((280, 50)), (20, 100), img)
        elif category == "End Frame":
            frame.paste(img.resize(frame_size), (0, 0))
        # Add more conditions for other categories as needed

    return frame

### image detection using yolo8

In [None]:
from ultralytics import YOLO
model = YOLO("yolov8m.pt")
results = model.predict(image_folder)