In [1]:
#! pip install ultralytics 
#! pip install psycopg2
import os
import subprocess

import boto3
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from PIL import Image
from ultralytics import YOLO

import db_ops
import im_utils
# Shared S3 client (no explicit region or credentials are passed here);
# the cell that lists the processed bucket's box files uses it.
s3_client = boto3.client('s3')

Connecting to the PostgreSQL database...
Connection successful


#### Get the full list of OLM images

In [2]:
# Get a full list of box files already in the processed bucket.
# Only the paginator is used: a single direct list_objects_v2 call returns one
# page at most and its result was never read, so that extra request is dropped.
processed_bucket = 'olm-processed-s3'
processed_files_uris = []
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=processed_bucket, Prefix='box')

for page in pages:
    # A page has no 'Contents' entry when no key matches the prefix, so fall
    # back to an empty list instead of raising KeyError.
    processed_files_uris.extend(obj['Key'] for obj in page.get('Contents', []))

In [9]:
# (Re)create the "box" table that the extraction loop fills with one row per detected box.
# WARNING: the statement drops any existing "box" table first, so re-running this
# cell discards every row stored so far.
# Columns: box_id (serial primary key), image_id / brand_id / cat_id (integers),
# the four box corner coordinates (SMALLINT), the S3 URI of the cropped box
# picture, and conf (NUMERIC(5,4)).
query = """

drop table if exists box;

CREATE TABLE "box"(box_id SERIAL PRIMARY KEY
                    ,image_id INTEGER
                    ,brand_id INTEGER
                    ,cat_id INTEGER
                    ,xmin_coord SMALLINT
                    ,ymin_coord SMALLINT
                    ,xmax_coord SMALLINT
                    ,ymax_coord SMALLINT
                    ,box_pic_uri VARCHAR(255)
                    ,conf NUMERIC(5,4)
);

"""
db_ops.run_single_query(query)

In [3]:
# Get a df with one row per image. Note: the query has no filter, so this is
# every image in the `image` table, not only unprocessed ones.
# DISTINCT ON (i.image_id) keeps a single row per image; the ORDER BY only sorts
# by image_id, so which brand/category row survives for an image with several
# links is not pinned down by this query.
# The LEFT JOINs leave brand_id / cat_id NULL for images without a link.
# NOTE(review): the two positional True flags are defined in db_ops — confirm their meaning there.
query = f'''
SELECT DISTINCT ON (i.image_id)
	 i.s3_uri AS uri
	, i.image_id as image_id
	, b.brand_id as brand_id 
	, ci.cat_id as cat_id
FROM image AS i 
	LEFT JOIN brand_image AS bi
		ON i.image_id = bi.image_id
	LEFT JOIN brand as b
		ON b.brand_id = bi.brand_id 
	LEFT JOIN category_image as ci
		ON i.image_id = ci.image_id
ORDER BY i.image_id	;'''
image_uri = db_ops.select_to_pandas(query, True, True)
image_uri.head(3)

  df = pd.read_sql_query(query, connection)


Unnamed: 0,uri,image_id,brand_id,cat_id
0,s3://olm-pics-s3/112/Pftq4550WzgZA5euhgIEsLHGs...,1,,47.0
1,s3://olm-pics-s3/112/HKXafwXvrpBQjoWC9aPtSY8lT...,2,,39.0
2,s3://olm-pics-s3/112/69M58FGWNVp1ZTObF0YmkMfGN...,3,,122.0


In [4]:
def _uri_to_stem(uri):
    """Return the last path component of ``uri`` cut at its first '.'."""
    file_name = uri.split('/')[-1]
    return file_name.split('.')[0]


# Missing brand / category ids are replaced by placeholder ids before the integer cast.
for id_column, placeholder_id in (('brand_id', 111), ('cat_id', 20)):
    image_uri[id_column] = image_uri[id_column].fillna(value = placeholder_id).astype(int)

# Extension-less file name of each image, derived from its S3 URI.
image_uri['file'] = image_uri['uri'].apply(_uri_to_stem)
len(image_uri)

27388

In [5]:
# Get a df of all rows in the `brand` table. Its brand_name / brand_id columns
# are what the extraction loop uses to turn a detected class name into a brand id.
# NOTE(review): the two positional True flags are defined in db_ops — confirm their meaning there.
query = f'''
SELECT *
FROM brand'''
brand_df = db_ops.select_to_pandas(query, True, True)
brand_df.head(3)

  df = pd.read_sql_query(query, connection)


Unnamed: 0,brand_id,brand_name,score,percent,tot_images
0,111,Unknown,0.0,,
1,35,dr_pepper,0.43366,,
2,29,coca_cola,3.77853,,


In [45]:
# Brand names that the next cell checks against brand_df['brand_name'].
names = [
    'amstel_light', 'aquafina', 'bud_light', 'budweiser', 'burger_king',
    'cheetos', 'coca_cola', 'coors', 'corona', 'dasani',
    'deer_park', 'doritos', 'dr_pepper', 'dunkin', 'fanta',
    'fireball', 'gatorade', 'great_value', 'guinness', 'heineken',
    'heinz', 'kirkland', 'lays', 'marlboro', 'mccafe',
    'mcdonalds', 'michelob', 'miller_light', 'modelo', 'monster',
    'mountain_dew', 'natural_light', 'nestle_pure_life', 'newport', 'niagara',
    'pepsi', 'powerade', 'red_bull', 'reeses', 'seven_eleven',
    'snickers', 'sprite', 'starbucks', 'stella', 'white_claw',
]

In [52]:
# Print every name from `names` that has no matching brand_name in brand_df.
# The extraction loop looks brands up by name, so anything printed here would
# fail that lookup. No output means every name is present.
known_brand_names = set(brand_df['brand_name'])  # built once, not once per name
for name in names:
    if name not in known_brand_names:
        print(name)

#### Load the YOLO logo-detection model

In [6]:
# Weights file for the YOLO model that the box-extraction loop runs on each image.
yolo_weights_path = '/home/ubuntu/box_model/logo-yolo.pt'
model = YOLO(yolo_weights_path)

#### Copy the image from the original S3, extract the boxes

In [8]:
# Box extraction. For every row of image_uri:
#   1. download the original image from S3 into a local scratch folder,
#   2. run the YOLO model on the copy returned by im_utils.ScaleImage(width = 640),
#   3. map each detected box back onto the original image, crop it, save it and
#      upload the crop to the processed bucket under boxes_mult/,
#   4. replace the image's rows in the "box" table with the new detections.
# A failure on one image is reported and the loop moves on to the next image.

# Local scratch space: originals go into tmp_im_folder, scaled copies and box
# crops into its 'scaled' sub-folder.
tmp_im_folder = '/home/ubuntu/temp-images'
scaled_folder = os.path.join(tmp_im_folder, 'scaled')
# cv2.imwrite does not create missing directories, so make sure they exist.
os.makedirs(scaled_folder, exist_ok=True)

box_columns = ['image_id', 'brand_id', 'cat_id', 'xmin_coord', 'ymin_coord',
               'xmax_coord', 'ymax_coord', 'box_pic_uri', 'conf']

# brand_name -> brand_id lookup, built once instead of filtering brand_df for
# every box. drop_duplicates keeps the first row for a repeated name.
brand_id_by_name = (
    brand_df.drop_duplicates(subset='brand_name')
            .set_index('brand_name')['brand_id']
            .to_dict()
)


def _run_cli(args):
    """Run an external command given as an argument list (no shell involved).

    Raises subprocess.CalledProcessError when the command exits non-zero, so a
    failed download or upload is not silently ignored.
    """
    subprocess.run(args, check=True)


for original_im_uri, raw_image_id in zip(image_uri['uri'], image_uri['image_id']):
    print('-'*30)
    try:
        # Plain int keeps the value safe to interpolate into the DELETE below.
        image_id = int(raw_image_id)

        # Derive the file names and S3 locations from the original URI.
        im_file = original_im_uri.split('/')[-1]
        im_stem = im_file.split('.')[0]
        local_original_path = os.path.join(tmp_im_folder, im_file)
        scaled_im_local_path = os.path.join(scaled_folder, im_stem + '.jpg')
        # Same folder layout as the source bucket, under <processed bucket>/boxes_mult/.
        boxes_s3_path = original_im_uri.replace('-pics-s3', '-processed-s3/boxes_mult').rsplit('/', 1)[0] + '/'

        # Download the image from the olm bucket, scale it and write the scaled copy locally.
        _run_cli(['aws', 's3', 'cp', original_im_uri, local_original_path])

        image_original = cv2.imread(local_original_path)
        if image_original is None:
            # cv2.imread signals an unreadable / missing file by returning None.
            raise ValueError(f"could not read downloaded image {local_original_path}")
        image_scaled = im_utils.ScaleImage(image_original, width = 640)
        cv2.imwrite(scaled_im_local_path, image_scaled)

        # Run YOLO inference on the scaled image.
        results = model(image_scaled)
        detections = results[0]
        boxes = detections.boxes

        # Factors that map coordinates on the scaled image back to the original.
        orig_h, orig_w = image_original.shape[:2]
        w_ratio = orig_w/image_scaled.shape[1]
        h_ratio = orig_h/image_scaled.shape[0]

        box_records = []
        for box_idx in range(len(boxes.cls)):  # Iterate over each box
            # Box corners on the original image, clamped to the image bounds.
            xmin, ymin, xmax, ymax = map(int, boxes.xyxy[box_idx])
            xmin = min(max(int(xmin * w_ratio), 0), orig_w)
            xmax = min(max(int(xmax * w_ratio), 0), orig_w)
            ymin = min(max(int(ymin * h_ratio), 0), orig_h)
            ymax = min(max(int(ymax * h_ratio), 0), orig_h)

            # Resolve the brand before doing any I/O for this box.
            brand_name = detections.names[int(boxes.cls[box_idx])]
            if brand_name not in brand_id_by_name:
                raise KeyError(f"brand '{brand_name}' not found in brand_df")

            # Get the section of the original image that is in the box.
            box_im = image_original[ymin:ymax, xmin:xmax, :]
            if box_im.size == 0:
                # A zero-area crop cannot be written as an image; skip this box only.
                print(f"skipping empty box {xmin}-{ymin}-{xmax}-{ymax}")
                continue

            # Save the box locally as an individual file.
            postfix = f"-{xmin}-{ymin}-{xmax}-{ymax}"
            box_file = im_stem + postfix + '.jpg'
            box_local_path = os.path.join(scaled_folder, box_file)
            if not cv2.imwrite(box_local_path, box_im):
                raise IOError(f"could not write box image {box_local_path}")

            # Upload the box file to the processed s3 bucket; only boxes whose
            # upload succeeded get a database row.
            _run_cli(['aws', 's3', 'cp', box_local_path, boxes_s3_path])

            box_records.append({
                'image_id': image_id,
                'brand_id': int(brand_id_by_name[brand_name]),
                'cat_id': 0,
                'xmin_coord': xmin,
                'ymin_coord': ymin,
                'xmax_coord': xmax,
                'ymax_coord': ymax,
                'box_pic_uri': boxes_s3_path + box_file,
                'conf': float(boxes.conf[box_idx]),
            })

        # One DataFrame built from all records (object dtype, plain Python values).
        boxes_df = pd.DataFrame(box_records, columns = box_columns, dtype = object)

        #Delete old boxes for this image
        query = f'''
        DELETE FROM box
        WHERE image_id = {image_id}
        '''
        db_ops.run_single_query(query)
        #Update box table with the new boxes
        db_ops.add_df(boxes_df, "box")
    except Exception as e:
        # Best effort: report which image failed and carry on with the next one.
        print(f"Failed on {original_im_uri}: {type(e).__name__}: {e}")
    finally:
        #Clean up temp folders, also after a failure, so files do not pile up
        subprocess.run(['find', tmp_im_folder, '-maxdepth', '2', '-type', 'f', '-delete'], check=False)

------------------------------
download: s3://olm-pics-s3/112/Pftq4550WzgZA5euhgIEsLHGs63tXSHAqCP07WKl.jpeg to ../../temp-images/Pftq4550WzgZA5euhgIEsLHGs63tXSHAqCP07WKl.jpeg

0: 480x640 (no detections), 69.7ms
Speed: 5.9ms preprocess, 69.7ms inference, 71.9ms postprocess per image at shape (1, 3, 480, 640)
------------------------------
download: s3://olm-pics-s3/112/HKXafwXvrpBQjoWC9aPtSY8lTGcdoZVlPItqNQkK.jpeg to ../../temp-images/HKXafwXvrpBQjoWC9aPtSY8lTGcdoZVlPItqNQkK.jpeg

0: 640x480 1 doritos, 74.2ms
Speed: 2.3ms preprocess, 74.2ms inference, 714.7ms postprocess per image at shape (1, 3, 640, 480)
upload: ../../temp-images/scaled/HKXafwXvrpBQjoWC9aPtSY8lTGcdoZVlPItqNQkK-630-1695-864-1902.jpg to s3://olm-processed-s3/boxes_mult/112/HKXafwXvrpBQjoWC9aPtSY8lTGcdoZVlPItqNQkK-630-1695-864-1902.jpg
INSERT INTO box (image_id,brand_id,cat_id,xmin_coord,ymin_coord,xmax_coord,ymax_coord,box_pic_uri,conf) VALUES ('2','34','0','630','1695','864','1902','s3://olm-processed-s3/boxes_mult/112