In [8]:
import os
import sys
from pathlib import Path
from datetime import datetime

import boto3
from botocore.exceptions import ClientError

import json
import math

import pandas as pd

config = []
DEFAULT_FILENAME = "image_urls.csv"
path = Path(os.getcwd())

In [2]:
#load json config
config_file_path = os.path.join(path.parent.parent,"config", "config.json")
if os.path.exists(config_file_path) :
    with open(config_file_path, "r") as jsonfile:
        config = json.load(jsonfile)
        print(config)
        assert config, "Config JSON has errors or is empty."

{'overlap': 0.2, 'tile_size': 400, 'class_thumbnail_sizes': {'bus': 171, 'van_rv': 127, 'small': 101, 'specialized': 111, 'truck': 223, 'trailer_large': 219, 'trailer_small': 101, 'unknown': 127}, 'marker_color': [200, 200, 200, 255], 'point_color': [255, 0, 0], 'bb_color': [255, 0, 0], 'bucketname': 'mturk-s3-cg', 'classify_images_folder': 'task2-images/LINZ/', 'bb_images_folder': 'task1-images/LINZ/', 's3_url': 'https://mturk-s3-cg.s3.amazonaws.com/'}


In [3]:
class_thumbnail_sizes = config["class_thumbnail_sizes"]
marker_color = tuple(config["marker_color"])
point_color = tuple(config["point_color"])
bb_color = config["bb_color"]
tile_size = config["tile_size"]
overlap = config["overlap"]
stride = int(tile_size * (1-overlap))
bucketname = config["bucketname"]
classify_images_folder = config["classify_images_folder"]
s3_url = config["s3_url"]

In [4]:
def get_all_objects(bucketname: str):
    """Get all objects from s3. This method assumes AWS S3 configuration has been successful and looks for a folder ".aws"

    Args:
        bucketname (str): Enter the bucket name you want to lookup
    """
    s3r = boto3.resource('s3')
    bucket = s3r.Bucket('mturk-s3-cg')
    fileslist = list(bucket.objects.all())
    return fileslist

def filter_objects(bucketname:str,fileextension:str, folderpath:str):
    """Filter a bucket and look for a specific folder path. This method assumes AWS S3 configuration has been successful and looks for a folder ".aws"

    Args:
        bucketname (str): Bucketname to look for in S3
        fileextension (str): file extension to filter for
        folderpath (str): folderpath to retrieve all file objects

    Returns:
        [type]: [description]
    """
    if fileextension == "":
        fileextension = ".png"
    files_in_bucket = get_all_objects(bucketname)
    fileslist = [s3_url+f.key for f in files_in_bucket if folderpath in f.key and ".png" in f.key and "task2-images/LINZ/mar10/" not in f.key and "task2-images/LINZ/examples/" not in f.key]
    print("Number of image urls in folder : %s with %s is %d."%(bucketname+"/"+folderpath, fileextension, len(fileslist)))
    return fileslist

In [5]:
def generate_images_urls(num_columns=20, fileslist=[], csvfilename=DEFAULT_FILENAME):
    """Generate Image urls csv file dependeing on length of each column.

    Args:
        num_columns (int, optional): [description]. Defaults to 20.
        fileslist (list, optional): [description]. Defaults to [].
        csvfilename ([type], optional): [description]. Defaults to DEFAULT_FILENAME.
    """
    assert num_columns > 0, "Cannot generate image urls for ZERO columns."
    if os.path.exists(os.path.join(os.getcwd(),csvfilename)):
        print("The file name %s already exists. The file will be renamed by this method."%csvfilename)

    #duplicate filename ? rename it.
    if os.path.exists(os.path.join(os.getcwd(),csvfilename)):
        filename, ext = os.path.splitext(csvfilename)
        csvfilename = filename+datetime.now().strftime('%Y%m%d')+ext
    
    #How many image urls in each column ?
    columnlength = int(len(fileslist)/num_columns)
    hitimages = {}
    count = 1
    adjusted_column_length = 0

    #generate image urls and adjust the few last remaining leftout imageurls to same length for csv
    for i in range(0,len(fileslist),columnlength):
        #print(len(fileslist[i:i+columnlength]),"image"+str(count)+"_url")
        hitimages["image"+str(count)+"_url"] = fileslist[i:i+columnlength]
        k = "image"+str(count)+"_url"
        if len(hitimages[k]) < columnlength:
            emptydata_for_csv = [" " for i in range(len(hitimages[k]), columnlength)]
            hitimages[k].extend(emptydata_for_csv) #O(n) - usually for last column. total columnlength - actual number of image urls left after distributing in all previous columns.
            print("Extended length is now %d for %s"%(len(hitimages[k]), str(k)))
            adjusted_column_length += len(emptydata_for_csv)
        count += 1

    assert len(fileslist)+adjusted_column_length == (count-1)*columnlength, "Incorrect length. Please check."

    #convert to csv and save
    result = pd.DataFrame.from_dict(hitimages)
    result.to_csv(os.path.join(os.getcwd(),csvfilename), index=False)
    assert os.path.exists(os.path.join(os.getcwd(),csvfilename)), "Unable to create CSV file. Please check and try again."
    print("******************************************************************************************")
    print("Image urls are saved to csv file : %s" %(os.path.join(os.getcwd(),csvfilename)))

In [10]:
def start(bucketname: str, folderpath:str, fileextension=".png", num_columns=20, csvfilename=DEFAULT_FILENAME):
    """Given an S3 bucket name, bucket's folderpath, fileextension to filter, we filter and get files (that are keys of objects with path).
        Create a csv file from the urls. This ensures only the available, valid, available uploaded files to S3. 

    Args:
        bucketname (str): bucketname you want to look up in
        folderpath (str): folder path within the bucketname to look for
        fileextension (str, optional): File extension to filter . Defaults to ".png".
        num_columns (int, optional): This is delimited by number of images in a single HIT. Defaults to 20.
        csvfilename ([type], optional): target csv file to save all urls (since we use it for bulk HIT creation on MTurk). Defaults to DEFAULT_FILENAME.
    """
    filtered_files_list = filter_objects(bucketname, fileextension, folderpath)
    generate_images_urls(num_columns,filtered_files_list,csvfilename )


In [12]:
# Call start method. There is a config file to edit and select more preferences
start(bucketname,"task1/batch100/", ".png", 25,  "selwyn_batch_100_image_urls.csv")

Number of image urls in folder : mturk-s3-cg/task1/batch100/ with .png is 212.
Extended length is now 8 for image27_url
******************************************************************************************
Image urls are saved to csv file : C:\Users\exx\Documents\lab\code\classification\notebooks\selwyn_batch_100_image_urls.csv
