### This script is for generating slices from main images for Selwyn dataset for Mechanical Turk labelling tasks, based on Sliding window with overlap.
### Generates sub images, save only images where vehicles exist and validates and saves images and their corresponding image, vehicle, class name and location details to respective imagename.csv files.

In [1]:
from PIL import Image
import numpy as np
from pathlib import Path
import cv2
import os
import re
import matplotlib.pyplot as plt
import glob
import urllib.request, json
from io import BytesIO
import pandas as pd
import requests
from urllib.parse import urlparse
from numpy.lib.function_base import append

Image.MAX_IMAGE_PIXELS = None

### Validate slices generated from sliding window with overlap.

In [2]:
def validate_slices(output_directory : str, num_xy_points:tuple, window_size:tuple) -> bool :
    print(num_xy_points)
    assert os.path.exists(output_directory), "Output directory does not exist %s. Please check and try again."%str(output_directory)
    assert len(num_xy_points) > 0 and all(num_xy_points) and num_xy_points.count(0) == 0, "Number of point tuple is 0 or None. Please check and try again."
    assert len(window_size) > 0 and all(window_size) and window_size.count(0) == 0, "Window size tuple is empty is empty %d. Please check and try again."%window_size
    
    #validate exported files, format, count of files generated 
    #files = os.listdir(output_directory)
    
    files = glob.glob(os.path.join(output_directory,"*.png"))
    files.sort(key=os.path.getmtime, reverse=False)
    print(output_directory, len(files))
    saved_files = [file for file in files if re.search(r"\d+(-)\d+(-)[0-9]+[0-9]*(-)+[0-9]+[0-9]*(.tif|.png)",str(file))]    
    tot_points = num_xy_points[0]*num_xy_points[1]
    #check all files with tif since this method generated rows * count images and filenames
    print(len(saved_files), tot_points)
    assert len(saved_files) == tot_points, "Missing or incorrect number of cropped sub-images. Check exported sub-images!"
    print("Exported sub-images count look good!")
    
def is_url(image_url):
    if type(image_url) != str:
        return False
    try:
        response = requests.get(image_url)
        print("URL is valid and exists on the internet ", response.ok)
    except requests.ConnectionError as exception:
        print("URL does not exist on Internet")
        return False
    return True

### Validate image slices with vehicles

In [3]:
def validate_slices_with_vehicles(output_directory : str, tot_images:int, window_size:tuple) -> bool :
    print(tot_images)
    assert os.path.exists(output_directory), "Output directory does not exist %s. Please check and try again."%str(output_directory)
    assert len(window_size) > 0 and all(window_size) and window_size.count(0) == 0, "Window size tuple is empty is empty %d. Please check and try again."%window_size
    #validate exported files, format, count of files generated 
    
    #files = os.listdir(output_directory)
    files = glob.glob(os.path.join(output_directory,"*.png"))
    files.sort(key=os.path.getmtime, reverse=False)
    
    print(output_directory, len(files))
    saved_files = [file for file in files if re.search(r"\d+(-)\d+(-)[0-9]+[0-9]*(-)+[0-9]+[0-9]*(.tif|.png)",str(file))]    
    
    #check all files with tif since this method generated rows * count images and filenames
    print(len(saved_files), tot_images)
    assert len(saved_files) == tot_images, "Missing or incorrect number of cropped sub-images. Check exported sub-images!"
    print("Exported sub-images count look good!")
    return True

### Given window size, length of side, overlap, generate points along the given length

In [4]:
def get_slide_points(actual_size:int, window_size:int, overlap=0.1) -> list():
    """This function is to use a sliding window technique, is aka brute force still (without using numpy.slide_tricks/as_strided or scipy shortcuts).
    Returns points array for a given axis : points along X-axis i.e along the width of the image or y-axis i.e along height of the image
    The strides along x-axis remain same as sliding window moves to different rows along y-axis and viceversa.
    So we just need one pass along x-axis or y-axis.

    Args:
        actual_size (int): [actual size of the image so strides do not overrun the bounds]
        window_size (int): [size of window for cropping later, is always less than actual_size]
        overlap (float, optional): [Fraction of overlap for a stride along the axis. Defaults to 0.1]

    Returns:
        [List(int)]: [returns points array along a given axis length for each stride]
    """
    assert actual_size > 0, "Incorrect size provided for parameter 1. Please check and try again."
    assert window_size > 0, "Incorrect size provided for parameter 2. Please check and try again."
    assert overlap > 0 and type(overlap) == float , "Incorrect overlap value provided for parameter 3. It has to be  Please check and try again."

    points = [0]
    assert actual_size > window_size and actual_size > 0 and window_size > 0, "Incorrect actual size and window size. Please check function input params definitions."
    #for the purpose of ensuring overlap, calculate stride from window size and overlap so stride is within the window size we want to crop
    stride = int(window_size * (1-overlap))
    counter = 1
    #initialize and fill first point and append it
    #first point is point at end of first stride
    this_point = stride * counter
    points.append(this_point)
    counter += 1
    #start from second stride onwards and check if the point is running over the actual size
    while this_point + window_size < actual_size:
        #move on to next stride
        this_point = stride * counter
        points.append(this_point)
        counter += 1
    #loop ended - since this_point is overrunning the window size, so instead just add actual size - window size as this_point 
    points.append(actual_size-window_size)
    assert len(points) == counter+1, "Incorrect number of strides."
    return points

### For each annotations, image file pair, generate a dictionary for with numpy array locations for search vehicles from an Image slice.

In [5]:
def add_answers(annotationsfile,image_file):
    answer_dict = []
    class_loc = []
    with open(annotationsfile) as json_file:
        data = json.load(json_file)
    for key, values in data['locations'].items():
        xy = []
        x= [] 
        y=[]
        if len(values) != 0:
            for value in values:
                x.append(int(value['x']))
                y.append(int(value['y']))
               
            class_loc.append({key:[np.array(x), np.array(y)]})#class_loc.append({key:np.array(xy)})
        else:
            class_loc.append({key:[]})
    return class_loc, answer_dict

### Get annotations that exist in a given cropped image and return vehicle class locations dictionary

In [6]:
def get_locations_from_image(i:int,j:int,class_loc:list, windowsize:int,newimage_name):
    vehicles_loc = {}
    vehicles_loc['subimagename'] =newimage_name
    count = 0
    #i rows, j columns
    for locset in class_loc:
        k,arr = *locset.keys(), *locset.values()
        if len(arr) != 0:
            x,y = arr

            x2 = np.where((x >= i) & (x<= i+300))[0]
            y2 = np.where((y >= j) & (y <= j+300))[0]
            ind = np.intersect1d(x2,y2)
            
            common_pts = []
            if len(ind) > 0:
                #print(ind,x[ind],y[ind],newimage_name,k)
                common_pts = [x[ind].tolist(),y[ind].tolist()]
            if len(common_pts) > 0:
                vehicles_loc[k]=common_pts
                count += len(ind)
                #print(newimage_name, x,y,i,j,arr[:,0:1],arr[:,1:],common_pts,arr[:,:][common_pts].tolist())
                #print(arr[:,0:1], i,i+windowsize, (arr[:,0:1] >= i) & (arr[:,0:1] <= i+windowsize)[0],arr[:,1:], j, j+windowsize,(arr[:,1:] >= j) & (arr[:,1:] <= j+windowsize)[1],vehicles_loc[k])
    if len(vehicles_loc.keys()) == 1:
        vehicles_loc = None
    else:
        vehicles_loc['num_vehicles'] = count
    return vehicles_loc

In [7]:
classes = ["van_rv","unknown","truck","bus","trailer_small","specialized","trailer_large","small"]

### Generate images with Sliding window with overlap while looking for vehicle annotations that exist in each sub image

In [8]:
def sliding_window_with_overlap( image_path,class_loc, output_directory,window_width:int, overlap=0.1 ):
    answers = []
    filename, filext = os.path.splitext(os.path.basename(image_path) )
    #print(filename)
    assert filename and filext, "Please check filename and file extension!"
    im = cv2.imread(str(image_path))
    
    #output_directory = os.path.join(os.getcwd(),"temp")
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)
    print(output_directory)
        
    #print(len(class_loc))
    assert im.shape, "Image file could not be opened! Please check the image details and try again."
    imgheight, imgwidth, _ = im.shape
    print(imgheight, imgwidth)
    window_width = window_width
    window_height = window_width #the window height as per aspect ratio of the image, we only give a window width. int(window_width * imgheight/imgwidth)

    assert window_width > 0 and window_height > 0, "Window height and width must be greater than 0. Please check rows and columns values."
    h,w,_ = im.shape
        
    #get points of each stride
    x_points = get_slide_points(imgwidth, window_width, overlap=overlap)
    y_points = get_slide_points(imgheight, window_height, overlap=overlap)
    print(len(x_points))
    print(len(y_points))
    count = 0
    for x_counter, i in enumerate(y_points, 1):
        for y_counter,j in enumerate(x_points, 1):
            
            # when generating images without locations, uncomment following code 
            #cropped = im[i:i+window_height, j:j+window_width]
            #assert cropped.shape, "crop failed, please check again"
            #print(i,j,x_counter,y_counter)
            #print(len(vehicle_locs.values()))
            #try:

                #cv2.imwrite(os.path.join(output_directory, newimage_name), cropped)
                #assert os.path.exists(os.path.join(output_directory, newimage_name)), "Cropped image "+newimage_name+" not saved!"
#             except Exception:
#                 raise(Exception())
            file = filename.replace("_image","").replace("_","-")
            newimage_name = file+"-"+str(j)+"-"+str(i)+".png"
            vehicle_locs = get_locations_from_image(i,j,class_loc, window_width,newimage_name)
            
            # Cropped images only if vehicle locations exist for a given cropped image
            if vehicle_locs:
                answers.append(vehicle_locs)
                #print(vehicle_locs, answers)
                count += 1
                cropped = im[i:i+window_height, j:j+window_width]
                assert cropped.shape, "crop failed, please check again"
                cv2.imwrite(os.path.join(output_directory, newimage_name), cropped)
                assert os.path.exists(os.path.join(output_directory, newimage_name)), "Cropped image "+newimage_name+" not saved!"
                #print(i,j,x_counter,y_counter)
                #print(len(vehicle_locs.values()))
                #print(vehicle_locs)
    is_valid = validate_slices_with_vehicles(output_directory, count, (window_width, window_height))
    assert is_valid, "Validation failed for cropped sub-images! Please check and try again."
    return answers

### The following cell is for Selwyn dataset 

In [9]:
outputdirname = "task2"
path = Path(r'C:\Users\exx\Documents\lab')

#get new folder's storage entry folder, there on, folders likely similar ?
filedir = os.path.join(os.path.join(r'C:\Users\exx\Documents\lab'), "LINZ","Final","001_selwyn-0125m-urban-aerial-photos-2012-2013")

main_folders = [os.path.join(filedir,name) for name in os.listdir(filedir)]
for f in main_folders:
    this_key = None
   
    for root, dir, files in os.walk(f):
        #ignore ipynb_checkpoints folders
        if re.search(".ipynb_checkpoints",os.path.basename(Path(root))):
            continue
        if len(dir) != 0:
            #check if Annotation galleries exist
            galleries = [ d for d in dir if re.search("\d+\sFinal Annotation Galleries",str(d))]
            #check if main image file exists 000X_000X_image.png exists ?
            image_files = [ file for file in files if re.search("\d+(_)\d+(_)image.png",file.lower())]
            annotation_files = [ file for file in files if re.search("\d+(_)\d+(_)annotations.json",file)]
        #if dir is empty, we continue
        #valid folders and files exists ?
        if len(galleries) > 0 and len(image_files) > 0 and len(annotation_files)>0: #and "0001_0001" in image_files[0]:
            
            #print(root)
            imagefilepath = Path(root).joinpath( image_files[0])
            print(imagefilepath)
            
            foldername = str(os.path.splitext(os.path.basename(imagefilepath))[0]).replace("_image","")
            annotationsfile = Path(root, annotation_files[0])
            
            class_loc, answerdict = add_answers( annotationsfile, image_files[0])
            print(foldername)
            #print(class_loc, answerdict)
            
            if not os.path.exists(os.path.join(path, outputdirname)):
                os.mkdir(os.path.join(path, outputdirname))
                
            outputdirpath = os.path.join(path, outputdirname, "test4")
            if not os.path.exists(outputdirpath):
                os.mkdir(outputdirpath)
            p = os.path.join(outputdirpath, foldername)
            if not os.path.exists(p):
                os.mkdir(p)
           # print("p",p,annotationsfile)
            
            answers = sliding_window_with_overlap(imagefilepath,class_loc, p, window_width=300, overlap=0.2)
            #print(answers)
            
            result = pd.DataFrame.from_dict(answers)
            result.to_csv(os.path.join(outputdirpath,foldername+".csv"), index=False)

C:\Users\exx\Documents\lab\LINZ\Final\001_selwyn-0125m-urban-aerial-photos-2012-2013\0001\0001\0001_0001_image.png
0001_0001
C:\Users\exx\Documents\lab\task2\test4\0001_0001
11141 24174
102
48
14
C:\Users\exx\Documents\lab\task2\test4\0001_0001 14
14 14
Exported sub-images count look good!
C:\Users\exx\Documents\lab\LINZ\Final\001_selwyn-0125m-urban-aerial-photos-2012-2013\0001\0002\0001_0002_image.png
0001_0002
C:\Users\exx\Documents\lab\task2\test4\0001_0002
13134 23314
98
56
31
C:\Users\exx\Documents\lab\task2\test4\0001_0002 31
31 31
Exported sub-images count look good!
C:\Users\exx\Documents\lab\LINZ\Final\001_selwyn-0125m-urban-aerial-photos-2012-2013\0001\0003\0001_0003_image.png
0001_0003
C:\Users\exx\Documents\lab\task2\test4\0001_0003
12441 14432
61
53
61
C:\Users\exx\Documents\lab\task2\test4\0001_0003 61
61 61
Exported sub-images count look good!
C:\Users\exx\Documents\lab\LINZ\Final\001_selwyn-0125m-urban-aerial-photos-2012-2013\0001\0004\0001_0004_image.png
0001_0004
C:\U