In [None]:
from __future__ import print_function

from osgeo import gdal
import rasterio
import pandas as pd
import os 
import boto3

In [None]:
# Create the local working folders for the Niihau tiles and annotations.
# NOTE(review): these paths are relative to the current working directory, while
# later cells write under root_folder ("/tmp") -- confirm the notebook is started
# from that directory.
for folder in ("images/niihau", "niihau"):
    if os.path.isdir(folder):
        print("folder already exists")
    else:
        # 0o755 is accepted by Python 2.6+ and Python 3; the bare 0755 literal is a
        # syntax error on Python 3. Real failures (e.g. permissions) now propagate
        # instead of being reported as "folder already exists".
        os.makedirs(folder, 0o755)


In [None]:

# Base directory for every file this notebook downloads or generates.
root_folder = os.getcwd()
# The working-directory value above is overridden immediately: /tmp is what is used.
root_folder = "/tmp"
print(root_folder)

In [None]:
# --- S3 session and notebook-wide settings ---
# not_default_keys and version are not read anywhere in the visible notebook.
not_default_keys = True

# boto3 is imported in the first cell; here only the credentials profile is selected.
boto3.setup_default_session(profile_name='hawaii')
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
version='v2'

bucket_name = 'hawaii-marine-debris'
file_path_template = "{}/jpg/{}"
downloaded_tile = set()
csv_rows = []
islands= ['niihau']#'lanai','bigisland',''
local_tmp_folder = "/tmp/{}"

# (sic) the misspelled name is kept in case other code refers to it.
annotatino_s3key_path = "{island}/annotations.csv"
geojson_keys = [
    "niihau/final_marine_debris_database_NI_UTM4N_boxes.geojson",
    "niihau/niihau_tileindex.geojson",
]

# Download every geojson listed above into root_folder under its own file name,
# so the list is the single place where the keys are spelled out.
for geojson_key in geojson_keys:
    s3_resource.Bucket(bucket_name).download_file(
        geojson_key,
        "{}/{}".format(root_folder, os.path.basename(geojson_key)))

# De-duplicated annotations are stored locally as niihau/annotations.csv.
s3_resource.Bucket(bucket_name).download_file("niihau/deduped_annotations.csv", "{}/niihau/annotations.csv".format(root_folder))



In [None]:
# Fetch the CSV form of the tile index from S3 into root_folder.
tile_index_bucket = s3_resource.Bucket(bucket_name)
tile_index_bucket.download_file(
    "niihau/niihau_tileindex.csv",
    "{}/niihau_tileindex.csv".format(root_folder),
)


In [None]:
# Load the tile index table and display how many rows it has.
tile_index_csv_path = "{}/niihau_tileindex.csv".format(root_folder)
tile_index_df = pd.read_csv(tile_index_csv_path)
len(tile_index_df)

In [None]:
# Refresh the local copy of the de-duplicated annotations and load it.
annotations_csv_path = "{}/niihau/annotations.csv".format(root_folder)
s3_resource.Bucket(bucket_name).download_file("niihau/deduped_annotations.csv", annotations_csv_path)

existing_annotation_df = pd.read_csv(annotations_csv_path)

# Derive a local file path from each S3 key by regex-substituting the
# "niihau/jpg" prefix.
# NOTE(review): the replacement points at <root>/niihau/images, whereas the tile
# cells store images under <root>/images/niihau -- confirm which layout is meant.
local_image_prefix = '{}/niihau/images'.format(root_folder)
existing_annotation_df['file_path'] = existing_annotation_df['s3_key'].replace(
    {'niihau/jpg': local_image_prefix}, regex=True)
existing_annotation_df.head(5)

In [None]:
def download_from_s3(s3_key):
    """Download ``s3_key`` from the notebook's bucket to the local tmp folder.

    The local path is the ``local_tmp_folder`` template filled with the key's
    base name. Returns that local path.
    """
    destination = local_tmp_folder.format(os.path.basename(s3_key))
    source_bucket = s3_resource.Bucket(bucket_name)
    source_bucket.download_file(s3_key, destination)
    return destination

def convert_tif_to_jpeg(tiff_image_path):
    """Re-encode a raster file as a JPEG written next to it.

    Args:
        tiff_image_path: path of the source raster (opened with rasterio).

    Returns:
        Path of the JPEG that was written: the source path with its file
        extension replaced by ``.jpg``.
    """
    # Swap only the trailing extension. str.replace(".tif", ".jpg") also rewrote
    # ".tif" inside directory names, turned ".tiff" into ".jpgf", and left the path
    # untouched for e.g. ".TIF" (so the source file itself was opened for writing).
    jpg_path = os.path.splitext(tiff_image_path)[0] + ".jpg"

    # The context manager closes the source dataset; it used to be left open.
    with rasterio.open(tiff_image_path) as src:
        data = src.read()
        profile = src.profile

    # casting='unsafe' allows narrowing to 8 bits; no rescaling is applied here.
    result = data.astype(rasterio.uint8, casting='unsafe', copy=False)
    # Declare the output dtype to match the array that is actually written.
    profile.update(driver='jpeg', dtype=rasterio.uint8)
    with rasterio.open(jpg_path, 'w', **profile) as dst:
        dst.write(result)
    return jpg_path
                
def upload_to_s3(local_file_path, s3_path):
    """Upload a local file to the notebook's bucket under the key ``s3_path``."""
    s3_client.upload_file(Filename=local_file_path, Bucket=bucket_name, Key=s3_path)

def find_min_xy(coordinates):
    """Return the bounding box ``(x0, y0, x1, y1)`` of a sequence of points.

    Each point is indexable, with x at position 0 and y at position 1.
    ``(x0, y0)`` is the minimum corner and ``(x1, y1)`` the maximum corner.
    """
    xs = [point[0] for point in coordinates]
    ys = [point[1] for point in coordinates]
    return min(xs), min(ys), max(xs), max(ys)


def get_pixel_coordinates(image_path, coordinates):
    """Convert a georeferenced bounding box to pixel offsets within a raster.

    Only the origin and pixel-size terms of the geotransform are used; the
    rotation terms (indices 2 and 4) are ignored.

    Args:
        image_path: path of a raster that GDAL can open.
        coordinates: ``(x0, y0, x1, y1)`` in the raster's georeferenced units.

    Returns:
        ``(x0p, y0p, x1p, y1p)`` as ints (``int()`` truncation).

    Raises:
        IOError: if GDAL cannot open ``image_path``.
    """
    dataset = gdal.Open(image_path)
    if dataset is None:
        # gdal.Open returns None on failure unless GDAL exceptions are enabled.
        raise IOError("GDAL could not open {}".format(image_path))

    # Only the geotransform is needed. The previous version also read the whole
    # first band into memory and discarded it.
    transform = dataset.GetGeoTransform()

    xOrigin = transform[0]
    yOrigin = transform[3]
    pixelWidth = transform[1]
    pixelHeight = -transform[5]

    x0, y0, x1, y1 = coordinates
    x0p = int((x0 - xOrigin) / pixelWidth)
    # NOTE(review): rows are measured downward from yOrigin, so with a positive
    # pixelHeight and y0 < y1 the returned y0p is larger than y1p -- confirm that
    # consumers of the CSV expect this ordering.
    y0p = int((yOrigin - y0 ) / pixelHeight)
    x1p = int((x1 - xOrigin) / pixelWidth)
    y1p = int((yOrigin - y1 ) / pixelHeight)
    return x0p, y0p, x1p, y1p

### Parse geojson 

In [None]:
#download all necessary geojson files:


In [None]:
import json
from pprint import pprint

# Debris annotations (original note: "list of all debris annotated from UoH").
debris_geojson_path = "{}/final_marine_debris_database_NI_UTM4N_boxes.geojson".format(root_folder)
with open(debris_geojson_path) as debris_file:
    geo_json = json.load(debris_file)
debris_boxes = geo_json['features']

# Tile index; its features are iterated as tiles further down.
tiles_geojson_path = "{}/niihau_tileindex.geojson".format(root_folder)
with open(tiles_geojson_path) as tiles_file:
    tiles_geo_json = json.load(tiles_file)
tiles_features = tiles_geo_json['features']


In [None]:
# Collect the distinct values of properties['island'] found in the debris features.
# This rebinds `islands`, which the settings cell defined as a list.
islands = set()
# The loop variable `d` stays bound at notebook scope after the loop, so later
# cells can see it (hidden state) -- it is deliberately left unrenamed.
for d in debris_boxes:
    islands.add(d['properties']['island'])
print(islands)

In [None]:
#filter out specific data that we are looking for


target_islands = ["NI"]
def filter_debris_per_islands(target_islands=None, debris=None):
    """Return the debris features whose island is one of ``target_islands``.

    Args:
        target_islands: island values compared against each feature's
            ``properties['island']``. ``None`` (the default) selects nothing.
        debris: features to filter. Defaults to the notebook-level
            ``debris_boxes`` list.

    Returns:
        List of the matching features, in input order.

    Side effects:
        Prints a line for every feature whose ``properties['type']`` is not one
        of the known category letters, then the set of types seen, then the
        number of features selected.
    """
    objects = ['T','V','N','B','M','P','C','L','F','W','O']
    # Avoid a mutable default argument; None means "no islands".
    if target_islands is None:
        target_islands = []
    if debris is None:
        debris = debris_boxes

    geo_json_in_search = []
    set_of_objs = set()
    for d in debris:
        if d['properties']['type'] not in objects:
            print("unknown object {}".format(d['properties']['type']))
        if d['properties']['island'] in target_islands:
            geo_json_in_search.append(d)
        set_of_objs.add(d['properties']['type'])

    # The set is local to this function, so it has to be reported from here.
    print(set_of_objs)
    print("# of debris :{}".format(len(geo_json_in_search)))
    return geo_json_in_search
# Run the filter for the islands of interest. Later cells read geo_json_in_search
# at notebook scope, so it has to be bound here. (The former print(set_of_objs)
# raised NameError: that set only exists inside the function.)
geo_json_in_search = filter_debris_per_islands(target_islands)
# Debris categories:
# B = Buoys and floats
# C = Cloth
# F = Foam
# L = Line (single pieces of rope, not net)
# M = Metal
# N = Net
# P = Plastic
# T = Tire
# W = Processed wood
# V = Vessel
# O = Other

In [None]:
# Number of debris features selected for the target islands.
len(geo_json_in_search)

In [None]:
# Peek at the first selected debris feature.
geo_json_in_search[:1]

In [None]:

# Find, for every tile, the debris boxes that lie strictly inside its bounding box.
tile_with_debris = []            # one record per (tile, debris) match
tile_without_debris = set()      # tile_name of every tile that contains no debris box
unique_debris_image_set = {}     # unique_pt_id -> s3_path of a matching tile (last match wins)
max_number_of_negative_images = 10000
stop_at_reached_at_max_negative = False
print("searching {}".format(len(geo_json_in_search)))


bucket_name = 'hawaii-marine-debris'

s3key_path_template = "niihau/jpg/{}"

downloaded_tile = set()
csv_rows = []

# A debris box does not depend on the tile being tested, so compute each one once
# instead of once per tile.
debris_with_boxes = [
    (feature, find_min_xy(feature['geometry']['coordinates'][0]))
    for feature in geo_json_in_search
]

print("Search for tile image based on annotation geojson")
for tile in tiles_features:
    x0,y0,x1,y1 = find_min_xy(tile['geometry']['coordinates'][0])
    has_debris = False

    for debris, target in debris_with_boxes:
        if x0 < target[0] and y0 < target[1] and  x1 > target[2] and y1 > target[3]:
            print("with debris--{} unique debris -- {} without debris--{} ".format(len(tile_with_debris),len(unique_debris_image_set), len(tile_without_debris)),end='\r')
            # The label comes from the debris being matched. It used to be read
            # from `d`, a loop variable left over from an earlier cell, which gave
            # every record the same label. The record is also no longer named
            # `json`, which shadowed the json module.
            annotation_record = {
                'label': debris['properties']['type'],
                'annotation': target,
                'image_s3': tile['properties']['s3_path'],
                'unique_pt_id': debris['properties']['unique_pt_id'],
            }
            tile_with_debris.append(annotation_record)
            unique_debris_image_set[debris['properties']['unique_pt_id']]=tile['properties']['s3_path']
            debris['s3_key']=tile['properties']['s3_path']
            has_debris = True

    if not has_debris:
        tile_without_debris.add(tile['properties']['tile_name'])
        if stop_at_reached_at_max_negative and len(tile_without_debris)>max_number_of_negative_images:
            break
    # Stop early once every debris has a tile and enough negative tiles were seen.
    if len(unique_debris_image_set)>=len(geo_json_in_search) and len(tile_without_debris)>max_number_of_negative_images:
        break

print("with debris--{} without debris--{}".format(len(tile_with_debris),len(tile_without_debris)),end='\r')
#just find the first one for now. and let's think about how to deal with second images later.

In [None]:
# max_number_of_negative_images = 10000
# tile_without_debris = tile_without_debris[:max_number_of_negative_images]

In [None]:
# Convert every debris-free tile to JPEG, make sure the JPEG is on S3, and record
# a CSV row with empty box/label fields for it.
s3_url_prefix = 'https://s3-us-west-2.amazonaws.com/hawaii-marine-debris/'

# tile_without_debris holds tile names, so the S3 path has to be looked up from
# the tile index. (The loop used to read `debris['image_s3']`, a variable left
# over from another cell, instead of the tile being processed.)
s3_path_by_tile_name = dict(
    (feature['properties']['tile_name'], feature['properties']['s3_path'])
    for feature in tiles_features)

for tif_name in tile_without_debris:
    s3filepath = s3_path_by_tile_name[tif_name].replace(s3_url_prefix, '')
    tmpfile = "{}/images/niihau/{}".format(root_folder, os.path.basename(s3filepath))

    if tmpfile not in downloaded_tile:
        s3_resource.Bucket(bucket_name).download_file(s3filepath, tmpfile)
        downloaded_tile.add(tmpfile)
    jpg_path = convert_tif_to_jpeg(tmpfile)
    s3_key = s3key_path_template.format(os.path.basename(jpg_path))

    # Upload only when the object is not in the bucket yet. The previous guard
    # re-tested `tmpfile not in downloaded_tile` right after adding it, so the
    # upload branch could never run.
    try:
        s3_client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except s3_client.exceptions.ClientError:
        upload_to_s3(jpg_path, s3_key)
    print("Uploaded %s: %s -> %s -> %s" % (len(csv_rows),tmpfile, jpg_path, s3_key), end='\r')
    csv_rows.append((s3_key,"","","","",""))

In [None]:
# Write the rows collected so far to negative_tiles.csv.
# pandas chooses the file mode and newline handling itself, so this runs on both
# Python 2 and Python 3 (csv.writer on a file opened with 'wb' raises TypeError
# on Python 3).
negative_csv_columns = ['s3_key','x0', 'y0','x1','y1', 'label']
pd.DataFrame(csv_rows, columns=negative_csv_columns).to_csv('negative_tiles.csv', index=False)

In [None]:
# Publish the negative-tile list next to the other Niihau files in the bucket.
s3_client.upload_file(
    Filename='negative_tiles.csv',
    Bucket=bucket_name,
    Key="niihau/negative_tiles.csv",
)

In [None]:

print("convert image to JPEG, and convert bounding box to pixels")

s3_url_prefix = 'https://s3-us-west-2.amazonaws.com/hawaii-marine-debris/'
# tmpfile -> (jpg_path, s3_key) for tiles already converted during this run, so
# a tile holding several debris is re-encoded and checked on S3 only once.
converted_tiles = {}

for debris in tile_with_debris:
    s3filepath = debris['image_s3'].replace(s3_url_prefix,'')
    tmpfile = "{}/images/niihau/{}".format(root_folder, os.path.basename(s3filepath))

    if tmpfile not in downloaded_tile:
        s3_resource.Bucket(bucket_name).download_file(s3filepath, tmpfile)
        downloaded_tile.add(tmpfile)

    pixel_coordinates = get_pixel_coordinates(tmpfile, debris['annotation'])

    if tmpfile not in converted_tiles:
        jpg_path = convert_tif_to_jpeg(tmpfile)
        s3_key = s3key_path_template.format(os.path.basename(jpg_path))
        # Upload only when the object is not in the bucket yet. The previous guard
        # re-tested `tmpfile not in downloaded_tile` right after adding it, so the
        # upload branch could never run.
        try:
            s3_client.head_object(
                Bucket=bucket_name,
                Key=s3_key
            )
        except s3_client.exceptions.ClientError:
            upload_to_s3(jpg_path, s3_key)
        converted_tiles[tmpfile] = (jpg_path, s3_key)

    jpg_path, s3_key = converted_tiles[tmpfile]
    print("Uploaded %s: %s -> %s -> %s" % (len(csv_rows),tmpfile, jpg_path, s3_key), end='\r')
    csv_rows.append((s3_key,)+ pixel_coordinates+(debris['label'],))
    #free up storage once it's uploaded
#     os.remove(jpg_path)
#     os.remove(tmpfile)

In [None]:
# Write the collected rows to annotated_marine_debris.csv.
# NOTE(review): csv_rows is the same list the negative-tile cell appends to, so
# those rows are written here as well unless csv_rows was reset in between --
# confirm that is intended.
print(len(csv_rows))

# pandas chooses the file mode and newline handling itself, so this runs on both
# Python 2 and Python 3 (csv.writer on a file opened with 'wb' raises TypeError
# on Python 3).
annotated_csv_columns = ['s3_key','x0', 'y0','x1','y1', 'label']
pd.DataFrame(csv_rows, columns=annotated_csv_columns).to_csv('annotated_marine_debris.csv', index=False)
        


In [None]:
# Publish the annotation CSV as niihau/annotations.csv in the bucket.
s3_client.upload_file(
    Filename='annotated_marine_debris.csv',
    Bucket=bucket_name,
    Key="niihau/annotations.csv",
)

In [None]:
# Combine the annotated and the debris-free tile lists into one table and publish it.
annotated = pd.read_csv('annotated_marine_debris.csv',encoding='utf-8')
not_annotated = pd.read_csv('negative_tiles.csv',encoding='utf-8')
# DataFrame has no .union() method. Stack the two tables and drop rows that appear
# in both, which gives the set-union the original call asked for.
all_tiles_df = pd.concat([annotated, not_annotated], ignore_index=True).drop_duplicates()
# index=False keeps the column layout identical to the two input CSVs.
all_tiles_df.to_csv("all_tiles.csv", encoding='utf-8', index=False)
s3_client.upload_file('all_tiles.csv', bucket_name, "niihau/all_tiles.csv")


In [None]:
# Second S3 connection, made with the legacy `boto` package (the rest of the
# notebook uses boto3). `bucket` is reused by the following cell.
import boto

s3 = boto.connect_s3(profile_name='hawaii')
bucket = s3.get_bucket('hawaii-marine-debris')

# Print the names returned for the 'lanai/' prefix with '/' as delimiter.
for o in bucket.list(prefix='lanai/', delimiter='/'):
    print(o.name)

In [None]:
import io
info_df=None
tile_img_with_debris = []
tile_img_without_debris = []

# Read every CSV found under the prefix into one table of tile extents.
lanai_csv_frames = []
for o in bucket.list(prefix='lanai/615/'):
    if o.name.endswith('.csv'):
        print(o.name)
        # Fetch the object that was just listed. The previous code always fetched
        # the hard-coded key 'lanai/2462/2462.tif.csv', which is not under this prefix.
        obj = s3_client.get_object(Bucket=bucket_name, Key=o.name)
        lanai_csv_frames.append(
            pd.read_csv(io.BytesIO(obj['Body'].read()),names=['img_name','x0','x1','y0','y1'],delimiter=';'))

# info_df stays None when no CSV was found, as before.
if lanai_csv_frames:
    info_df = pd.concat(lanai_csv_frames, ignore_index=True)

info_df

In [None]:
# Look for the first debris box that falls strictly inside one of the tiles in info_df.
# NOTE(review): the only island value used elsewhere in this notebook is "NI";
# confirm that "lanai" is the value actually stored in properties['island'].
debris_json = filter_debris_per_islands(["lanai"])
df_with_debris=None

for debris in debris_json:
    target = find_min_xy(debris['geometry']['coordinates'][0])
    # DataFrame.query evaluates the boolean expression against the columns.
    # DataFrame.filter, used previously, selects labels and never evaluated it.
    df_with_debris = info_df.query("x0 < {} and y0 <{} and x1 > {} and y1 > {}".format(target[0],target[1],target[2],target[3]))
    if len(df_with_debris)>0:
        break

df_with_debris

In [None]:
# s3key_path_template = "{island}/{img_folder}/{img_key}"

# downloaded_tile = set()
# csv_rows = []

# print("Search for tile image based on annotation geojson")
# for tile in tiles_features:
#     x0,y0,x1,y1 = find_min_xy(tile['geometry']['coordinates'][0]) 
#     has_debris = False
    
#     for debris in geo_json_in_search:
#         target = find_min_xy(debris['geometry']['coordinates'][0])
#         if x0 < target[0] and y0 < target[1] and  x1 > target[2] and y1 > target[3]:
#             print("with debris--{} without debris--{}".format(len(tile_with_debris),len(tile_without_debris)),end='\r')
#             json = {'label': d['properties']['type'],
#              'annotation': target,
#              'image_s3':tile['properties']['s3_path'],
#              'unique_pt_id': debris['properties']['unique_pt_id']
#             }
#             tile_with_debris.append(json)
#             debris['s3_key']=tile['properties']['s3_path']
#             has_debris = True

#     if not has_debris:
#         tile_without_debris.add(tile['properties']['tile_name'])
#         if stop_at_reached_at_max_negative and len(tile_without_debris)>max_number_of_negative_images:
#             break

# print("with debris--{} without debris--{}".format(len(tile_with_debris),len(tile_without_debris)),end='\r')

In [None]:
# # for all other islands, it's already converted to JPG.
# # and the coordinates are stored in folder/csv

# islands = ['oahu','lanai','kauai','hawaii']
# islands = ['lanai']

# for i in islands:
#     resp = s3_client.list_objects_v2(Bucket = 'hawaii-marine-debris',
#                                      Prefix='{island}/lanai/596/'.format(island=i), 
#                                     ) 
#     for obj in resp['Contents']:
#         print(obj)


In [None]:

# # then find out if any of the images is overlapped with debris
# for tile in tiles_features:

#     #local file path
#     tmpfile = "{}/images/niihau/{}".format(root_folder, os.path.basename(key))

#     if tmpfile not in downloaded_tile:
#         s3_resource.Bucket(bucket_name).download_file(key, tmpfile)
#         downloaded_tile.add(tmpfile)
#     #make sure none of the debris overlap it
#     found_debris = False
    
#     for debris in geo_json_in_search:
#         target = find_min_xy(debris['geometry']['coordinates'][0])
        
#             x0,y0,x1,y1 = find_min_xy(tile['geometry']['coordinates'][0]) 
#             if x0 < target[0] and y0 < target[1] and  x1 > target[2] and y1 > target[3]:
#                 json = {'label': d['properties']['type'],
#                  'annotation': target,
#                  'image_s3':tile['properties']['s3_path'],
#                 }
#                 tile_with_debris.append(json)
#                 print("found--{}".format(len(tile_with_debris)),end='\r')
#                 found_debris = True
#                 break
#         if found_debris:
#             break


#         pixel_coordinates = get_pixel_coordinates(tmpfile, debris['annotation'])
#         jpg_path = convert_tif_to_jpeg(tmpfile)    
#         s3_key = s3key_path_template.format(os.path.basename(jpg_path))
#         print("Uploaded %s: %s -> %s -> %s" % (len(csv_rows),tmpfile, jpg_path, s3_key), end='\r')
#         if tmpfile not in downloaded_tile:
#             try:
#                 s3_client.head_object(
#                     Bucket=bucket_name,
#                     Key=s3_key
#                 )
#             except:
#                 upload_to_s3(jpg_path, s3_key)
#         csv_rows.append((s3_key,)+ pixel_coordinates+(debris['label'],))

In [None]:
# Stray file name left in a code cell. As a bare expression it raised NameError
# (no variable called `niihau_tileindex` is defined), so it is kept as a comment.
# niihau_tileindex.csv

In [None]:
# Publish the annotation CSV as niihau/annotations.csv (same upload as the earlier cell).
s3_client.upload_file(
    Filename='annotated_marine_debris.csv',
    Bucket=bucket_name,
    Key="niihau/annotations.csv",
)