In [1]:
from __future__ import print_function

from osgeo import gdal
import rasterio
import pandas as pd
import os 
import boto3


### Prepare data
- Download the source files from S3 and apply light preprocessing.

In [2]:

# Root directory for every file this notebook downloads or generates.
# The original cell assigned os.getcwd() and immediately overwrote it, so that
# dead assignment is gone. MARINE_DEBRIS_DATA_ROOT optionally overrides the
# default so the notebook is not tied to one machine's directory layout.
root_folder = os.environ.get("MARINE_DEBRIS_DATA_ROOT", "/home/paperspace/data")
print(root_folder)

/home/paperspace/data


In [3]:
# Create the local working directories.
# The mode has to be the octal literal 0o755 (rwxr-xr-x): the decimal 755 used
# before equals 0o1363 and sets unintended permission bits.
# exist_ok=True keeps the cell re-runnable while letting real failures (for
# example a permission error) surface instead of being reported as
# "folder already exists".
for _subdir in ("images/niihau", "niihau"):
    _path = "{}/{}".format(root_folder, _subdir)
    if os.path.isdir(_path):
        print("folder already exists")
    os.makedirs(_path, mode=0o755, exist_ok=True)


folder already exists
folder already exists


In [4]:
# --- S3 session and notebook-wide settings ---
not_default_keys = True

import boto3
boto3.setup_default_session(profile_name='hawaii')
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
version = 'v2'

bucket_name = 'hawaii-marine-debris'
file_path_template = "{}/jpg/{}"
downloaded_tile = set()
islands = ['niihau']  # 'lanai','bigisland',''
local_tmp_folder = "/tmp/{}"

annotatino_s3key_path = "{island}/annotations.csv"
geojson_keys = [
    "niihau/final_marine_debris_database_NI_UTM4N_boxes.geojson",
    "niihau/niihau_tileindex.geojson",
]

# Fetch each geojson file into root_folder under its own base name, then the
# de-duplicated annotation table into the niihau sub-folder.
_bucket = s3_resource.Bucket(bucket_name)
for _key in geojson_keys:
    _bucket.download_file(_key, "{}/{}".format(root_folder, os.path.basename(_key)))
_bucket.download_file("niihau/deduped_annotations.csv", "{}/niihau/annotations.csv".format(root_folder))



In [6]:

# Download the negative-tile table (S3 key niihau/negative_tiles.csv) into root_folder.
s3_resource.Bucket(bucket_name).download_file("niihau/negative_tiles.csv", "{}/negative_tiles.csv".format(root_folder))


In [9]:
# Load the downloaded negative-tile table and preview its first five rows.
negative_tiles_df = pd.read_csv("{}/negative_tiles.csv".format(root_folder))
negative_tiles_df[:5]

Unnamed: 0,s3_key,x0,y0,x1,y1,label
0,niihau/jpg/2_562_09_14.jpg,,,,,
1,niihau/jpg/1_1463_21_30.jpg,,,,,
2,niihau/jpg/1_361_29_20.jpg,,,,,
3,niihau/jpg/2_153_27_05.jpg,,,,,
4,niihau/jpg/1_521_30_19.jpg,,,,,


In [6]:
# Download the CSV form of the tile index into root_folder.
s3_resource.Bucket(bucket_name).download_file("niihau/niihau_tileindex.csv", "{}/niihau_tileindex.csv".format(root_folder))


In [7]:
# Load the tile index and show how many rows it has.
tile_index_df = pd.read_csv("{}/niihau_tileindex.csv".format(root_folder))
len(tile_index_df)

116158

In [8]:
# Downloads the same S3 key to the same local path as the setup cell earlier in
# the notebook, so this cell also works when run on its own.
s3_resource.Bucket(bucket_name).download_file("niihau/deduped_annotations.csv", "{}/niihau/annotations.csv".format(root_folder))

existing_annotation_df = pd.read_csv("{}/niihau/annotations.csv".format(root_folder))
# Derive a local image path per row by rewriting the 'niihau/jpg' prefix of the S3 key.
# NOTE(review): this points at <root_folder>/niihau/images, whereas the tile loops in
# this notebook write images under <root_folder>/images/niihau — confirm which layout
# is intended.
existing_annotation_df['file_path'] = existing_annotation_df['s3_key'].replace({'niihau/jpg': '{}/niihau/images'.format(root_folder)}, inplace=False, regex=True)
existing_annotation_df[:5]

Unnamed: 0,s3_key,x0,y0,x1,y1,label,file_path
0,niihau/jpg/1_1920_04_07.jpg,474,667,524,617,P,/home/paperspace/data//niihau/images/1_1920_04...
1,niihau/jpg/1_1920_06_12.jpg,940,393,990,343,B,/home/paperspace/data//niihau/images/1_1920_06...
2,niihau/jpg/1_1920_07_13.jpg,242,529,292,479,P,/home/paperspace/data//niihau/images/1_1920_07...
3,niihau/jpg/1_1920_07_12.jpg,638,801,738,701,N,/home/paperspace/data//niihau/images/1_1920_07...
4,niihau/jpg/1_1920_07_13.jpg,187,933,237,883,B,/home/paperspace/data//niihau/images/1_1920_07...


In [21]:
def download_from_s3(s3_key):
    """Download ``s3_key`` from the notebook's bucket into the local temp folder.

    The local file name is the key's base name substituted into the
    ``local_tmp_folder`` template. Returns the resulting local path.
    """
    target_path = local_tmp_folder.format(os.path.basename(s3_key))
    bucket = s3_resource.Bucket(bucket_name)
    bucket.download_file(s3_key, target_path)
    return target_path

def convert_tif_to_jpeg(tiff_image_path):
    """Write a JPEG copy of a TIFF raster next to it and return the JPEG path.

    Args:
        tiff_image_path: Path of the source raster, opened with rasterio.

    Returns:
        Path of the written JPEG: the source path with its extension replaced
        by ``.jpg``.

    Only the file extension is rewritten. The previous ``str.replace`` call
    changed every ``.tif`` occurrence in the path, including directory names,
    and left the path unchanged (so the source would be overwritten) when the
    file did not end in ``.tif``.
    """
    jpg_path = os.path.splitext(tiff_image_path)[0] + ".jpg"
    # Open the source in a context manager so the dataset handle is always
    # released; it was previously left open.
    with rasterio.open(tiff_image_path) as src:
        data = src.read()
        profile = src.profile
    # Force the pixel values to 8-bit without copying when they already are.
    result = data.astype(rasterio.uint8, casting='unsafe', copy=False)
    profile.update(driver='jpeg')
    with rasterio.open(jpg_path, 'w', **profile) as dst:
        dst.write(result)
    return jpg_path
                
def upload_to_s3(local_file_path, s3_path):
    """Upload the local file to key ``s3_path`` in the notebook's bucket."""
    s3_client.upload_file(Filename=local_file_path, Bucket=bucket_name, Key=s3_path)

def find_min_xy(coordinates):
    """Return the axis-aligned bounding box of a sequence of points.

    Each point is read as ``point[0]`` (x) and ``point[1]`` (y); any further
    components are ignored. The result is ``(min_x, min_y, max_x, max_y)``.
    """
    xs = [point[0] for point in coordinates]
    ys = [point[1] for point in coordinates]
    return min(xs), min(ys), max(xs), max(ys)


def get_pixel_coordinates(image_path, coordinates):
    """Convert a geo-referenced box to pixel indices of the raster at ``image_path``.

    Args:
        image_path: Path of a raster that GDAL can open.
        coordinates: ``(x0, y0, x1, y1)`` in the raster's coordinate system.

    Returns:
        ``(x0p, y0p, x1p, y1p)`` as ints, computed from the raster's
        geotransform origin and pixel size.

    Raises:
        IOError: if GDAL cannot open ``image_path``.

    Only the geotransform is needed, so the band is no longer read into
    memory (the array was never used), and the unused driver lookup and
    raster-size reads are gone.

    NOTE(review): rows are measured downward from ``yOrigin``, so for a
    north-up raster the smaller geographic y maps to the larger pixel row;
    with ``y0 < y1`` on input the result typically has ``y0p > y1p``. Confirm
    that downstream consumers expect this ordering.
    """
    dataset = gdal.Open(image_path)
    if dataset is None:
        # gdal.Open returns None on failure unless GDAL exceptions are enabled.
        raise IOError("could not open raster {}".format(image_path))

    transform = dataset.GetGeoTransform()

    xOrigin = transform[0]
    yOrigin = transform[3]
    pixelWidth = transform[1]
    # transform[5] is negated here so that pixelHeight is the value rows are divided by.
    pixelHeight = -transform[5]

    x0, y0, x1, y1 = coordinates
    x0p = int((x0 - xOrigin) / pixelWidth)
    y0p = int((yOrigin - y0 ) / pixelHeight)
    x1p = int((x1 - xOrigin) / pixelWidth)
    y1p = int((yOrigin - y1 ) / pixelHeight)
    return x0p, y0p, x1p, y1p

### Iterate through the GeoJSON to find the annotations and their overlap with the newly generated tiles


In [10]:
#download all necessary geojson files:


In [11]:
import json
from pprint import pprint

# List of all debris annotated by UoH.
debris_geojson_path = "{}/final_marine_debris_database_NI_UTM4N_boxes.geojson".format(root_folder)
with open(debris_geojson_path) as debris_file:
    geo_json = json.load(debris_file)
debris_boxes = geo_json['features']

# Index of the generated tiles.
tile_geojson_path = "{}/niihau_tileindex.geojson".format(root_folder)
with open(tile_geojson_path) as tile_index_file:
    tiles_geo_json = json.load(tile_index_file)
tiles_features = tiles_geo_json['features']


In [12]:
# Collect the distinct island codes that appear in the debris annotations.
islands = {box['properties']['island'] for box in debris_boxes}
print(islands)

{'NI'}


In [20]:
#filter out specific data that we are looking for

# Every debris type code seen while filtering (filled by filter_debris_per_islands).
set_of_objs = set()
target_islands = ["NI"]
def filter_debris_per_islands(target_islands=None):
    """Return the debris features whose island code is in ``target_islands``.

    Reads the module-level ``debris_boxes`` and, as a side effect, records
    every type code it sees in the module-level ``set_of_objs``.

    Args:
        target_islands: Container of island codes to keep. ``None`` (the
            default) keeps nothing, like the former ``[]`` default, without
            sharing a mutable default between calls.

    Returns:
        List of the selected feature dicts. They are the same objects as in
        ``debris_boxes``; their ``properties['type']`` is upper-cased in place.

    Type codes are upper-cased before being added to ``set_of_objs`` for every
    feature. Previously features outside the target islands were recorded
    with their raw casing, so the same code could appear twice.
    """
    objects = ['T','V','N','B','M','P','C','L','F','W','O']
    wanted = target_islands if target_islands is not None else ()
    geo_json_in_search = []

    for d in debris_boxes:
        props = d['properties']
        debris_type = props['type'].upper()
        if debris_type not in objects:
            print("unknown object {}".format(props['type']))
        if props['island'] in wanted:
            props['type'] = debris_type
            geo_json_in_search.append(d)
        set_of_objs.add(debris_type)

    print("# of debris :{}".format(len(geo_json_in_search)))
    return geo_json_in_search
geo_json_in_search = filter_debris_per_islands(target_islands)
print(set_of_objs)
# Debris categories:
# B = Buoys and floats
# C = Cloth	
# F = Foam 
# L = Line (single pieces of rope, not net)
# M = Metal
# N = Net
# P = Plastic
# T = Tire
# W = Processed wood
# V = Vessel
# O = Other 

# of debris :7871
{'B', 'C', 'N', 'M', 'P', 'V', 'F', 'T', 'L', 'W', 'O'}


In [21]:
# Number of debris features kept by the island filter.
len(geo_json_in_search)

7871

In [22]:
# Show the first selected feature to inspect its properties and geometry layout.
geo_json_in_search[:1]

[{'type': 'Feature',
  'properties': {'unique_pt_id': 'NI-001-0001',
   'island': 'NI',
   'segment': 1,
   'pt_id': 1,
   'lat': 21.996464,
   'long': -160.061616,
   'type': 'P',
   'size': 1,
   'min_size_meters': 0.01,
   'max_size_meters': 0.5,
   'observer': 'AO',
   'comment': None,
   'max_size_s': '0.5'},
  'geometry': {'type': 'Polygon',
   'coordinates': [[[390411.34116614977, 2432815.3680491154],
     [390412.34116614977, 2432815.3680491154],
     [390412.34116614977, 2432816.3680491154],
     [390411.34116614977, 2432816.3680491154],
     [390411.34116614977, 2432815.3680491154]]]}}]

In [None]:

#find corresponding tile
tile_with_debris = []
tile_without_debris = set()
unique_debris_image_set = {}
max_number_of_negative_images = 10000
stop_at_reached_at_max_negative = False
print("searching {}".format(len(geo_json_in_search)))


bucket_name = 'hawaii-marine-debris'

s3key_path_template = "niihau/jpg/{}"

downloaded_tile = set()
csv_rows = []
unique_set = True

# total_tile was never defined in the notebook, so this cell raised NameError on
# a fresh kernel. pct is initialised so the final print works with no tiles.
total_tile = len(tiles_features)
pct = 0.0

# A debris bounding box does not depend on the tile, so compute each one once
# instead of once per (tile, debris) pair.
debris_bounds = [(debris, find_min_xy(debris['geometry']['coordinates'][0]))
                 for debris in geo_json_in_search]

print("Search for tile image based on annotation geojson")
for idx, tile in enumerate(tiles_features):
    x0,y0,x1,y1 = find_min_xy(tile['geometry']['coordinates'][0])
    tile_s3_path = tile['properties']['s3_path']
    has_debris = False
    pct = (float(idx)/total_tile)*100
    for debris, target in debris_bounds:
        # A debris box matches when it lies strictly inside the tile's box.
        if x0 < target[0] and y0 < target[1] and  x1 > target[2] and y1 > target[3]:
            print("with debris--{0:} unique debris -- {1:} without debris--{2:} pct done -- {3:.2f}%"\
                  .format(len(tile_with_debris),len(unique_debris_image_set), len(tile_without_debris),pct),end='\r')
            # Named `match`, not `json`, so the json module imported earlier in
            # the notebook is not shadowed.
            match = {'label': debris['properties']['type'],
             'annotation': target,
             'image_s3': tile_s3_path,
             'unique_pt_id': debris['properties']['unique_pt_id']
            }
            tile_with_debris.append(match)
            # The last matching tile wins for each debris id; the conversion
            # cell uses this mapping to skip duplicate matches.
            unique_debris_image_set[debris['properties']['unique_pt_id']] = tile_s3_path
            debris['s3_key'] = tile_s3_path
            has_debris = True

    if not has_debris:
        tile_without_debris.add(tile_s3_path)
        if stop_at_reached_at_max_negative and len(tile_without_debris)>max_number_of_negative_images:
            break
    # Stop early once every debris has a tile and enough negative tiles were collected.
    if len(unique_debris_image_set)>=len(geo_json_in_search) and len(tile_without_debris)>max_number_of_negative_images:
        break

# just find the first one for now. and let's think about how to deal with second images later.
print("with debris--{0:} unique debris -- {1:} without debris--{2:} pct done -- {3:.2f}%"\
                  .format(len(tile_with_debris),len(unique_debris_image_set), len(tile_without_debris),pct),end='\r')



searching 7871
Search for tile image based on annotation geojson
with debris--103 unique debris -- 79 without debris--5036 pct done -- 4.39%

In [163]:
# Re-print the final counters of the tile search (same format as the in-loop progress line).
print("with debris--{0:} unique debris -- {1:} without debris--{2:} pct done -- {3:.2f}%"\
                  .format(len(tile_with_debris),len(unique_debris_image_set), len(tile_without_debris),pct),end='\r')

with debris--10674 unique debris -- 7785 without debris--112934 pct done -- 100.00%

In [164]:
# Show the first tile feature to inspect its properties and geometry layout.
tiles_features[:1]

[{'type': 'Feature',
  'properties': {'tile_name': '1_10_07_01.tif',
   'directory': '1_10',
   's3_path': 'https://s3-us-west-2.amazonaws.com/hawaii-marine-debris/niihau/1_10/1_10_07_01.tif'},
  'geometry': {'type': 'Polygon',
   'coordinates': [[[375494.86, 2409243.12],
     [375514.86, 2409243.12],
     [375514.86, 2409223.12],
     [375494.86, 2409223.12],
     [375494.86, 2409243.12]]]}}]

In [165]:
# we don't need all of them.
import random
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# from 3.11), so sample from a sorted list. Sorting also gives the population a
# stable order, so the draw is repeatable if the random module is seeded.
# The sample size is capped at the population size because random.sample()
# raises ValueError when asked for more items than exist.
_negative_pool = sorted(tile_without_debris)
tile_without_debris_to_save = random.sample(_negative_pool, min(20000, len(_negative_pool)))

In [166]:
# len(csv_rows)# csv_rows = []


0

In [72]:
# # os.path.isfile
# csv_rows[:2]
# downloaded_tile=set()

In [23]:
# Make sure every JPEG listed in the negative-tile table exists in the bucket,
# uploading the local copy when it does not.
for i, row in negative_tiles_df.iterrows():
    s3_key = row.s3_key
    # Built from root_folder like the other cells, not from a hard-coded absolute path.
    jpg_path = "{}/images/niihau/{}".format(root_folder, os.path.basename(row.s3_key))
    print(i, end='\r')
    try:
        s3_client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except s3_client.exceptions.ClientError as err:
        # Only a "not found" answer means the object is missing. Anything else
        # (credentials, permissions, throttling) is re-raised instead of being
        # mistaken for a missing object, as the former bare `except:` did.
        if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchKey', 'NotFound'):
            raise
        upload_to_s3(jpg_path, s3_key)
        

12654

In [None]:
#for negative images
# Download each sampled debris-free tile, convert it to JPEG, upload the JPEG
# if the bucket does not have it yet, and record a row with empty box columns.

for tif_name in tile_without_debris_to_save:
    s3filepath = tif_name.replace('https://s3-us-west-2.amazonaws.com/hawaii-marine-debris/','')
    tmpfile = "{}/images/niihau/{}".format(root_folder, os.path.basename(s3filepath))

    if tmpfile not in downloaded_tile:
        s3_resource.Bucket(bucket_name).download_file(s3filepath, tmpfile)
        downloaded_tile.add(tmpfile)
    jpg_path = convert_tif_to_jpeg(tmpfile)
    s3_key = s3key_path_template.format(os.path.basename(jpg_path))
    print("Uploaded %s: %s -> %s -> %s" % (len(csv_rows),tmpfile, jpg_path, s3_key), end='\r')
    # Skip the upload when the JPEG was already converted and uploaded.
    try:
        s3_client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except s3_client.exceptions.ClientError as err:
        # Only a "not found" answer means the object is missing. Other errors
        # are re-raised instead of being swallowed by a bare `except:`.
        if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchKey', 'NotFound'):
            raise
        upload_to_s3(jpg_path, s3_key)
    csv_rows.append((s3_key,"","","","",""))
    

In [77]:
# Drop duplicate rows; going through a set means the resulting row order is arbitrary.
csv_rows = list(set(csv_rows))
len(csv_rows)

12655

In [79]:
import csv

# Write the negative-tile rows to negative_tiles.csv in the working directory.
# The csv module expects files opened with newline=''; without it, platforms
# that translate line endings get an extra blank line after every row.
with open('negative_tiles.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['s3_key','x0', 'y0','x1','y1', 'label'])
    csv_out.writerows(csv_rows)

In [80]:
# Upload the table under the key niihau/negative_tiles_v2.csv. This is a different
# key from niihau/negative_tiles.csv, which an earlier cell downloads.
s3_client.upload_file('negative_tiles.csv', bucket_name, "niihau/negative_tiles_v2.csv")

In [167]:
# Debris properties indexed by unique point id.
debris_lookup = {
    feature['properties']['unique_pt_id']: feature['properties']
    for feature in geo_json_in_search
}
# Debris properties indexed by the base name of the matched tile; only features
# that carry an 's3_key' entry are included.
debris_s3_lookup = {
    os.path.basename(feature['s3_key']): feature['properties']
    for feature in geo_json_in_search
    if 's3_key' in feature
}
# debris_s3_lookup

In [168]:
# Number of (tile, debris) matches, counting a debris once per matching tile.
len(tile_with_debris)

10674

In [169]:

print("convert image to JPEG, and convert bounding box to pixels")

# Start from an empty row list. On a top-to-bottom run csv_rows still holds the
# negative-tile rows appended by the earlier cell, and they would otherwise end
# up in tiles_with_debris.csv as well.
csv_rows = []

for idx, debris in enumerate(tile_with_debris):
    s3filepath = debris['image_s3'].replace('https://s3-us-west-2.amazonaws.com/hawaii-marine-debris/','')
    tmpfile = "{}/images/niihau/{}".format(root_folder, os.path.basename(s3filepath))
    # Keep only the tile recorded for this debris id, so a debris that matched
    # several tiles is emitted once.
    if not unique_debris_image_set[debris['unique_pt_id']] == debris['image_s3']:
        print('{} skip to avoid duplicate debris'.format(idx), end='\r')
        continue

    # Download whenever the TIFF is missing on disk. The former extra check
    # against downloaded_tile skipped the download for a file that had been
    # fetched earlier but has since been removed.
    if not os.path.isfile(tmpfile):
        s3_resource.Bucket(bucket_name).download_file(s3filepath, tmpfile)
        downloaded_tile.add(tmpfile)

    pixel_coordinates = get_pixel_coordinates(tmpfile, debris['annotation'])
    jpg_path = tmpfile.replace(".tif", ".jpg")
    if not os.path.isfile(jpg_path):
        jpg_path = convert_tif_to_jpeg(tmpfile)

    s3_key = s3key_path_template.format(os.path.basename(jpg_path))
    print("%s Uploaded %s: %s -> %s -> %s" % (idx, len(csv_rows),tmpfile, jpg_path, s3_key), end='\r')
    # Upload the JPEG unless the bucket already has it. The previous guard
    # required the JPEG to be absent locally, which can never hold right after
    # the conversion above, so nothing was ever uploaded from this cell.
    try:
        s3_client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except s3_client.exceptions.ClientError as err:
        # Only a "not found" answer means the object is missing; re-raise the rest.
        if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchKey', 'NotFound'):
            raise
        upload_to_s3(jpg_path, s3_key)
    csv_rows.append((s3_key,)+ pixel_coordinates+(debris['label'],))
    #free up storage once it's uploaded
#     os.remove(jpg_path)
#     os.remove(tmpfile)

convert image to JPEG, and convert bounding box to pixels
10673 Uploaded 7784: /home/paperspace/data//images/niihau/2_940_37_04.tif -> /home/paperspace/data//images/niihau/2_940_37_04.jpg -> niihau/jpg/2_940_37_04.jpgpg

In [170]:
# Drop duplicate rows; going through a set means the resulting row order is arbitrary.
csv_rows = list(set(csv_rows))
len(csv_rows)

7754

In [171]:
import csv

# Write the debris rows to tiles_with_debris.csv in the working directory.
# The csv module expects files opened with newline=''; without it, platforms
# that translate line endings get an extra blank line after every row.
with open('tiles_with_debris.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['s3_key','x0', 'y0','x1','y1', 'label'])
    csv_out.writerows(csv_rows)


In [172]:
# Upload the debris table to the bucket under niihau/tiles_with_debris.csv.
s3_client.upload_file('tiles_with_debris.csv', bucket_name, "niihau/tiles_with_debris.csv")


In [174]:
# Read back both tables from the working directory, where the csv-writing cells put them
# (not from root_folder, where the S3 copy of negative_tiles.csv is downloaded).
positive_df = pd.read_csv('tiles_with_debris.csv', encoding='utf-8')
negative_df = pd.read_csv('negative_tiles.csv', encoding='utf-8')


In [177]:
# Preview the first five debris rows.
positive_df[:5]

Unnamed: 0,s3_key,x0,y0,x1,y1,label
0,niihau/jpg/2_720_03_01.jpg,308,524,358,474,F
1,niihau/jpg/2_1544_35_13.jpg,562,591,862,291,N
2,niihau/jpg/2_1699_22_04.jpg,215,730,265,680,B
3,niihau/jpg/1_1920_08_16.jpg,367,284,417,234,B
4,niihau/jpg/2_1189_39_18.jpg,773,545,823,495,B


In [180]:
# Stack the debris rows on top of the negative rows and save the combined table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with its defaults keeps the original row labels and column order, as append did.
all_tiles_df = pd.concat([positive_df, negative_df])
all_tiles_df.to_csv("all_tiles.csv", encoding='utf-8')


In [181]:
# Upload the combined table to the bucket under niihau/all_tiles.csv.
s3_client.upload_file('all_tiles.csv', bucket_name, "niihau/all_tiles.csv")


In [None]:
# List the entries directly under the 'lanai/' prefix of the bucket.
# NOTE(review): this cell uses the legacy `boto` package, while the rest of the
# notebook uses `boto3` (s3_client / s3_resource). The next cell relies on the
# `bucket` object created here, so both need to be migrated together.
import boto

s3 = boto.connect_s3(profile_name='hawaii')
bucket = s3.get_bucket('hawaii-marine-debris')

for o in bucket.list(prefix='lanai/', delimiter='/'):
    print(o.name)

In [None]:
import io
info_df=None
tile_img_with_debris = []
tile_img_without_debris = []
# Load the per-tile extent CSV found under the prefix. If several CSV objects
# are listed, info_df ends up holding the last one.
for o in bucket.list(prefix='lanai/615/'):
    if o.name.endswith('.csv'):
        print(o.name)
        # Read the object that was just listed. The key used to be the fixed
        # string 'lanai/2462/2462.tif.csv', which ignored o.name and does not
        # even sit under the listed prefix.
        obj = s3_client.get_object(Bucket=bucket_name, Key=o.name)
        info_df = pd.read_csv(io.BytesIO(obj['Body'].read()),names=['img_name','x0','x1','y0','y1'],delimiter=';')

info_df

In [None]:
# NOTE(review): the annotations loaded earlier in this notebook use island codes
# such as 'NI'; confirm that "lanai" is the code used in the data, otherwise
# this filter selects nothing.
debris_json = filter_debris_per_islands(["lanai"])
df_with_debris=None

# Find the first debris whose box lies strictly inside at least one tile extent.
for debris in debris_json:
    target = find_min_xy(debris['geometry']['coordinates'][0])
    # DataFrame.filter selects row/column labels and does not evaluate a
    # condition string, so the old call never tested the coordinates. Use an
    # explicit boolean mask over the extent columns instead.
    contains_debris = (
        (info_df['x0'] < target[0])
        & (info_df['y0'] < target[1])
        & (info_df['x1'] > target[2])
        & (info_df['y1'] > target[3])
    )
    df_with_debris = info_df[contains_debris]
    if len(df_with_debris)>0:
        break

df_with_debris

In [None]:
# Upload a local annotated_marine_debris.csv to the key niihau/annotations.csv.
# NOTE(review): no cell in the visible notebook creates annotated_marine_debris.csv —
# confirm where it comes from before running this.
s3_client.upload_file('annotated_marine_debris.csv', bucket_name, "niihau/annotations.csv")