# Produce CSVs for new cities

Adapted from make_train_test_sets.ipynb

Use these CSVs with the CropMaker script to produce training and test sets

## March 2020 Updates
Modifications by Annie Lane

In [1]:
import os
# Display the kernel's working directory: the relative paths used further down
# (the labels CSV and the 'dataset_csvs/' output files) are resolved against it.
os.getcwd()

'/home/ec2-user/SageMaker/classify-streetview/crop-images'

In [5]:
import sys
#sys.path.append('/mnt/c/Users/gweld/sidewalk/sidewalk_ml/')
# Extend the import path before importing GSVutils (presumably the package
# lives in this checkout -- confirm).
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
sys.path.append('/home/ec2-user/SageMaker/sidewalk-cv-assets19')

import GSVutils.utils # Had to update line 384 of utils to remove error
from GSVutils.utils import GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT
from GSVutils.point import Point
import numpy as np
import csv
from collections import defaultdict
import random

In [13]:
# CSV export of labels to load. Exactly one assignment should be active;
# the commented-out lines are alternative exports.
#path_to_db_export = 'newberg-labels.csv'
#path_to_db_export = 'seattle-labels.csv'
path_to_db_export = 'mke-labels.csv'

In [36]:
# Peek at the first three lines of the export (header plus two data rows).
with open(path_to_db_export) as export_file:
    for _ in range(3):
        print(next(export_file))

gsv_panorama_id,sv_image_x,sv_image_y,label_type_id,photagrapher_heading,heading,is_researcher

stxXyCKAbd73DmkM2vsIHA,9676,-455,1,50.4137916565,253.375,t

stxXyCKAbd73DmkM2vsIHA,8203,-355,1,50.4137916565,200,t



# We'll start by loading all features into objects associated with their parent panos

In [37]:
class Feat(object):
    ''' A single labelled point inside a panorama.

        Built from a row laid out as
            [pano_id, sv_image_x, sv_image_y, label_type,
             photographer_heading, heading, is_researcher(, label_id)]
        The export file provides the first seven columns as strings;
        sliding_window() passes eight values with None for the unknown ones.
    '''
    def __init__(self, row):
        self.pano_id = row[0]
        self.sv_image_x = float(row[1])
        self.sv_image_y = float(row[2])
        self.label_type = int(row[3])
        self.photographer_heading = float(row[4]) if row[4] is not None else None
        self.heading = float(row[5]) if row[5] is not None else None
        self.is_researcher = row[6] == 't'  if row[6] is not None else None
        # to_row() and header_row() both expose a 'Label ID' column, so the
        # attribute must always exist. Seven-column export rows have no id.
        # NOTE(review): the eighth column is assumed to carry the label id
        # (sliding_window passes None there) -- confirm against the export.
        self.label_id = row[7] if len(row) > 7 else None

    def to_row(self):
        ''' Return the values to write to CSV, in header_row() order. '''
        return [
            self.pano_id,
            self.sv_image_x,
            self.sv_image_y,
            self.label_type,
            self.photographer_heading,
            self.heading,
            self.label_id,
        ]

    def point(self):
        ''' Return the (sv_image_x, sv_image_y) position as a Point. '''
        return Point( self.sv_image_x, self.sv_image_y )

    def __str__(self):
        # label_from_int is indexed from 0 while label types start at 1
        label = GSVutils.utils.label_from_int[self.label_type-1]
        return '{} at {}'.format(label, self.point() )

    @classmethod
    def header_row(cls):
        ''' Return the CSV column names matching to_row(). '''
        row = ['Pano ID','SV_x','SV_y','Label',
               'Photographer Heading','Heading','Label ID']
        return row

In [38]:
class Pano(object):
    ''' A panorama together with the features (Feat objects) labelled in it.

        feats maps each label type (1-4) to the list of features of that type.
        pano_id and photog_heading are taken from the first feature added.
    '''

    def __init__(self):
        self.feats = {1:[], 2:[], 3:[], 4:[]}
        self.pano_id        = None
        self.photog_heading = None

    def add_feature(self, row):
        ''' Build a Feat from row and file it under its label type.

            Raises AssertionError if the row belongs to a different pano and
            KeyError if its label type is not one of the keys of feats.
        '''
        feat = Feat(row)
        if self.pano_id is None:
            self.pano_id = feat.pano_id
        assert self.pano_id == feat.pano_id, \
            'feature for pano {} added to pano {}'.format(feat.pano_id, self.pano_id)

        if self.photog_heading is None:
            self.photog_heading = feat.photographer_heading

        self.feats[feat.label_type].append( feat )

    def __hash__(self):
        return hash( self.pano_id )

    def all_feats(self):
        ''' iterate over all features, regardless of type '''
        # .values() works on both Python 2 and 3 (iteritems() is Python 2 only)
        # and the label key was never used here.
        for features in self.feats.values():
            for feature in features:
                yield feature

    def __str__(self):
        s = 'pano{}\n'.format(self.pano_id)
        for feat in self.all_feats():
            s += '{}\n'.format(feat)
        return s

    def __len__(self):
        ''' return the total number of feats in this pano '''
        return sum( len(features) for features in self.feats.values() )

In [39]:
# counts: number of loaded features per label type
# panos:  pano id -> Pano holding that pano's features
counts = defaultdict(int)
panos = defaultdict( Pano )

with open(path_to_db_export) as dbfile:
    reader = csv.reader(dbfile)
    next(reader) # skip header

    for row in reader:
        pano_id = row[0]
        if len( pano_id ) < 2:
            # rows without a usable pano id are ignored
            continue

        label_type = int(row[3])
        if label_type in (1,2,3,4):
            # extract only ramp, missing ramps,
            # sfc probs, and obstructions
            # (Feat parses the remaining columns itself)
            panos[pano_id].add_feature( row )

            counts[label_type] += 1

# The closing parenthesis of this print call was missing, which made the
# whole cell a SyntaxError.
print("Loaded features from {} panos".format( len(panos) ))
for feature, count in sorted(counts.items()):
    # label_from_int is indexed from 0 while label types start at 1
    name = GSVutils.utils.label_from_int[feature-1]
    print("{:<20}{}".format(name, count))

Loaded features from 1098 panos
Curb Cut            1063
Missing Cut         566
Obstruction         395
Sfc Problem         513


# Now we'll slide a window over the panos

and assign labels to them based on proximity to true features

In [40]:
def sliding_window(pano, stride=100, bottom_space=1600, side_space=300, cor_thresh=70):
    ''' Walk a grid over a pano and yield one Feat per grid position.

        The grid starts at (side_space, 0); x advances by stride while it is
        below GSV_IMAGE_WIDTH - side_space, and y decreases by stride while it
        is above -(GSV_IMAGE_HEIGHT/2 - bottom_space).

        A position gets the label type of the first feature (in
        pano.all_feats() order) lying within cor_thresh of it, or 8 (null)
        when no feature is that close. Positions where features of different
        types are in range are NOT skipped: the first label found wins.

        try cor_thresh = stride/sqrt(2)
    '''
    NULL_LABEL = 8

    lowest_y = - (GSV_IMAGE_HEIGHT/2 - bottom_space)
    rightmost_x = GSV_IMAGE_WIDTH - side_space

    y = 0
    while y > lowest_y:
        x = side_space
        while x < rightmost_x:
            center = Point(x,y)

            # label types of every feature close enough to this position
            nearby = [candidate.label_type
                      for candidate in pano.all_feats()
                      if center.dist( candidate.point() ) <= cor_thresh]
            # first non-null label wins; otherwise this is a null crop
            label = next((t for t in nearby if t != NULL_LABEL), NULL_LABEL)

            yield Feat([pano.pano_id, x, y, label, pano.photog_heading, None,None,None])

            x += stride
        y -= stride # jump down a row

# We've got far too many nullcrops, now.

Let's throw out a bunch so we only have a few per pano

In [116]:
def cull_dataset_and_export_to_csv(dataset, file_to_write, nulls_per_pano=3):
    ''' Run the sliding window over every pano and write the result to a CSV.

        dataset:        sized collection of panos (len() is taken of it)
        file_to_write:  path of the CSV file to create or overwrite
        nulls_per_pano: maximum number of null (label 8) windows kept per
                        pano, picked at random; all labelled windows are kept

        Prints a per-label count of the rows written.
    '''
    print("Computing sliding window for {} panos".format( len(dataset) ))
    feats = defaultdict(set)

    panocount = 0
    for pano in dataset:
        nulls = []
        for feat in sliding_window(pano):
            if feat.label_type == 8:
                nulls.append(feat)
            else:
                feats[feat.label_type].add( feat )
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more nulls than this pano produced.
        nulls_to_keep = random.sample(nulls, min(nulls_per_pano, len(nulls)))
        for feat in nulls_to_keep:
            feats[feat.label_type].add( feat )
        panocount += 1

    print('{:<18}{}'.format('Feature Type', 'Count'))

    with open(file_to_write, 'w') as f:
        writer = csv.writer(f)
        writer.writerow( Feat.header_row() )
        for label, fs in feats.items():
            # label_from_int is indexed from 0; 8 is the null label and has
            # its own display name here
            label_t = GSVutils.utils.label_from_int[label-1] if label != 8 else 'Nullcrop'
            for feat in fs:
                writer.writerow( feat.to_row() )
            print('{:<18}{}'.format(label_t, len(fs)))
    print("Wrote features from {} panos to {}".format(panocount, file_to_write))

In [117]:

# NOTE(review): `datasets` is not defined in any cell visible in this notebook.
# The loop needs a {name: sized collection of Pano} mapping -- confirm where it
# comes from before a fresh Restart & Run All.
output_dir = 'dataset_csvs'
# open(..., 'w') does not create missing directories, so make sure it exists.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for name, dataset in datasets.items():
    print('Processing {} set'.format(name))
    cull_dataset_and_export_to_csv(dataset, os.path.join(output_dir, '{}.csv'.format(name)))

Processing Test set
Computing sliding window for 5797 panos
Feature Type      Count
Nullcrop          17391
Curb Cut          19451
Missing Cut       2616
Obstruction       3314
Sfc Problem       1288
Wrote features from 5797 panos to dataset_csvs/Test.csv
Processing Train set
Computing sliding window for 46463 panos
Feature Type      Count
Nullcrop          139389
Curb Cut          159039
Missing Cut       21552
Obstruction       25162
Sfc Problem       9933
Wrote features from 46463 panos to dataset_csvs/Train.csv
Processing Val set
Computing sliding window for 5774 panos
Feature Type      Count
Nullcrop          17322
Curb Cut          19538
Missing Cut       2685
Obstruction       3153
Sfc Problem       1205
Wrote features from 5774 panos to dataset_csvs/Val.csv


# Now we can use set_cropper.py to write these to a directory