# Crop ROI Labels

Use the labeling information to split into train/val/test and then assign the relevant label

Assume that a label exists in an image if it's at least 10 pixels away from the horizontal edges


In [6]:
import pandas as pd
import numpy as np
import os
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

import matplotlib.pyplot as plt


pd.set_option('max_colwidth', -1)

# Load Reference CSVs

## Labels and Crops Info

In [7]:
SAGEMAKER_PATH = r'/home/ec2-user/SageMaker'
SPLIT_PATH = os.path.join(SAGEMAKER_PATH, 'classify-streetview', 'split-train-test')
MINI_PATH = os.path.join(SAGEMAKER_PATH, 'classify-streetview', 'mini-crops')

In [15]:
df_crops = pd.read_csv(os.path.join(MINI_PATH, 'Crops_with_Labels.csv'))
print(df_crops.shape)
print(df_crops.columns)
df_crops.head()

(4978, 36)
Index(['filename', 'file_size', 'file_attributes', 'region_count', 'region_id',
       'region_shape_attributes', 'region_attributes', 'img_id',
       'present_ramp', 'missing_ramp', 'obstacle', 'surface_prob',
       'no_sidewalk', 'null', 'sv_image_x', 'sv_image_y',
       'sv_image_y_bottom_origin', 'label_name', 'crop_num', 'corner_x',
       'corner_y', 'size', 'x_crop_left', 'x_crop_right', 'y_crop_top',
       'y_crop_bottom', 'margin', 'x_roi_left', 'x_roi_right', 'y_roi_top',
       'y_roi_bottom', 'xpt_minus_xleft', 'xright_minus_xpt', 'ypt_minus_ytop',
       'ybottom_minus_ypt', 'label_in_crop'],
      dtype='object')


Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes,img_id,present_ramp,missing_ramp,...,margin,x_roi_left,x_roi_right,y_roi_top,y_roi_bottom,xpt_minus_xleft,xright_minus_xpt,ypt_minus_ytop,ybottom_minus_ypt,label_in_crop
0,680_45.jpg,49558,{},3,0,"{""name"":""point"",""cx"":108,""cy"":389}","{""Present Curb Ramp"":""1\n""}",680,True,False,...,10,15,175,330,490,103,77,69,111,True
1,680_45.jpg,49558,{},3,1,"{""name"":""point"",""cx"":160,""cy"":389}","{""Present Curb Ramp"":""1""}",680,True,False,...,10,15,175,330,490,155,25,69,111,True
2,680_135.jpg,51194,{},6,0,"{""name"":""point"",""cx"":18,""cy"":475}","{""Present Curb Ramp"":""1""}",680,True,False,...,10,15,175,330,490,13,167,155,25,True
3,680_225.jpg,47450,{},4,0,"{""name"":""point"",""cx"":179,""cy"":411}","{""Surface Problem"":""1""}",680,False,False,...,10,15,175,330,490,174,6,91,89,True
4,1042_45.jpg,45436,{},3,1,"{""name"":""point"",""cx"":90,""cy"":380}","{""Obstacle"":""1""}",1042,False,False,...,10,15,175,330,490,85,95,60,120,True


In [16]:
df_crops['label_in_crop'].value_counts()

True    4978
Name: label_in_crop, dtype: int64

In [19]:
df_crops['heading'] = df_crops['filename'].str.extract('(.*)_(.*).jpg', expand = True)[1]

In [20]:
crops_cols = ['img_id', 'heading', 'crop_num', 'present_ramp', 'missing_ramp', 'label_name', 
              'sv_image_x', 'sv_image_y', 'xpt_minus_xleft', 'xright_minus_xpt', 'ypt_minus_ytop', 'ybottom_minus_ypt']
df_crops = df_crops[crops_cols]
df_crops.head()

Unnamed: 0,img_id,heading,crop_num,present_ramp,missing_ramp,label_name,sv_image_x,sv_image_y,xpt_minus_xleft,xright_minus_xpt,ypt_minus_ytop,ybottom_minus_ypt
0,680,45,A,True,False,Present Curb Ramp,108,389,103,77,69,111
1,680,45,A,True,False,Present Curb Ramp,160,389,155,25,69,111
2,680,135,A,True,False,Present Curb Ramp,18,475,13,167,155,25
3,680,225,A,False,False,Surface Problem,179,411,174,6,91,89
4,1042,45,A,False,False,Obstacle,90,380,85,95,60,120


In [29]:
# Add a column for jpg_name
df_crops['jpg_name'] = df_crops['img_id'].astype(str) + '_' + df_crops['heading'].astype(str) + '_' + df_crops['crop_num'].astype(str) + '.jpg'

In [30]:
df_crops['label_name'].value_counts()

Present Curb Ramp    3014
Surface Problem      774 
No Sidewalk          677 
Obstacle             261 
Missing Curb Ramp    206 
Occlusion            46  
Name: label_name, dtype: int64

### Determine if within 10px ROI

In [31]:
df_margin = pd.DataFrame({'crop_num': df_crops['crop_num'], 
                               'left_margin': np.where(df_crops['crop_num'] == 'A', 0, 10), 
                               'right_margin' : np.where(df_crops['crop_num'] == 'F', 0, 10)})
df_margin.head()

Unnamed: 0,crop_num,left_margin,right_margin
0,A,0,10
1,A,0,10
2,A,0,10
3,A,0,10
4,A,0,10


In [32]:
df_crops['in_10px_roi'] = (df_margin['left_margin'] <= df_crops['xpt_minus_xleft']) & (df_margin['right_margin'] <= df_crops['xright_minus_xpt'])
df_crops['in_10px_roi'].value_counts()

True     4506
False    472 
Name: in_10px_roi, dtype: int64

In [33]:
df_crops.head()

Unnamed: 0,img_id,heading,crop_num,present_ramp,missing_ramp,label_name,sv_image_x,sv_image_y,xpt_minus_xleft,xright_minus_xpt,ypt_minus_ytop,ybottom_minus_ypt,in_10px_roi,jpg_name
0,680,45,A,True,False,Present Curb Ramp,108,389,103,77,69,111,True,680_45_A.jpg
1,680,45,A,True,False,Present Curb Ramp,160,389,155,25,69,111,True,680_45_A.jpg
2,680,135,A,True,False,Present Curb Ramp,18,475,13,167,155,25,True,680_135_A.jpg
3,680,225,A,False,False,Surface Problem,179,411,174,6,91,89,False,680_225_A.jpg
4,1042,45,A,False,False,Obstacle,90,380,85,95,60,120,True,1042_45_A.jpg


In [34]:
df_crops.to_csv('labels_in_10px_roi.csv', index = False)

## Flatten to 1 row per jpg_name

In [46]:
df_crops_roi = df_crops.loc[df_crops['in_10px_roi']]
feature_cols = ['present_ramp', 'missing_ramp']
df_crops_roi_group = df_crops_roi.groupby(['jpg_name', 'img_id', 'heading', 'crop_num'])[feature_cols].sum()
df_crops_roi_group['total_count'] = df_crops_roi_group[feature_cols].sum(axis = 1)
df_crops_roi_group = df_crops_roi_group.reset_index()
df_crops_roi_group.head()

Unnamed: 0,jpg_name,img_id,heading,crop_num,present_ramp,missing_ramp,total_count
0,10007_225_C.jpg,10007,225,C,1.0,0.0,1.0
1,10007_225_D.jpg,10007,225,D,1.0,0.0,1.0
2,10007_315_B.jpg,10007,315,B,1.0,0.0,1.0
3,10007_315_C.jpg,10007,315,C,1.0,0.0,1.0
4,10007_45_E.jpg,10007,45,E,1.0,0.0,1.0


In [47]:
df_crops_roi_group.shape

(3641, 7)

In [48]:
df_crops_roi_group['total_count'].value_counts()

1.0    2474
0.0    949 
2.0    213 
3.0    5   
Name: total_count, dtype: int64

In [49]:
df_crops_roi_group['includes_both'] = (df_crops_roi_group['present_ramp'] > 0) & (df_crops_roi_group['missing_ramp'] > 0)
df_crops_roi_group['includes_both'].value_counts()

False    3600
True     41  
Name: includes_both, dtype: int64

In [50]:
df_crops_roi_group['present_ramp'].value_counts()

1.0    2377
0.0    1088
2.0    173 
3.0    3   
Name: present_ramp, dtype: int64

In [51]:
df_crops_roi_group['missing_ramp'].value_counts()

0.0    3461
1.0    177 
2.0    3   
Name: missing_ramp, dtype: int64

## Apply Labels Logic

In [84]:
true_missing_mask = (df_crops_roi_group['includes_both'] == False) & (df_crops_roi_group['missing_ramp'] > 0)
df_crops_roi_group['ground_truth'] = np.where(df_crops_roi_group['present_ramp'] > 0, 'present', '1_null')
df_crops_roi_group['ground_truth'] = np.where(df_crops_roi_group['present_ramp'] > 1, 'multiple_present', df_crops_roi_group['ground_truth'])
df_crops_roi_group['ground_truth'] = np.where(true_missing_mask, 'only_missing', df_crops_roi_group['ground_truth'])
df_crops_roi_group['ground_truth'].value_counts()

present             2377
1_null              949 
multiple_present    176 
only_missing        139 
Name: ground_truth, dtype: int64

In [85]:
df_crops_roi_group['img_id'].unique().shape

(506,)

In [86]:
df_crops_roi_group.dtypes

jpg_name         object 
img_id           int64  
heading          object 
crop_num         object 
present_ramp     float64
missing_ramp     float64
total_count      float64
includes_both    bool   
ground_truth     object 
dtype: object

In [87]:
df_crops_roi_group.columns

Index(['jpg_name', 'img_id', 'heading', 'crop_num', 'present_ramp',
       'missing_ramp', 'total_count', 'includes_both', 'ground_truth'],
      dtype='object')

### Create Null Columns

In [88]:
img_id_list = list(df_crops_roi_group['img_id'].unique())
heading_list = list(df_crops_roi_group['heading'].unique())
crop_num_list = list(df_crops_roi_group['crop_num'].unique())
df_mesh = pd.DataFrame(np.array(np.meshgrid(img_id_list, heading_list, crop_num_list, )).T.reshape(-1,3))
df_mesh.columns = ['img_id', 'heading', 'crop_num']
df_mesh['jpg_name'] = df_mesh['img_id'].astype(str) + '_' + df_mesh['heading'].astype(str) + '_' + df_mesh['crop_num'].astype(str) + '.jpg'
df_mesh['img_id'] = pd.to_numeric(df_mesh['img_id'], downcast = 'integer')
print(len(img_id_list) * len(heading_list) * len(crop_num_list))
print(df_mesh.shape)
df_mesh.head()

12144
(12144, 4)


Unnamed: 0,img_id,heading,crop_num,jpg_name
0,10007,225,C,10007_225_C.jpg
1,10007,315,C,10007_315_C.jpg
2,10007,45,C,10007_45_C.jpg
3,10007,135,C,10007_135_C.jpg
4,10013,225,C,10013_225_C.jpg


In [89]:
df_mesh.dtypes

img_id      int16 
heading     object
crop_num    object
jpg_name    object
dtype: object

In [90]:
on_cols = list(df_mesh.columns)
df_all_crops = df_mesh.merge(df_crops_roi_group, how = 'left', left_on = on_cols, right_on = on_cols)
# Fill NAs
counts_cols = ['present_ramp', 'missing_ramp', 'total_count']
df_all_crops[counts_cols] = df_all_crops[counts_cols].fillna(0)
df_all_crops['includes_both'] = df_all_crops['includes_both'].fillna(False)
df_all_crops['ground_truth'] = df_all_crops['ground_truth'].fillna('1_null')
df_all_crops.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,1_null
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,1_null
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,1_null


In [91]:
df_all_crops['ground_truth'].value_counts()

1_null              9452
present             2377
multiple_present    176 
only_missing        139 
Name: ground_truth, dtype: int64

## Img_id class split

In [92]:
# Get class list
df_split = pd.read_csv(os.path.join(SPLIT_PATH, 'train-validation-test-imgid-list.csv'))
df_split = df_split[['img_id', 'train/val/test']]
print(df_split.shape)

df_split.head()

(480, 2)


Unnamed: 0,img_id,train/val/test
0,8475,validation
1,8503,test
2,8540,test
3,8650,validation
4,8780,validation


In [93]:
df_split['train/val/test'].value_counts()

train         356
validation    62 
test          62 
Name: train/val/test, dtype: int64

# Merge Ground Truth with train/val/test

In [94]:
df_merge = df_all_crops.merge(df_split, how = 'left', left_on = 'img_id', right_on = 'img_id')
df_merge.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,1_null,train
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,1_null,train
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,1_null,validation


In [95]:
df_merge['ground_truth'].value_counts()

1_null              9452
present             2377
multiple_present    176 
only_missing        139 
Name: ground_truth, dtype: int64

In [96]:
df_merge.groupby(['train/val/test', 'ground_truth'])['jpg_name'].count()

train/val/test  ground_truth    
test            1_null              1151
                multiple_present    21  
                only_missing        29  
                present             287 
train           1_null              6621
                multiple_present    136 
                only_missing        75  
                present             1712
validation      1_null              1152
                multiple_present    14  
                only_missing        25  
                present             297 
Name: jpg_name, dtype: int64

In [97]:
os.getcwd()

'/home/ec2-user/SageMaker/classify-streetview/mini-crops'

In [98]:
df_merge.to_csv('imgid_groundtruth_trainvaltest.csv', index = False)