# Determine which files we've excluded

We have 480 panos/intersections that we're working with 

We labeled 559 panos/intersections

some may have been removed as "highways" 

Which images are fully null that we're missing?

In [1]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 
import mini_utils
from datetime import datetime

import matplotlib.pyplot as plt


pd.set_option('max_colwidth', -1)

# Get All Labeled Filenames

In [4]:
df_filenames = pd.read_csv('labeled-filenames.csv')
print(df_filenames.shape)
df_filenames.head()

(2233, 3)


Unnamed: 0,filenames,img_id,heading
0,10076_135.jpg,10076.0,135.0
1,10076_225.jpg,10076.0,225.0
2,10076_315.jpg,10076.0,315.0
3,10076_45.jpg,10076.0,45.0
4,1042_135.jpg,1042.0,135.0


# Get List of Files in S3

In [3]:
fs = s3fs.S3FileSystem()


s3_image_bucket = 's3://streetview-w210'
sample_images_dir = os.path.join(s3_image_bucket, 'gsv')

# See what is in the folder
s3_images_list = fs.ls(sample_images_dir)
# Includes json info
len(s3_images_list)

91457

In [10]:
s3_image_filenames = [os.path.basename(filename) for filename in s3_images_list if 'jpg' in filename]
s3_image_filenames[0:10]

['10000_135.jpg',
 '10000_225.jpg',
 '10000_315.jpg',
 '10000_45.jpg',
 '10001_135.jpg',
 '10001_225.jpg',
 '10001_315.jpg',
 '10001_45.jpg',
 '10002_135.jpg',
 '10002_225.jpg']

In [14]:
labeled_filenames = set(df_filenames['filenames'])
print(len(labeled_filenames))
new_set = labeled_filenames & set(s3_image_filenames)
len(new_set)

2233


2036

In [23]:
df_file_filter = df_filenames.loc[df_filenames['filenames'].isin(list(new_set))]
print(df_file_filter.shape)
df_file_filter.head()

(2036, 3)


Unnamed: 0,filenames,img_id,heading
0,10076_135.jpg,10076.0,135.0
1,10076_225.jpg,10076.0,225.0
2,10076_315.jpg,10076.0,315.0
3,10076_45.jpg,10076.0,45.0
12,12578_135.jpg,12578.0,135.0


In [28]:
img_id_list = list(df_file_filter['img_id'].unique())
heading_list = list(df_file_filter['heading'].unique())
crop_num_list = ['A', 'B', 'C', 'D', 'E', 'F']
df_mesh = pd.DataFrame(np.array(np.meshgrid(img_id_list, heading_list, crop_num_list, )).T.reshape(-1,3))
df_mesh.columns = ['img_id', 'heading', 'crop_num']
df_mesh['img_id'] = pd.to_numeric(df_mesh['img_id'], downcast = 'integer')
df_mesh['heading'] = pd.to_numeric(df_mesh['heading'], downcast = 'integer')
df_mesh['jpg_name'] = df_mesh['img_id'].astype(str) + '_' + df_mesh['heading'].astype(str) + '_' + df_mesh['crop_num'].astype(str) + '.jpg'
print(df_mesh.shape)
df_mesh.head()

(12216, 4)


Unnamed: 0,img_id,heading,crop_num,jpg_name
0,10076,135,A,10076_135_A.jpg
1,10076,225,A,10076_225_A.jpg
2,10076,315,A,10076_315_A.jpg
3,10076,45,A,10076_45_A.jpg
4,12578,135,A,12578_135_A.jpg


# Load in Dataset of Labels in Images

In [68]:
df_crops = pd.read_csv('imgid_groundtruth_trainvaltest.csv')
print(df_crops.shape)
df_crops['imgid_heading'] = df_crops['img_id'].astype(str) + '_' + df_crops['heading'].astype(str)
df_crops.head()

(12144, 10)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,imgid_heading
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train,10007_225
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train,10007_315
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,1_null,train,10007_45
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,1_null,train,10007_135
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,1_null,validation,10013_225


In [69]:
df_crops['img_id'].unique().shape

(506,)

In [70]:
df_crops['train/val/test'].value_counts(dropna = False)

train         8544
validation    1488
test          1488
NaN           624 
Name: train/val/test, dtype: int64

In [71]:
df_crop_nulls = df_crops.loc[df_crops['train/val/test'].isna()]
no_split = df_crop_nulls['img_id'].unique()
print(len(no_split))
for img_id in no_split:
    print(img_id)

26
141
15377
15775
16005
317
352
5171
5238
5356
5601
5623
5698
5832
583
5893
6052
6099
6349
6353
6461
6524
6653
6713
6744
877
944


# Get Full List

In [72]:
merge_cols = ['jpg_name', 'img_id', 'heading', 'crop_num']
df_combine = df_mesh.merge(df_crops, how = 'left', left_on = merge_cols, right_on = merge_cols)
print(df_combine.shape)
df_combine.head()

(12216, 11)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,imgid_heading
0,10076,135,A,10076_135_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_135
1,10076,225,A,10076_225_A.jpg,1.0,0.0,1.0,False,present,train,10076_225
2,10076,315,A,10076_315_A.jpg,0.0,1.0,1.0,False,only_missing,train,10076_315
3,10076,45,A,10076_45_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_45
4,12578,135,A,12578_135_A.jpg,0.0,0.0,0.0,False,1_null,train,12578_135


In [73]:
df_combine['ground_truth'].value_counts(dropna = False)

1_null              9007
present             2327
NaN                 576 
multiple_present    168 
only_missing        138 
Name: ground_truth, dtype: int64

In [74]:
df_combine['ground_truth'].value_counts().sum()

11640

In [75]:
df_combine['train/val/test'].value_counts(dropna = False)

train         8256
validation    1488
test          1488
NaN           984 
Name: train/val/test, dtype: int64

In [76]:
df_combine['train/val/test'].value_counts().sum()

11232

# Fill in Split details for fully Null Images

In [77]:
df_nulls = df_combine.loc[df_combine['ground_truth'].isna()]
no_split = df_nulls['img_id'].unique()
print(len(no_split))
for img_id in no_split:
    print(img_id)

24
14902
17014
17073
11245
11317
11354
14748
14836
18846
92
13564
13699
16628
16695
16759
17366
19251
19256
5293
18042
18324
4719
5606
8447


In [82]:
df_tvt_null = pd.read_csv('only_null_imgidheading_split.csv', sep = '\t')
print(df_tvt_null.shape)
df_tvt_null.head()

(41, 2)


Unnamed: 0,img_id,fillnull_train/val/test
0,14902,validation
1,17014,test
2,17073,train
3,11245,train
4,11317,train


In [83]:
df_combine_filling = df_combine.merge(df_tvt_null, how = 'outer', left_on = 'img_id', right_on = 'img_id')
df_combine_filling.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,imgid_heading,fillnull_train/val/test
0,10076,135,A,10076_135_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_135,
1,10076,225,A,10076_225_A.jpg,1.0,0.0,1.0,False,present,train,10076_225,
2,10076,315,A,10076_315_A.jpg,0.0,1.0,1.0,False,only_missing,train,10076_315,
3,10076,45,A,10076_45_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_45,
4,10076,135,B,10076_135_B.jpg,0.0,0.0,0.0,False,1_null,train,10076_135,


In [84]:
df_combine_filling['fillnull_train/val/test'].value_counts(dropna = False)

NaN           11232
train         648  
validation    168  
test          168  
Name: fillnull_train/val/test, dtype: int64

In [85]:
df_combine_filling.shape

(12216, 12)

In [86]:
df_combine_filling['train/val/test'] = df_combine_filling['train/val/test'].fillna(df_combine_filling['fillnull_train/val/test'])
df_combine_filling['present_ramp'] = df_combine_filling['present_ramp'].fillna(0)
df_combine_filling['missing_ramp'] = df_combine_filling['missing_ramp'].fillna(0)
df_combine_filling['total_count'] = df_combine_filling['total_count'].fillna(0)
df_combine_filling['includes_both'] = df_combine_filling['includes_both'].fillna(False)
df_combine_filling['ground_truth'] = df_combine_filling['ground_truth'].fillna('1_null')
df_combine_filling['train/val/test'].value_counts(dropna = False)

train         8904
validation    1656
test          1656
Name: train/val/test, dtype: int64

In [87]:
df_combine_filling.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,imgid_heading,fillnull_train/val/test
0,10076,135,A,10076_135_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_135,
1,10076,225,A,10076_225_A.jpg,1.0,0.0,1.0,False,present,train,10076_225,
2,10076,315,A,10076_315_A.jpg,0.0,1.0,1.0,False,only_missing,train,10076_315,
3,10076,45,A,10076_45_A.jpg,0.0,0.0,0.0,False,1_null,train,10076_45,
4,10076,135,B,10076_135_B.jpg,0.0,0.0,0.0,False,1_null,train,10076_135,


In [88]:
df_combine_filling['ground_truth'].value_counts()

1_null              9583
present             2327
multiple_present    168 
only_missing        138 
Name: ground_truth, dtype: int64

In [89]:
final_columns = ['img_id', 'heading', 'crop_num', 'jpg_name', 'present_ramp', 'missing_ramp', 'total_count', 'includes_both', 'ground_truth', 'train/val/test']
df_final = df_combine_filling[final_columns].dropna(subset = ['train/val/test']).copy()
df_final['present_ramp'] = pd.to_numeric(df_final['present_ramp'], downcast = 'integer')
df_final['missing_ramp'] = pd.to_numeric(df_final['missing_ramp'], downcast = 'integer')
df_final['total_count'] = pd.to_numeric(df_final['total_count'], downcast = 'integer')

print(df_final.shape)
df_final.head()

(12216, 10)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10076,135,A,10076_135_A.jpg,0,0,0,False,1_null,train
1,10076,225,A,10076_225_A.jpg,1,0,1,False,present,train
2,10076,315,A,10076_315_A.jpg,0,1,1,False,only_missing,train
3,10076,45,A,10076_45_A.jpg,0,0,0,False,1_null,train
4,10076,135,B,10076_135_B.jpg,0,0,0,False,1_null,train


In [90]:
df_final.groupby(['train/val/test', 'ground_truth'])['jpg_name'].count()

train/val/test  ground_truth    
test            1_null              1311
                multiple_present    21  
                only_missing        31  
                present             293 
train           1_null              6961
                multiple_present    133 
                only_missing        78  
                present             1732
validation      1_null              1311
                multiple_present    14  
                only_missing        29  
                present             302 
Name: jpg_name, dtype: int64

In [91]:
df_final['img_id'].unique().shape

(509,)

In [92]:
df_final.to_csv('20200413-groundtruth-images-split.csv', index = False)