# Create list of Crops for Train/Test to distribute into folders

In [1]:
import pandas as pd
import numpy as np
import os
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

#import matplotlib.pyplot as plt
import itertools # To create all combos of imgid, heading, crops

# Show full cell contents when displaying DataFrames (no column-width truncation).
# None is the supported "unlimited" value; the old -1 sentinel has been deprecated
# since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Read the per-crop label counts table (one row per img_id / heading / crop_number)
LABEL_CROP_LIST_PATH = 'img_heading_crop_labelcounts.csv'
df_labels = pd.read_csv(filepath_or_buffer=LABEL_CROP_LIST_PATH)
print(df_labels.shape)
df_labels.head()

(5254, 10)


Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all
0,141,135,2,0,1,0,0,0,0,1
1,141,135,3,1,1,0,0,0,0,2
2,141,135,5,0,1,0,0,0,0,1
3,141,135,6,1,1,0,0,0,0,2
4,141,135,7,1,0,0,0,0,0,1


In [3]:
# Read the table that holds the train/test assignment (is_test_pano) per img_id
SAGEMAKER_PATH = r'/home/ec2-user/SageMaker'
SPLIT_PATH = os.path.join(
    SAGEMAKER_PATH,
    'classify-streetview/split-train-test/data-split/final_combined_splits.csv',
)
df_split_raw = pd.read_csv(filepath_or_buffer=SPLIT_PATH)
print(df_split_raw.shape)
df_split_raw.head()

(506, 15)


Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
0,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0,0,0,0,11,True,False
1,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2,0,0,0,2,False,False
2,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0,0,0,0,9,True,False
3,878,Rh2vW4whyJGdD6M9v47NcQ,True,False,False,False,True,13,2,0,0,0,11,False,False
4,944,WeDhagO9OpPej7YAOjAs_g,False,False,False,False,True,17,0,0,0,0,17,True,False


In [4]:
# Keep only the identifiers and the train/test flag from the split table
keep_split_cols = ['img_id', 'pano_id', 'is_test_pano']
df_split = df_split_raw.loc[:, keep_split_cols].copy()
df_split.head()

Unnamed: 0,img_id,pano_id,is_test_pano
0,352,gmMcgskONK4C-kMOGpQfAw,True
1,510,2xZABXrvlRIsTW_lb-P-Mw,False
2,583,7Np-jziLBGYvL0jxef8n6Q,True
3,878,Rh2vW4whyJGdD6M9v47NcQ,False
4,944,WeDhagO9OpPej7YAOjAs_g,True


## Merge Test/Train with the Crop label info

In [5]:
# Attach the train/test assignment (pano_id, is_test_pano) to every labeled crop.
# validate='many_to_one' makes pandas raise a MergeError if an img_id appears more
# than once in df_split, which would otherwise silently duplicate crop rows.
df = df_labels.merge(df_split, how='left', on='img_id', validate='many_to_one')
print(df.shape)
df.head()

(5254, 12)


Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano
0,141,135,2,0,1,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True
1,141,135,3,1,1,0,0,0,0,2,pj93lAkGQiCKjuic_i9-9w,True
2,141,135,5,0,1,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True
3,141,135,6,1,1,0,0,0,0,2,pj93lAkGQiCKjuic_i9-9w,True
4,141,135,7,1,0,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True


In [6]:
# Labeled crops per split; dropna=False also surfaces crops with no split assignment
df.loc[:, 'is_test_pano'].value_counts(dropna=False)

False    3692
True     1562
Name: is_test_pano, dtype: int64

In [7]:
# Persist the labeled crops together with their train/test assignment
df.to_csv(path_or_buf='crop_labels_train_test.csv', index=False)

In [10]:
# Number of distinct img_ids among the labeled crops; compare with the row count
# of the split table to confirm every img_id appears at least once
pd.unique(df['img_id']).shape

(506,)

## Get Test list Only
* 150 test img_ids x 4 headings x 7 crops per heading = 4200 crops
* 1562 crops contain at least 1 label
* 2638 crops have no labels (null crops)

In [14]:
# Keep only the crops that belong to test panos, then summarise how many labeled
# crops each test img_id has (out of 4 headings x 7 crops = 28 possible crops)
df_test = df[df['is_test_pano']]
df_test['img_id'].value_counts().describe()

count    150.000000
mean     10.413333 
std      4.308859  
min      1.000000  
25%      7.000000  
50%      10.000000 
75%      14.000000 
max      22.000000 
Name: img_id, dtype: float64

In [23]:
# Preview the labeled test crops
df_test.head(n=5)

Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano
0,141,135,2,0,1,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True
1,141,135,3,1,1,0,0,0,0,2,pj93lAkGQiCKjuic_i9-9w,True
2,141,135,5,0,1,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True
3,141,135,6,1,1,0,0,0,0,2,pj93lAkGQiCKjuic_i9-9w,True
4,141,135,7,1,0,0,0,0,0,1,pj93lAkGQiCKjuic_i9-9w,True


In [32]:
# Build the full grid of test crops: every test img_id x heading x crop_number,
# then attach the label counts for the crops that have at least one label.
imgid_test = list(df_test['img_id'].unique())
headings = [45, 135, 225, 315]
crops = [1, 2, 3, 4, 5, 6, 7]

columns_vals_list = [imgid_test, headings, crops]
on_cols = ['img_id', 'heading', 'crop_number']

data = list(itertools.product(*columns_vals_list))
df_test_all = pd.DataFrame(data, columns=on_cols)
print(df_test_all.shape)

# Left merge keeps every grid row; crops without labels get NaN in the label columns.
# validate='one_to_one' raises a MergeError if df_test holds duplicate
# (img_id, heading, crop_number) rows, which would otherwise inflate the grid.
df_test_all = df_test_all.merge(df_test, how='left', on=on_cols, validate='one_to_one')
df_test_all.head(15)

(4200, 3)


Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano
0,141,45,1,,,,,,,,,
1,141,45,2,,,,,,,,,
2,141,45,3,,,,,,,,,
3,141,45,4,,,,,,,,,
4,141,45,5,,,,,,,,,
5,141,45,6,,,,,,,,,
6,141,45,7,,,,,,,,,
7,141,135,1,,,,,,,,,
8,141,135,2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,pj93lAkGQiCKjuic_i9-9w,True
9,141,135,3,1.0,1.0,0.0,0.0,0.0,0.0,2.0,pj93lAkGQiCKjuic_i9-9w,True


In [33]:
# Sanity check: NaN rows are grid crops with no labels - expect 2638 of them
df_test_all.loc[:, 'is_test_pano'].value_counts(dropna=False)

NaN     2638
True    1562
Name: is_test_pano, dtype: int64

In [34]:
# Flag crops without any label: is_test_pano is NaN only for grid rows that found
# no match among the labeled crops, so those rows get 1_null = 1 and the rest 0
df_test_all['1_null'] = df_test_all['is_test_pano'].isna().astype(int)
df_test_all.head(10)

Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano,1_null
0,141,45,1,,,,,,,,,,1
1,141,45,2,,,,,,,,,,1
2,141,45,3,,,,,,,,,,1
3,141,45,4,,,,,,,,,,1
4,141,45,5,,,,,,,,,,1
5,141,45,6,,,,,,,,,,1
6,141,45,7,,,,,,,,,,1
7,141,135,1,,,,,,,,,,1
8,141,135,2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,pj93lAkGQiCKjuic_i9-9w,True,0
9,141,135,3,1.0,1.0,0.0,0.0,0.0,0.0,2.0,pj93lAkGQiCKjuic_i9-9w,True,0


In [39]:
# Unlabeled crops carry NaN in every label-count column: replace with 0 and cast
# the counts back to integers, then keep only the id and feature columns
# (pano_id and is_test_pano are dropped)

feature_cols = ['0_missing', '1_null', '2_obstacle', '3_present', '4_surface_prob','5_nosidewalk', '6_occlusion', 'count_all']
id_cols = ['img_id', 'heading', 'crop_number']
df_test_all[feature_cols] = df_test_all[feature_cols].fillna(0).astype(int)
df_test_all = df_test_all.loc[:, id_cols + feature_cols]
df_test_all.head()

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_obstacle,3_present,4_surface_prob,5_nosidewalk,6_occlusion,count_all
0,141,45,1,0,1,0,0,0,0,0,0
1,141,45,2,0,1,0,0,0,0,0,0
2,141,45,3,0,1,0,0,0,0,0,0
3,141,45,4,0,1,0,0,0,0,0,0
4,141,45,5,0,1,0,0,0,0,0,0


In [40]:
# Distribution of the total label count per test crop (0 for unlabeled crops)
df_test_all.loc[:, 'count_all'].describe()

count    4200.000000
mean     0.522381   
std      0.808077   
min      0.000000   
25%      0.000000   
50%      0.000000   
75%      1.000000   
max      6.000000   
Name: count_all, dtype: float64

In [41]:
# Persist the complete list of test crops (labeled and unlabeled)
df_test_all.to_csv(path_or_buf='test_crops.csv', index=False)

## Investigate missing images

In [44]:
# Test img_ids with the most labeled crops
df_test['img_id'].value_counts().head(n=5)

6744    22
4684    20
6746    20
944     19
2501    19
Name: img_id, dtype: int64

In [43]:
# List the test img_ids below 1000
df_triple = df_test[df_test['img_id'] < 1000]
pd.unique(df_triple['img_id'])

array([141, 317, 352, 583, 693, 877, 944])