# Split into Train and Test Sets

For Obstacle and Surface Problem classes, do 50/50

For Curb Ramp Present, aim for 80/20

In [1]:
import pandas as pd
import os
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

import matplotlib.pyplot as plt

pd.set_option('max_colwidth', -1)

# Load in Labels with Metadata

In [2]:
df = pd.read_csv('single_labels_with_metadata.csv')
print(df.shape)
df.head()

(2851, 22)


Unnamed: 0,filename,file_size,region_count,region_id,img_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,...,sv_image_y_bottom_origin,label_name,date,lat,long,pano_id,name,pano_yaw_deg,tilt_yaw_deg,tilt_pitch_deg
0,680_45.jpg,49558,3,0,680,True,False,False,False,False,...,251,Present Curb Ramp,2017-08,43.036286,-87.907481,a5Q6wg2P8gqUtZemvQJU9Q,680,239.89,-160.06,1.75
1,680_45.jpg,49558,3,1,680,True,False,False,False,False,...,251,Present Curb Ramp,2017-08,43.036286,-87.907481,a5Q6wg2P8gqUtZemvQJU9Q,680,239.89,-160.06,1.75
2,680_45.jpg,49558,3,2,680,True,False,False,False,False,...,187,Present Curb Ramp,2017-08,43.036286,-87.907481,a5Q6wg2P8gqUtZemvQJU9Q,680,239.89,-160.06,1.75
3,680_135.jpg,51194,6,0,680,True,False,False,False,False,...,165,Present Curb Ramp,2017-08,43.036286,-87.907481,a5Q6wg2P8gqUtZemvQJU9Q,680,239.89,-160.06,1.75
4,680_135.jpg,51194,6,1,680,True,False,False,False,False,...,227,Present Curb Ramp,2017-08,43.036286,-87.907481,a5Q6wg2P8gqUtZemvQJU9Q,680,239.89,-160.06,1.75


In [3]:
df['label_name'].value_counts()

Present Curb Ramp    1677
Surface Problem      448 
No Sidewalk          430 
Obstacle             147 
Missing Curb Ramp    124 
Occlusion            25  
Name: label_name, dtype: int64

## Work to split on Obstructions

In [4]:
df_ob = df.loc[df['label_name'].str.contains('Obstacle')]
df_ob.shape

(147, 22)

In [7]:
# Get number of panos
obstruction_img_ids = list(df_ob['img_id'].unique())
len(obstruction_img_ids)

79

## Split on Surface Problem

In [31]:
df_sp = df.loc[df['label_name'].str.contains('Surface')]
df_sp.shape

(448, 22)

In [32]:
# Get number of panos
surface_prob_img_ids = list(df_sp['img_id'].unique())
len(surface_prob_img_ids)

162

## Split on Surface Problem

In [34]:
df_miss = df.loc[df['label_name'].str.contains('Missing')]
df_miss.shape

(124, 22)

In [35]:
# Get number of pano
missing_img_ids = list(df_miss['img_id'].unique())
len(missing_img_ids)

79

## Try to get a groupby img_id

In [51]:
bool_cols = ['present_ramp', 'missing_ramp', 'obstacle', 'surface_prob', 'no_sidewalk']
count_cols = ['labels_count', 'count_present_ramp', 'count_missing_ramp', 'count_obstacle', 'count_surface_prob', 'count_no_sidewalk']
df_img_id = df.groupby(['img_id', 'pano_id'])[bool_cols].any()
df_img_id.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk
img_id,pano_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False
317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False
352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True
510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True
583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True


In [27]:
df_img_id.shape

(506, 5)

In [28]:
df_img_id['labels_count'] = df.groupby(['img_id', 'pano_id'])['region_count'].count()
df_img_id.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count
img_id,pano_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False,4
317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False,3
352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11
510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4
583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9


In [29]:
df_img_id_counts = df.groupby(['img_id', 'pano_id'])[bool_cols].sum()
df_img_id_counts = df_img_id_counts.add_prefix('count_')
df_img_id_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
img_id,pano_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
141,pj93lAkGQiCKjuic_i9-9w,1.0,0.0,0.0,3.0,0.0
317,oOdRYFX_A61zMNPV_8x94A,0.0,0.0,0.0,3.0,0.0
352,gmMcgskONK4C-kMOGpQfAw,0.0,0.0,0.0,0.0,11.0
510,2xZABXrvlRIsTW_lb-P-Mw,2.0,0.0,0.0,0.0,2.0
583,7Np-jziLBGYvL0jxef8n6Q,0.0,0.0,0.0,0.0,9.0


In [30]:
df_img = pd.merge(df_img_id, df_img_id_counts, left_index = True, right_index = True)
df_img.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
img_id,pano_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False,4,1.0,0.0,0.0,3.0,0.0
317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False,3,0.0,0.0,0.0,3.0,0.0
352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0.0,0.0,0.0,0.0,11.0
510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2.0,0.0,0.0,0.0,2.0
583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0.0,0.0,0.0,0.0,9.0


In [66]:
df_img = df_img.reset_index()

In [68]:
df_img.head()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
0,141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False,4,1.0,0.0,0.0,3.0,0.0
1,317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False,3,0.0,0.0,0.0,3.0,0.0
2,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0.0,0.0,0.0,0.0,11.0
3,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2.0,0.0,0.0,0.0,2.0
4,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0.0,0.0,0.0,0.0,9.0


# Attempt and Evaluate some Splits!

In [38]:
rare_mask = (df_img['missing_ramp'] | df_img['obstacle'] | df_img['surface_prob'])
df_img_rare = df_img.loc[rare_mask]
print(df_img_rare.shape)
df_img_rare.head()

(247, 11)


Unnamed: 0_level_0,Unnamed: 1_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
img_id,pano_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False,4,1.0,0.0,0.0,3.0,0.0
317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False,3,0.0,0.0,0.0,3.0,0.0
680,a5Q6wg2P8gqUtZemvQJU9Q,True,False,True,True,False,14,8.0,0.0,5.0,1.0,0.0
693,kUoO8nV-CPgiPvaxrn2hHQ,False,False,True,False,True,5,0.0,0.0,3.0,0.0,2.0
877,B5f1qLJdicDxK9tUibMCWQ,True,True,False,False,False,2,1.0,1.0,0.0,0.0,0.0


In [72]:
df_img_rare = df_img_rare.reset_index()

In [76]:
# Save all rare images 
# This does not include any split information 
df_img_rare.to_csv('data-split/all_rare_panos.csv', index = False)

In [44]:
df_img_rare[bool_cols].sum()

present_ramp    234
missing_ramp    79 
obstacle        79 
surface_prob    162
no_sidewalk     20 
dtype: int64

In [52]:
df_img_rare[count_cols].sum()

labels_count          1569.0
count_present_ramp    774.0 
count_missing_ramp    124.0 
count_obstacle        147.0 
count_surface_prob    448.0 
count_no_sidewalk     68.0  
dtype: float64

In [56]:
# Work on splitting with a random number generator 
# Try 40% 
df_img_rare_test = df_img_rare.sample(frac = 0.4)
print(df_img_rare_test.shape)
df_img_rare_test[bool_cols].sum()

(99, 11)


present_ramp    92
missing_ramp    39
obstacle        35
surface_prob    65
no_sidewalk     12
dtype: int64

In [57]:
df_img_rare_test[count_cols].sum()

labels_count          659.0
count_present_ramp    296.0
count_missing_ramp    60.0 
count_obstacle        70.0 
count_surface_prob    186.0
count_no_sidewalk     45.0 
dtype: float64

In [69]:
df_img_rare_test = df_img_rare_test.reset_index()
df_img_rare_test.head()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
0,1721,TTrcr2vyvXG-tU6nDXQYMA,True,True,False,False,False,6,4.0,2.0,0.0,0.0,0.0
1,7691,vCVdPFFfWqsgcykvOBsJyw,True,True,False,False,True,8,2.0,2.0,0.0,0.0,4.0
2,18256,KyhVdaL-L_100pvd4l9kWQ,True,True,False,True,False,7,5.0,1.0,0.0,1.0,0.0
3,15377,cadhUhY1pwCypZWMAjW_Nw,True,False,False,True,False,4,3.0,0.0,0.0,1.0,0.0
4,16005,m1LYfD-1coIJoVEWHzp5Dw,True,True,False,True,False,7,4.0,2.0,0.0,1.0,0.0


In [62]:
#df_img_rare_test.to_csv('data-split/rare_test_set_60-40.csv')

In [78]:
# Try to add train_test bool
test_rare_pano_id = list(df_img_rare_test['pano_id'])
is_test_pano = df_img_rare['pano_id'].isin(test_rare_pano_id)
is_test_pano.head()

0    True 
1    True 
2    False
3    True 
4    True 
Name: pano_id, dtype: bool

In [79]:
is_test_pano.value_counts()

False    148
True     99 
Name: pano_id, dtype: int64

In [81]:
df_img_rare['is_test_pano'] = is_test_pano

In [85]:
df_img_rare_train = df_img_rare.loc[~df_img_rare['is_test_pano']]
print(df_img_rare_train.shape)
print(df_img_rare_train[count_cols].sum())
print('\n')
print(df_img_rare_train[bool_cols].sum())

(148, 14)
labels_count          910.0
count_present_ramp    478.0
count_missing_ramp    64.0 
count_obstacle        77.0 
count_surface_prob    262.0
count_no_sidewalk     23.0 
dtype: float64


present_ramp    142
missing_ramp    40 
obstacle        44 
surface_prob    97 
no_sidewalk     8  
dtype: int64


In [90]:
# Save rare
df_img_rare_train.to_csv('data-split/rare_train_only_6040.csv', index = False)

In [86]:
df_img_rare.head()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano
0,141,pj93lAkGQiCKjuic_i9-9w,True,False,False,True,False,4,1.0,0.0,0.0,3.0,0.0,True
1,317,oOdRYFX_A61zMNPV_8x94A,False,False,False,True,False,3,0.0,0.0,0.0,3.0,0.0,True
2,680,a5Q6wg2P8gqUtZemvQJU9Q,True,False,True,True,False,14,8.0,0.0,5.0,1.0,0.0,False
3,693,kUoO8nV-CPgiPvaxrn2hHQ,False,False,True,False,True,5,0.0,0.0,3.0,0.0,2.0,True
4,877,B5f1qLJdicDxK9tUibMCWQ,True,True,False,False,False,2,1.0,1.0,0.0,0.0,0.0,True


In [91]:
# Save rare
df_img_rare.to_csv('data-split/all_rare_with_is_test_4060.csv', index = False)

In [87]:
df_img_rare['is_test_pano'].value_counts()

False    148
True     99 
Name: is_test_pano, dtype: int64

In [125]:
df_img_rare['includes_rare'] = True

# Select from Common
Aim for 51 out of 259 from the common group, so about 20% 

In [97]:
type(df_img)
df_img.shape

(506, 13)

In [98]:
type(rare_mask)
rare_mask.shape

(506,)

In [100]:
df_img_common = df_img.loc[~rare_mask.values]
print(df_img_common.shape)

print(df_img_common[bool_cols].sum())
print('\n')
print(df_img_common[count_cols].sum())

df_img_common.head()

(259, 13)
present_ramp    230
missing_ramp    0  
obstacle        0  
surface_prob    0  
no_sidewalk     43 
dtype: int64


labels_count          1282.0
count_present_ramp    903.0 
count_missing_ramp    0.0   
count_obstacle        0.0   
count_surface_prob    0.0   
count_no_sidewalk     362.0 
dtype: float64


Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
2,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0.0,0.0,0.0,0.0,11.0
3,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2.0,0.0,0.0,0.0,2.0
4,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0.0,0.0,0.0,0.0,9.0
8,878,Rh2vW4whyJGdD6M9v47NcQ,True,False,False,False,True,13,2.0,0.0,0.0,0.0,11.0
9,944,WeDhagO9OpPej7YAOjAs_g,False,False,False,False,True,17,0.0,0.0,0.0,0.0,17.0


In [105]:
print(df_img_common[bool_cols].sum() * 0.2)
print('\n')
print(df_img_common[count_cols].sum() * 0.2)

present_ramp    46.0
missing_ramp    0.0 
obstacle        0.0 
surface_prob    0.0 
no_sidewalk     8.6 
dtype: float64


labels_count          256.4
count_present_ramp    180.6
count_missing_ramp    0.0  
count_obstacle        0.0  
count_surface_prob    0.0  
count_no_sidewalk     72.4 
dtype: float64


In [114]:
df_img_common_test = df_img_common.sample(n = 51)
print(df_img_common_test.shape)
print(df_img_common_test[bool_cols].sum())
print('\n')
print(df_img_common_test[count_cols].sum())

(51, 13)
present_ramp    44
missing_ramp    0 
obstacle        0 
surface_prob    0 
no_sidewalk     9 
dtype: int64


labels_count          253.0
count_present_ramp    171.0
count_missing_ramp    0.0  
count_obstacle        0.0  
count_surface_prob    0.0  
count_no_sidewalk     80.0 
dtype: float64


In [115]:
# Save the test image
df_img_common_test.head()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
467,18414,xHiTpiYUwVRdDGyOuFPZ5Q,False,False,False,False,True,3,0.0,0.0,0.0,0.0,3.0
142,6744,cRae615i9n7uKChYMx-9QA,False,False,False,False,True,16,0.0,0.0,0.0,0.0,16.0
49,3515,iPzFCH_Z60Y5yPPO07ivBw,True,False,False,False,False,1,1.0,0.0,0.0,0.0,0.0
332,14541,8Pk7xmRWYHqLawgpoiNQfA,True,False,False,False,False,8,8.0,0.0,0.0,0.0,0.0
425,17850,2tN_gV0xMKIxbIpIvUQapw,True,False,False,False,False,4,4.0,0.0,0.0,0.0,0.0


In [119]:
df_img_common_test.to_csv(os.path.join('data-split', 'common_label_test_8020.csv'), index = False)

In [121]:
test_common_pano_id = list(df_img_common_test['pano_id'])
is_common_test_pano = df_img_common['pano_id'].isin(test_common_pano_id)
is_common_test_pano.head()

2    True 
3    False
4    True 
8    False
9    True 
Name: pano_id, dtype: bool

In [122]:
is_common_test_pano.value_counts()

False    208
True     51 
Name: pano_id, dtype: int64

In [126]:
df_img_common['is_test_pano'] = is_common_test_pano
df_img_common['includes_rare'] = False
df_img_common.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
2,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0.0,0.0,0.0,0.0,11.0,True,False
3,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2.0,0.0,0.0,0.0,2.0,False,False
4,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0.0,0.0,0.0,0.0,9.0,True,False
8,878,Rh2vW4whyJGdD6M9v47NcQ,True,False,False,False,True,13,2.0,0.0,0.0,0.0,11.0,False,False
9,944,WeDhagO9OpPej7YAOjAs_g,False,False,False,False,True,17,0.0,0.0,0.0,0.0,17.0,True,False


In [127]:
df_img_common.to_csv(os.path.join('data-split', 'all_common_images_8020.csv'), index = False)

In [128]:
df_img_common_train = df_img_common.loc[~df_img_common['is_test_pano']]
df_img_common_train.to_csv(os.path.join('data-split', 'common_images_train_8020.csv'), index = False)

# Combine into final dataset

In [129]:
df_all = pd.concat([df_img_common, df_img_rare])
print(df_all.shape)
print(df_all['is_test_pano'].value_counts())

(506, 15)
False    356
True     150
Name: is_test_pano, dtype: int64


In [130]:
df_all.head(10)

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
2,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0.0,0.0,0.0,0.0,11.0,True,False
3,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2.0,0.0,0.0,0.0,2.0,False,False
4,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0.0,0.0,0.0,0.0,9.0,True,False
8,878,Rh2vW4whyJGdD6M9v47NcQ,True,False,False,False,True,13,2.0,0.0,0.0,0.0,11.0,False,False
9,944,WeDhagO9OpPej7YAOjAs_g,False,False,False,False,True,17,0.0,0.0,0.0,0.0,17.0,True,False
10,945,gjngo3EbUlFSO3opOqC8zQ,False,False,False,False,True,8,0.0,0.0,0.0,0.0,8.0,False,False
16,1525,0A5yRWWg4DIg3kZaLepjsw,True,False,False,False,False,2,2.0,0.0,0.0,0.0,0.0,False,False
17,1532,ZmeEkQGHC8kxylBXVIazrQ,True,False,False,False,True,3,2.0,0.0,0.0,0.0,1.0,False,False
21,1747,mMqZ40q_R_Cka8S7iBjgTg,True,False,False,False,False,4,4.0,0.0,0.0,0.0,0.0,False,False
23,1895,tHQSrR98q8Grd-hTylNXrQ,False,False,False,False,True,1,0.0,0.0,0.0,0.0,1.0,False,False


In [131]:
df_all.tail()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
242,19034,mgQj-J_ddkIGAPDrIJP3_Q,True,False,True,False,False,7,5.0,0.0,2.0,0.0,0.0,False,True
243,19035,D5_Wkc16cqB4qOLKeV-YEw,True,True,False,False,False,6,4.0,2.0,0.0,0.0,0.0,True,True
244,19051,txd34VKSRyaTYESiQE3ixw,True,False,True,False,False,8,6.0,0.0,1.0,0.0,0.0,False,True
245,19078,PCMJyHSDc6RhyQlZzKDxTw,True,False,False,True,False,7,4.0,0.0,0.0,3.0,0.0,True,True
246,19102,Wc5RNm5tV84jCjAPHBvrHQ,True,True,False,False,False,6,4.0,2.0,0.0,0.0,0.0,True,True


In [133]:
# Cast count columns to integers
for col in count_cols:
    df_all[col] = pd.to_numeric(df_all[col], downcast = 'integer')
df_all.head()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
2,352,gmMcgskONK4C-kMOGpQfAw,False,False,False,False,True,11,0,0,0,0,11,True,False
3,510,2xZABXrvlRIsTW_lb-P-Mw,True,False,False,False,True,4,2,0,0,0,2,False,False
4,583,7Np-jziLBGYvL0jxef8n6Q,False,False,False,False,True,9,0,0,0,0,9,True,False
8,878,Rh2vW4whyJGdD6M9v47NcQ,True,False,False,False,True,13,2,0,0,0,11,False,False
9,944,WeDhagO9OpPej7YAOjAs_g,False,False,False,False,True,17,0,0,0,0,17,True,False


In [134]:
df_all.tail()

Unnamed: 0,img_id,pano_id,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,is_test_pano,includes_rare
242,19034,mgQj-J_ddkIGAPDrIJP3_Q,True,False,True,False,False,7,5,0,2,0,0,False,True
243,19035,D5_Wkc16cqB4qOLKeV-YEw,True,True,False,False,False,6,4,2,0,0,0,True,True
244,19051,txd34VKSRyaTYESiQE3ixw,True,False,True,False,False,8,6,0,1,0,0,False,True
245,19078,PCMJyHSDc6RhyQlZzKDxTw,True,False,False,True,False,7,4,0,0,3,0,True,True
246,19102,Wc5RNm5tV84jCjAPHBvrHQ,True,True,False,False,False,6,4,2,0,0,0,True,True


In [136]:
print(df_all[bool_cols].sum())
print('\n')
print(df_all[count_cols].sum())

present_ramp    464
missing_ramp    79 
obstacle        79 
surface_prob    162
no_sidewalk     63 
dtype: int64


labels_count          2851
count_present_ramp    1677
count_missing_ramp    124 
count_obstacle        147 
count_surface_prob    448 
count_no_sidewalk     430 
dtype: int64


In [135]:
df_all.to_csv(os.path.join('data-split', 'final_combined_splits.csv'), index = False)

# Generate a Summary Table

In [140]:
total_pano_count = df_all[bool_cols].sum()
total_label_count = df_all[count_cols].sum()
test_pano_count = df_all[bool_cols].loc[df_all['is_test_pano']].sum()
test_label_count = df_all[count_cols].loc[df_all['is_test_pano']].sum()
train_pano_count = df_all[bool_cols].loc[~df_all['is_test_pano']].sum()
train_label_count = df_all[count_cols].loc[~df_all['is_test_pano']].sum()
print(total_pano_count)
print(total_label_count)
print(train_pano_count)
print(train_label_count)
print(test_pano_count)
print(test_label_count)

present_ramp    464
missing_ramp    79 
obstacle        79 
surface_prob    162
no_sidewalk     63 
dtype: int64
labels_count          2851
count_present_ramp    1677
count_missing_ramp    124 
count_obstacle        147 
count_surface_prob    448 
count_no_sidewalk     430 
dtype: int64
present_ramp    328
missing_ramp    40 
obstacle        44 
surface_prob    97 
no_sidewalk     42 
dtype: int64
labels_count          1939
count_present_ramp    1210
count_missing_ramp    64  
count_obstacle        77  
count_surface_prob    262 
count_no_sidewalk     305 
dtype: int64
present_ramp    136
missing_ramp    39 
obstacle        35 
surface_prob    65 
no_sidewalk     21 
dtype: int64
labels_count          912
count_present_ramp    467
count_missing_ramp    60 
count_obstacle        70 
count_surface_prob    186
count_no_sidewalk     125
dtype: int64


In [142]:

df_all_groupby = df_all.groupby(['is_test_pano'])[[*bool_cols, *count_cols]].sum()
df_all_groupby

Unnamed: 0_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
is_test_pano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,328.0,40.0,44.0,97.0,42.0,1939.0,1210.0,64.0,77.0,262.0,305.0
True,136.0,39.0,35.0,65.0,21.0,912.0,467.0,60.0,70.0,186.0,125.0


In [159]:
combined_cols = [*bool_cols, *count_cols]

In [145]:
total_sum = df_all_groupby.sum()
total_sum

present_ramp          464.0 
missing_ramp          79.0  
obstacle              79.0  
surface_prob          162.0 
no_sidewalk           63.0  
labels_count          2851.0
count_present_ramp    1677.0
count_missing_ramp    124.0 
count_obstacle        147.0 
count_surface_prob    448.0 
count_no_sidewalk     430.0 
dtype: float64

In [149]:
df_all_groupby = df_all_groupby.append(total_sum, ignore_index = True)

In [150]:
df_all_groupby['grouping'] = pd.Series(['Train', 'Test', 'Combined'])
df_all_groupby.head()

Unnamed: 0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk,grouping
0,328.0,40.0,44.0,97.0,42.0,1939.0,1210.0,64.0,77.0,262.0,305.0,Train
1,136.0,39.0,35.0,65.0,21.0,912.0,467.0,60.0,70.0,186.0,125.0,Test
2,464.0,79.0,79.0,162.0,63.0,2851.0,1677.0,124.0,147.0,448.0,430.0,Combined


In [151]:
df_all_groupby_index = df_all_groupby.set_index('grouping')
df_all_groupby_index.head()

Unnamed: 0_level_0,present_ramp,missing_ramp,obstacle,surface_prob,no_sidewalk,labels_count,count_present_ramp,count_missing_ramp,count_obstacle,count_surface_prob,count_no_sidewalk
grouping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Train,328.0,40.0,44.0,97.0,42.0,1939.0,1210.0,64.0,77.0,262.0,305.0
Test,136.0,39.0,35.0,65.0,21.0,912.0,467.0,60.0,70.0,186.0,125.0
Combined,464.0,79.0,79.0,162.0,63.0,2851.0,1677.0,124.0,147.0,448.0,430.0


In [173]:
df_all_groupby_transpose = df_all_groupby_index.transpose()
#df_all_groupby_transpose = pd.to_numeric(df_all_groupby_transpose, downcast = 'integer')
df_all_groupby_transpose.to_csv(os.path.join('data-split', 'summary_split_transpose.csv'), float_format = '%.0f')
df_all_groupby_transpose

grouping,Train,Test,Combined
present_ramp,328.0,136.0,464.0
missing_ramp,40.0,39.0,79.0
obstacle,44.0,35.0,79.0
surface_prob,97.0,65.0,162.0
no_sidewalk,42.0,21.0,63.0
labels_count,1939.0,912.0,2851.0
count_present_ramp,1210.0,467.0,1677.0
count_missing_ramp,64.0,60.0,124.0
count_obstacle,77.0,70.0,147.0
count_surface_prob,262.0,186.0,448.0


In [163]:
df_all_groupby_unstack = df_all_groupby_index.unstack()
df_all_groupby_unstack = pd.to_numeric(df_all_groupby_unstack, downcast = 'integer')
df_all_groupby_unstack

                    grouping
present_ramp        Train       328 
                    Test        136 
                    Combined    464 
missing_ramp        Train       40  
                    Test        39  
                    Combined    79  
obstacle            Train       44  
                    Test        35  
                    Combined    79  
surface_prob        Train       97  
                    Test        65  
                    Combined    162 
no_sidewalk         Train       42  
                    Test        21  
                    Combined    63  
labels_count        Train       1939
                    Test        912 
                    Combined    2851
count_present_ramp  Train       1210
                    Test        467 
                    Combined    1677
count_missing_ramp  Train       64  
                    Test        60  
                    Combined    124 
count_obstacle      Train       77  
                    Test        70  
         

In [166]:
df_all_groupby_unstack.to_csv(os.path.join('data-split', 'summary_split_tall.csv'), header = False)

In [158]:
df_all_groupby_unstack.pivot(index = None, columns = 'grouping')
#(index='foo', columns='bar', values='baz')

AttributeError: 'Series' object has no attribute 'pivot'

In [155]:
df_all_groupby_pivot.index

MultiIndex(levels=[['present_ramp', 'missing_ramp', 'obstacle', 'surface_prob', 'no_sidewalk', 'labels_count', 'count_present_ramp', 'count_missing_ramp', 'count_obstacle', 'count_surface_prob', 'count_no_sidewalk'], ['Train', 'Test', 'Combined']],
           codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]],
           names=[None, 'grouping'])