# Create list of Crops for Train to distribute into folders

In [1]:
import pandas as pd
import numpy as np
import os
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

#import matplotlib.pyplot as plt
import itertools # To create all combos of imgid, heading, crops

# Never truncate cell contents when displaying DataFrames.
# `None` is the "no limit" value; the old `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

# Create the Training list from Crop Labels Train Test

`crop_labels_train_test.csv` is the output file of the notebook `2020-03-27-CropDistributionList.ipynb`.

In [4]:
# Save to CSV
# NOTE(review): `df` is not created by any cell visible above this one, so this
# cell raises NameError on a fresh kernel (Restart & Run All). Presumably `df`
# came from the source notebook or a deleted cell — TODO: load it explicitly
# before this point and confirm this write should not overwrite the input file.
df.to_csv('crop_labels_train_test.csv', index = False)
# Show how many label rows belong to test panos (True) vs. training panos (False).
df['is_test_pano'].value_counts()

False    3692
True     1562
Name: is_test_pano, dtype: int64

## Get Train List Only 
* 356 img_ids x 4 heading images x 7 crops = 9968 images


In [6]:
# Keep only the label rows whose pano is not flagged as a test pano.
# Each img_id has 4 headings x 7 crops = 28 possible crops; the summary below
# describes how many label rows each training img_id actually has.
df_train = df[~df['is_test_pano']]
df_train['img_id'].value_counts().describe()

count    356.000000
mean     10.370787 
std      4.527555  
min      1.000000  
25%      7.000000  
50%      10.000000 
75%      13.000000 
max      23.000000 
Name: img_id, dtype: float64

In [7]:
# Peek at the first five training label rows.
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,img_id,heading,crop_number,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano
27,27,510,135,3,1,0,0,0,0,0,1,2xZABXrvlRIsTW_lb-P-Mw,False
28,28,510,135,6,1,0,0,0,0,0,1,2xZABXrvlRIsTW_lb-P-Mw,False
29,29,510,135,7,1,0,0,0,0,0,1,2xZABXrvlRIsTW_lb-P-Mw,False
30,30,510,225,1,0,0,0,0,0,1,1,2xZABXrvlRIsTW_lb-P-Mw,False
31,31,510,225,2,0,0,0,0,0,1,1,2xZABXrvlRIsTW_lb-P-Mw,False


In [8]:
# Build the full grid of (img_id, heading, crop_number) combinations for every
# training img_id, then left-join the label rows onto it so that combinations
# without a label row come back as NaN.
on_cols = ['img_id', 'heading', 'crop_number']

# Despite its name, `imgid_test` holds the img_ids of the *training* rows.
imgid_test = list(df_train['img_id'].unique())
headings = [45, 135, 225, 315]
crops = [1, 2, 3, 4, 5, 6, 7]
columns_vals_list = [imgid_test, headings, crops]

data = list(itertools.product(imgid_test, headings, crops))
df_train_all = pd.DataFrame(data, columns=on_cols)
print(df_train_all.shape)

# The join keys have the same names on both sides, so a single `on=` suffices.
df_train_all = pd.merge(df_train_all, df_train, how='left', on=on_cols)
df_train_all.head(15)

(9968, 3)


Unnamed: 0.1,img_id,heading,crop_number,Unnamed: 0,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano
0,510,45,1,,,,,,,,,,
1,510,45,2,,,,,,,,,,
2,510,45,3,,,,,,,,,,
3,510,45,4,,,,,,,,,,
4,510,45,5,,,,,,,,,,
5,510,45,6,,,,,,,,,,
6,510,45,7,35.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2xZABXrvlRIsTW_lb-P-Mw,False
7,510,135,1,,,,,,,,,,
8,510,135,2,,,,,,,,,,
9,510,135,3,27.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2xZABXrvlRIsTW_lb-P-Mw,False


In [11]:
# Sanity check on the left merge: grid rows with no matching label row have
# NaN in `is_test_pano` (expected 6276), while rows that matched a training
# label row carry False (expected 3692).
df_train_all['is_test_pano'].value_counts(dropna=False)

NaN      6276
False    3692
Name: is_test_pano, dtype: int64

In [13]:
# Mark grid rows that matched no label row (NaN `is_test_pano` after the merge)
# with 1 in the `1_null` column; every other row gets 0.
df_train_all['1_null'] = df_train_all['is_test_pano'].isna().astype(int)
df_train_all.head(10)

Unnamed: 0.1,img_id,heading,crop_number,Unnamed: 0,3_present,4_surface_prob,2_obstacle,0_missing,6_occlusion,5_nosidewalk,count_all,pano_id,is_test_pano,1_null
0,510,45,1,,,,,,,,,,,1
1,510,45,2,,,,,,,,,,,1
2,510,45,3,,,,,,,,,,,1
3,510,45,4,,,,,,,,,,,1
4,510,45,5,,,,,,,,,,,1
5,510,45,6,,,,,,,,,,,1
6,510,45,7,35.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2xZABXrvlRIsTW_lb-P-Mw,False,0
7,510,135,1,,,,,,,,,,,1
8,510,135,2,,,,,,,,,,,1
9,510,135,3,27.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2xZABXrvlRIsTW_lb-P-Mw,False,0


In [15]:
# Replace the NaNs left by the merge in the label columns with 0 and store them
# as integers, then keep only the id and label columns (this drops `pano_id`,
# `is_test_pano` and any other leftover columns).
id_cols = ['img_id', 'heading', 'crop_number']
feature_cols = [
    '0_missing',
    '1_null',
    '2_obstacle',
    '3_present',
    '4_surface_prob',
    '5_nosidewalk',
    '6_occlusion',
    'count_all',
]

df_train_all[feature_cols] = df_train_all[feature_cols].fillna(0).astype(int)
df_train_all = df_train_all[id_cols + feature_cols]
df_train_all.head()

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_obstacle,3_present,4_surface_prob,5_nosidewalk,6_occlusion,count_all
0,510,45,1,0,1,0,0,0,0,0,0
1,510,45,2,0,1,0,0,0,0,0,0
2,510,45,3,0,1,0,0,0,0,0,0
3,510,45,4,0,1,0,0,0,0,0,0
4,510,45,5,0,1,0,0,0,0,0,0


In [20]:
# Assign exactly one ground-truth class per crop. When a crop carries several
# labels, the first matching entry of `label_priority` wins, i.e.
# 2_obstacle > 0_missing > 4_surface_prob > 3_present. Every remaining crop
# (including crops whose only positive labels are 5_nosidewalk or 6_occlusion)
# falls back to '1_null'.
label_priority = ['2_obstacle', '0_missing', '4_surface_prob', '3_present']
df_train_all['ground_truth'] = np.select(
    [df_train_all[label] > 0 for label in label_priority],
    label_priority,
    default='1_null',
)
df_train_all['ground_truth'].value_counts(dropna=False)

1_null            6739
3_present         2507
4_surface_prob    412 
2_obstacle        176 
0_missing         134 
Name: ground_truth, dtype: int64

In [17]:
# Peek at the first five rows, now including the `ground_truth` column.
df_train_all.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_obstacle,3_present,4_surface_prob,5_nosidewalk,6_occlusion,count_all,ground_truth
0,510,45,1,0,1,0,0,0,0,0,0,1_null
1,510,45,2,0,1,0,0,0,0,0,0,1_null
2,510,45,3,0,1,0,0,0,0,0,0,1_null
3,510,45,4,0,1,0,0,0,0,0,0,1_null
4,510,45,5,0,1,0,0,0,0,0,0,1_null


In [19]:
# Write the full training grid (every null crop included) without the index column.
df_train_all.to_csv('train_crops_allnulls.csv', index=False)

# Keep only 2500 null training crops

In [23]:
# Pick the null-class crops that will be dropped so that only N_NULL_KEEP of
# them remain in the training list.
N_NULL_KEEP = 2500      # number of '1_null' crops to keep
NULL_SAMPLE_SEED = 42   # fixed seed so the same crops are dropped on every run

# Exact match on the class label (a regex substring search is not needed here).
df_all_null = df_train_all.loc[df_train_all['ground_truth'] == '1_null']
print(df_all_null.shape)

# Derive the number of crops to drop from the data instead of hard-coding it;
# clamp at 0 in case there are fewer null crops than N_NULL_KEEP.
n_null_drop = max(len(df_all_null) - N_NULL_KEEP, 0)
df_null_drop = df_all_null.sample(n=n_null_drop, random_state=NULL_SAMPLE_SEED)
df_null_drop.shape

(6739, 12)


(4239, 12)

In [27]:
# Remove the sampled null crops (matched by index label) from the full grid
# and show the resulting class balance.
df_train_keep = df_train_all.drop(df_null_drop.index, axis='index')
df_train_keep['ground_truth'].value_counts()

3_present         2507
1_null            2500
4_surface_prob    412 
2_obstacle        176 
0_missing         134 
Name: ground_truth, dtype: int64

In [28]:
# Peek at the first five rows of the reduced training list.
df_train_keep.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_obstacle,3_present,4_surface_prob,5_nosidewalk,6_occlusion,count_all,ground_truth
3,510,45,4,0,1,0,0,0,0,0,0,1_null
6,510,45,7,0,0,0,1,0,0,0,1,3_present
9,510,135,3,0,0,0,1,0,0,0,1,3_present
12,510,135,6,0,0,0,1,0,0,0,1,3_present
13,510,135,7,0,0,0,1,0,0,0,1,3_present


In [29]:
# Write the final training crop list (null crops downsampled) without the index column.
df_train_keep.to_csv('train_crops.csv', index=False)