# Move Files 

In [45]:
import numpy as np
import pandas as pd
import os
from datetime import datetime

import shutil
import random

In [14]:
# Show full column contents (the long folder paths) instead of truncating them.
# `None` is the supported "no limit" value; the old `-1` sentinel was deprecated
# in pandas 1.0 and is rejected by newer releases. Older pandas versions that do
# not accept `None` raise ValueError, in which case we fall back to `-1`.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Create list of current files

In [2]:
# Root of the repository checkout on the SageMaker notebook instance.
SAGEMAKER_REPO_PATH = '/home/ec2-user/SageMaker/classify-streetview'
# The original images are read from images/<split>/<class> below this root.
ORIGINAL_IMAGE_PATH = os.path.join(SAGEMAKER_REPO_PATH, 'images')
ORIGINAL_TRAIN_PATH = os.path.join(SAGEMAKER_REPO_PATH, 'images', 'train')
# Display the entries found in the train split folder.
os.listdir(path=ORIGINAL_TRAIN_PATH)

['3_present', '2_obstacle', '0_missing', '1_null', '4_surface_prob']

In [16]:
# Dataset splits, in the order their folders are scanned.
subset_list = 'train valid test'.split()
# Obstacle and surface-problem classes are included in case those images need to be moved.
class_list = [
    '3_present',
    '0_missing',
    '1_null',
    '2_obstacle',
    '4_surface_prob',
]

In [22]:
# Accumulator for one DataFrame per (split, class) folder.
original_df_list = list()

In [23]:
# Get all existing jpgs with their detailed info.
# The accumulator is reset here so that re-running this cell rebuilds the list
# instead of appending a second copy of every folder (which would duplicate
# every row in the concatenated frame).
# NOTE: os.listdir returns every directory entry, not only images, so stray
# entries such as ".ipynb_checkpoints" also become rows in the result.
original_df_list = []
for split in subset_list:
    for class_name in class_list:
        # Images are laid out as <ORIGINAL_IMAGE_PATH>/<split>/<class_name>/<file>.
        full_folder_path = os.path.join(ORIGINAL_IMAGE_PATH, split, class_name)
        jpg_names = os.listdir(full_folder_path)
        # The scalar values are broadcast to every file name in this folder.
        df_part = pd.DataFrame({
            'jpg_name': jpg_names,
            'original_folder_path': full_folder_path,
            'original_group': split,
            'original_label': class_name,
        })
        original_df_list.append(df_part)

In [24]:
# Stack the per-folder frames into one full list of all files.
df_original = pd.concat(objs=original_df_list, axis=0)
print(df_original.shape)
df_original.head(n=5)

(7455, 4)


Unnamed: 0,jpg_name,original_folder_path,original_group,original_label
0,18810_135_5.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/3_present,train,3_present
1,6343_315_3.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/3_present,train,3_present
2,18998_135_6.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/3_present,train,3_present
3,10039_45_6.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/3_present,train,3_present
4,4282_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/3_present,train,3_present


In [43]:
# Persist the inventory of original images (row index is not written).
df_original.to_csv(path_or_buf='March-SmartCrop-ImageList.csv', index=False)

In [31]:
# Number of original files per class folder.
df_original.loc[:, 'original_label'].value_counts()

3_present         3175
1_null            3130
4_surface_prob    651 
2_obstacle        283 
0_missing         216 
Name: original_label, dtype: int64

## Get the New ROI Image Details

In [25]:
# Load the label table of each split and tag every row with the split it belongs to.
df_train = pd.read_csv('train_labels.csv').assign(new_group='train')
df_val = pd.read_csv('validation_labels.csv').assign(new_group='valid')
df_test = pd.read_csv('test_labels.csv').assign(new_group='test')

In [26]:
# Combine the three tagged split tables into a single ROI label table.
df_new_roi = pd.concat(objs=(df_train, df_val, df_test))
print(df_new_roi.shape)
df_new_roi.head(n=5)

(13440, 9)


Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group
0,510,45,1,0,1,0,1,1_null,train
1,680,45,1,1,0,0,1,0_missing,train
2,878,45,1,0,1,0,1,1_null,train
3,945,45,1,0,1,0,1,1_null,train
4,1023,45,1,0,1,0,1,1_null,train


In [28]:
# File names follow the pattern <img_id>_<heading>_<crop_number>.jpg.
name_parts = df_new_roi[['img_id', 'heading', 'crop_number']].astype(str)
df_new_roi['jpg_name'] = name_parts['img_id'].str.cat(
    [name_parts['heading'], name_parts['crop_number']], sep='_'
) + '.jpg'
df_new_roi.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group,jpg_name
0,510,45,1,0,1,0,1,1_null,train,510_45_1.jpg
1,680,45,1,1,0,0,1,0_missing,train,680_45_1.jpg
2,878,45,1,0,1,0,1,1_null,train,878_45_1.jpg
3,945,45,1,0,1,0,1,1_null,train,945_45_1.jpg
4,1023,45,1,0,1,0,1,1_null,train,1023_45_1.jpg


# Combine ROI Images with Original Image details

In [29]:
# Outer join on the file name keeps ROI rows without an original file
# as well as original files without an ROI row.
df_combine = pd.merge(df_new_roi, df_original, how='outer', on='jpg_name')
print(df_combine.shape)
df_combine.head(n=5)

(13441, 13)


Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group,jpg_name,original_folder_path,original_group,original_label
0,510.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,510_45_1.jpg,,,
1,680.0,45.0,1.0,1.0,0.0,0.0,1.0,0_missing,train,680_45_1.jpg,,,
2,878.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,878_45_1.jpg,,,
3,945.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,945_45_1.jpg,,,
4,1023.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1023_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/2_obstacle,train,2_obstacle


In [36]:
# Rows per crop number; NaN counts rows that came only from the original file list.
df_combine.loc[:, 'crop_number'].value_counts(dropna=False)

7.0    1920
6.0    1920
5.0    1920
4.0    1920
3.0    1920
2.0    1920
1.0    1920
NaN    1   
Name: crop_number, dtype: int64

In [37]:
# Inspect the rows that have no ROI label information.
df_combine[df_combine['crop_number'].isnull()]

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group,jpg_name,original_folder_path,original_group,original_label
13440,,,,,,,,,,.ipynb_checkpoints,/home/ec2-user/SageMaker/classify-streetview/images/train/0_missing,train,0_missing


In [30]:
# Most frequent original folders; NaN counts ROI rows with no matching original file.
df_combine.loc[:, 'original_folder_path'].value_counts(dropna=False).head(n=5)

NaN                                                                         5986
/home/ec2-user/SageMaker/classify-streetview/images/train/3_present         2457
/home/ec2-user/SageMaker/classify-streetview/images/train/1_null            2410
/home/ec2-user/SageMaker/classify-streetview/images/train/4_surface_prob    401 
/home/ec2-user/SageMaker/classify-streetview/images/test/1_null             363 
Name: original_folder_path, dtype: int64

In [32]:
# Cross-tabulate the new ground-truth label against the original folder label.
label_pairs = df_combine.groupby(by=['ground_truth', 'original_label'])
df_group_label = label_pairs['jpg_name'].count()
df_group_label

ground_truth  original_label
0_missing     0_missing         15  
              1_null            24  
              2_obstacle        7   
              3_present         113 
              4_surface_prob    3   
1_null        0_missing         59  
              1_null            2948
              2_obstacle        187 
              3_present         996 
              4_surface_prob    430 
2_present     0_missing         141 
              1_null            158 
              2_obstacle        89  
              3_present         2066
              4_surface_prob    218 
Name: jpg_name, dtype: int64

In [33]:
# Summary of how often each file name occurs (max of 1 means names are unique).
df_combine.loc[:, 'jpg_name'].value_counts().describe()

count    13441.0
mean     1.0    
std      0.0    
min      1.0    
25%      1.0    
50%      1.0    
75%      1.0    
max      1.0    
Name: jpg_name, dtype: float64

# Observations
* There's exactly 1 row per jpg_name
* There's a row with ipynb_checkpoints, which is fine
* There are some lost images (mainly null) 
* The grouping by label shows how images move from their original folder label to the new "ground_truth" label

# Create the list of files before and after locations

In [39]:
# Keep only rows with no missing value in any column, as an independent copy.
df_move = df_combine.dropna(axis=0, how='any').copy()
df_move.shape

(7454, 13)

In [42]:
# Images to move, per new ground-truth label.
df_move.loc[:, 'ground_truth'].value_counts()

1_null       4620
2_present    2672
0_missing    162 
Name: ground_truth, dtype: int64

In [56]:
# Images to move, per new split.
df_move.loc[:, 'new_group'].value_counts()

train    5554
test     958 
valid    942 
Name: new_group, dtype: int64

In [40]:
# Preview the first rows of the move list.
df_move.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group,jpg_name,original_folder_path,original_group,original_label
4,1023.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1023_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/2_obstacle,train,2_obstacle
10,1636.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1636_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null
11,1663.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1663_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null
12,1747.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1747_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null
13,1894.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1894_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null


In [50]:
# Destination layout: <repo>/roi-images/<new_group>/<ground_truth>
roi_root = SAGEMAKER_REPO_PATH + '/roi-images/'
df_move['new_folder_path'] = roi_root + df_move['new_group'].str.cat(df_move['ground_truth'], sep='/')
df_move.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,new_group,jpg_name,original_folder_path,original_group,original_label,new_folder_path
4,1023.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1023_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/2_obstacle,train,2_obstacle,/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null
10,1636.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1636_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null,/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null
11,1663.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1663_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null,/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null
12,1747.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1747_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null,/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null
13,1894.0,45.0,1.0,0.0,1.0,0.0,1.0,1_null,train,1894_45_1.jpg,/home/ec2-user/SageMaker/classify-streetview/images/train/1_null,train,1_null,/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null


In [51]:
# Persist the before/after locations of every image (row index is not written).
df_move.to_csv(path_or_buf='roi-images-sagemaker-paths.csv', index=False)

# Actually Copy the Images

In [54]:
# Make sure every destination folder exists; print only the ones created now.
unique_new_folders = df_move['new_folder_path'].unique().tolist()
print(len(unique_new_folders))
for new_folder in unique_new_folders:
    if os.path.exists(new_folder):
        continue
    os.makedirs(new_folder)
    print(new_folder)

9
/home/ec2-user/SageMaker/classify-streetview/roi-images/train/1_null
/home/ec2-user/SageMaker/classify-streetview/roi-images/train/2_present
/home/ec2-user/SageMaker/classify-streetview/roi-images/train/0_missing
/home/ec2-user/SageMaker/classify-streetview/roi-images/valid/1_null
/home/ec2-user/SageMaker/classify-streetview/roi-images/valid/0_missing
/home/ec2-user/SageMaker/classify-streetview/roi-images/valid/2_present
/home/ec2-user/SageMaker/classify-streetview/roi-images/test/1_null
/home/ec2-user/SageMaker/classify-streetview/roi-images/test/2_present
/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing


In [55]:
# Copy every image from its original folder to its new ROI folder.
# Copying is best-effort: a file that cannot be copied is reported and skipped.
# Only OSError (which covers missing files, permission problems and
# shutil.SameFileError) is caught, so KeyboardInterrupt and programming errors
# are no longer silently swallowed, and the reason is printed with the name.
for index, row in df_move.iterrows():
    original = os.path.join(row['original_folder_path'], row['jpg_name'])
    target = os.path.join(row['new_folder_path'], row['jpg_name'])
    try:
        shutil.copyfile(original, target)
    except OSError as err:
        print(f"could not copy: {row['jpg_name']} ({err})")

# Make an alphabetical list of the test images

In [58]:
# Select the moved images that belong to the test split.
# NOTE: this rebinds `df_test`, which earlier held the raw test label table.
is_test_row = df_move['new_group'].eq('test')
df_test = df_move.loc[is_test_row]
print(df_test.shape)
df_test.columns

(958, 14)


Index(['img_id', 'heading', 'crop_number', '0_missing', '1_null', '2_present',
       'count_all', 'ground_truth', 'new_group', 'jpg_name',
       'original_folder_path', 'original_group', 'original_label',
       'new_folder_path'],
      dtype='object')

In [60]:
# Columns retained for the test image listing.
keep_cols = [
    'img_id', 'heading', 'crop_number',
    '0_missing', '1_null', '2_present', 'count_all',
    'ground_truth', 'jpg_name', 'new_folder_path',
]
# Order alphabetically by destination folder, then by file name.
df_test_keep = (
    df_test.loc[:, keep_cols]
    .copy()
    .sort_values(by=['new_folder_path', 'jpg_name'])
)
df_test_keep.head(n=5)

Unnamed: 0,img_id,heading,crop_number,0_missing,1_null,2_present,count_all,ground_truth,jpg_name,new_folder_path
12742,10228.0,45.0,5.0,1.0,0.0,1.0,2.0,0_missing,10228_45_5.jpg,/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing
13118,12270.0,225.0,6.0,1.0,0.0,0.0,1.0,0_missing,12270_225_6.jpg,/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing
13088,14541.0,225.0,6.0,1.0,0.0,1.0,2.0,0_missing,14541_225_6.jpg,/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing
13399,16078.0,315.0,7.0,1.0,0.0,1.0,2.0,0_missing,16078_315_7.jpg,/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing
12037,16180.0,135.0,2.0,1.0,0.0,0.0,1.0,0_missing,16180_135_2.jpg,/home/ec2-user/SageMaker/classify-streetview/roi-images/test/0_missing


In [61]:
# Persist the sorted test image listing (row index is not written).
df_test_keep.to_csv(path_or_buf='test_roi_image_locations_sorted.csv', index=False)