In [1]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

from datetime import datetime

import matplotlib.pyplot as plt

# Show full cell contents (e.g. long S3 keys) instead of truncating them.
# None is the supported "no limit" value; the older -1 sentinel is deprecated
# since pandas 1.0 and is not accepted by newer releases.
pd.set_option('display.max_colwidth', None)


In [2]:
import sys

# Make the repo's helper module directory importable. The membership check
# keeps the cell idempotent: re-running it does not stack duplicate entries
# on sys.path.
MINI_CROPS_DIR = "/home/ec2-user/SageMaker/classify-streetview/mini-crops"
if MINI_CROPS_DIR not in sys.path:
    sys.path.append(MINI_CROPS_DIR)
import mini_utils

In [6]:
# Root of the repository checkout on the SageMaker instance.
SAGEMAKER_REPO_PATH = '/home/ec2-user/SageMaker/classify-streetview'
# CSV listing the crops together with their ground-truth label and
# train/val/test assignment.
TO_MOVE_PATH = os.path.join(
    SAGEMAKER_REPO_PATH,
    'mini-crops/20200413-groundtruth-images-split.csv',
)
df_all = pd.read_csv(TO_MOVE_PATH)
df_all.head(5)

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10076,135,A,10076_135_A.jpg,0,0,0,False,1_null,train
1,10076,225,A,10076_225_A.jpg,1,0,1,False,present,train
2,10076,315,A,10076_315_A.jpg,0,1,1,False,only_missing,train
3,10076,45,A,10076_45_A.jpg,0,0,0,False,1_null,train
4,10076,135,B,10076_135_B.jpg,0,0,0,False,1_null,train


In [7]:
# Number of crops per (split, ground-truth label) combination.
(
    df_all
    .groupby(['train/val/test', 'ground_truth'])['jpg_name']
    .count()
    .reset_index()
)

Unnamed: 0,train/val/test,ground_truth,jpg_name
0,test,1_null,1311
1,test,multiple_present,21
2,test,only_missing,31
3,test,present,293
4,train,1_null,6961
5,train,multiple_present,133
6,train,only_missing,78
7,train,present,1732
8,validation,1_null,1311
9,validation,multiple_present,14


In [12]:
# Maps each destination folder label to the ground_truth values grouped under it.
# Insertion order is kept as written: missing, null, present.
ground_truth_dict = dict([
    ('0_missing', ['only_missing']),
    ('1_null', ['1_null']),
    ('2_present', ['present', 'multiple_present']),
])

In [13]:
# Keep only the training split, then sample it with the project helper.
# NOTE(review): judging by the recorded output, 2000 acts as a per-label cap;
# confirm against mini_utils.sample_rows.
is_train = df_all['train/val/test'] == 'train'
df_train = df_all.loc[is_train]
df_train_sample = mini_utils.sample_rows(df_train, 2000, ground_truth_dict)
df_train_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_null -> ['1_null']
2_present -> ['present', 'multiple_present']


1_null       2000
2_present    1865
0_missing    78  
Name: folder_label, dtype: int64

In [14]:
# The validation and test splits are pooled and relabeled as one 'valid' split.
holdout_splits = ['validation', 'test']
is_holdout = df_all['train/val/test'].isin(holdout_splits)
df_val = df_all.loc[is_holdout]
df_val_sample = mini_utils.sample_rows(df_val, 3000, ground_truth_dict)
df_val_sample['train/val/test'] = 'valid'
df_val_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_null -> ['1_null']
2_present -> ['present', 'multiple_present']


1_null       2622
2_present    630 
0_missing    60  
Name: folder_label, dtype: int64

In [16]:
# Stack the train and valid samples into the single table of images to transfer.
sampled_frames = [df_train_sample, df_val_sample]
df_transfer = pd.concat(sampled_frames)
print(df_transfer.shape)
df_transfer.head()

(7255, 11)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,folder_label
2,10076,315,A,10076_315_A.jpg,0,1,1,False,only_missing,train,0_missing
14,10076,315,D,10076_315_D.jpg,0,1,1,False,only_missing,train,0_missing
18,10076,315,E,10076_315_E.jpg,0,1,1,False,only_missing,train,0_missing
32,12578,135,C,12578_135_C.jpg,0,1,1,False,only_missing,train,0_missing
34,12578,315,C,12578_315_C.jpg,0,1,1,False,only_missing,train,0_missing


In [17]:
# Row count per split in the combined transfer table.
split_column = df_transfer['train/val/test']
split_column.value_counts()

train    3943
valid    3312
Name: train/val/test, dtype: int64

# Specify the destination path
Each image's destination key is assembled from these columns:
* jpg_name
* train/val/test
* folder_label

In [18]:
# Source key: every crop currently sits directly under 'mini-crops/'.
df_transfer['original_image_path'] = 'mini-crops/' + df_transfer['jpg_name']
# Destination key layout: <prefix><split>/<folder_label>/<jpg_name>.
df_transfer['new_image_path'] = (
    'mini-crops-labeled/project-sidewalk/'
    + df_transfer['train/val/test']
    + '/'
    + df_transfer['folder_label']
    + '/'
    + df_transfer['jpg_name']
)
show_transfer_cols = [
    'jpg_name',
    'train/val/test',
    'folder_label',
    'original_image_path',
    'new_image_path',
]
df_transfer[show_transfer_cols].head()

Unnamed: 0,jpg_name,train/val/test,folder_label,original_image_path,new_image_path
2,10076_315_A.jpg,train,0_missing,mini-crops/10076_315_A.jpg,mini-crops-labeled/project-sidewalk/train/0_missing/10076_315_A.jpg
14,10076_315_D.jpg,train,0_missing,mini-crops/10076_315_D.jpg,mini-crops-labeled/project-sidewalk/train/0_missing/10076_315_D.jpg
18,10076_315_E.jpg,train,0_missing,mini-crops/10076_315_E.jpg,mini-crops-labeled/project-sidewalk/train/0_missing/10076_315_E.jpg
32,12578_135_C.jpg,train,0_missing,mini-crops/12578_135_C.jpg,mini-crops-labeled/project-sidewalk/train/0_missing/12578_135_C.jpg
34,12578_315_C.jpg,train,0_missing,mini-crops/12578_315_C.jpg,mini-crops-labeled/project-sidewalk/train/0_missing/12578_315_C.jpg


In [19]:
# Discard the row labels inherited from df_all and renumber the rows 0..n-1.
df_transfer = df_transfer.reset_index(drop=True)

In [20]:
# Copy each sampled crop from its flat source key to its labeled destination
# key inside the same bucket. This is a copy, not a move: the source objects
# are left in place.
bucket_name = 'gsv-crops2'
s3 = boto3.resource('s3')
s3_client = s3.meta.client  # resolve the low-level client once, outside the loop

start = datetime.now()
print(start)
cannot_move_paths = []  # source keys whose copy failed
path_pairs = zip(
    df_transfer['original_image_path'].astype(str),
    df_transfer['new_image_path'].astype(str),
)
# enumerate() makes the progress counter positional, so the periodic log line
# does not depend on the frame carrying a fresh 0..n-1 index.
for index, (old_name, new_name) in enumerate(path_pairs):
    if index % 500 == 0:
        print(f'Through {index} images at {datetime.now()}')
    copy_source = {'Bucket': bucket_name, 'Key': old_name}
    try:
        s3_client.copy(copy_source, bucket_name, new_name)
    except Exception as err:
        # Best effort: record the failure and keep going. Catching Exception
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit stop
        # the loop, and the reason for the failure is reported, not dropped.
        print(f'Could not move {old_name} at {datetime.now()}: {err!r}')
        cannot_move_paths.append(old_name)
end = datetime.now()
print(end)

2020-04-13 21:28:54.913241
Through 0 images at 2020-04-13 21:28:54.918193
Through 500 images at 2020-04-13 21:29:36.162900
Through 1000 images at 2020-04-13 21:30:18.102969
Through 1500 images at 2020-04-13 21:30:59.791358
Through 2000 images at 2020-04-13 21:31:40.739078
Through 2500 images at 2020-04-13 21:32:22.923696
Through 3000 images at 2020-04-13 21:33:04.670081
Through 3500 images at 2020-04-13 21:33:45.308697
Through 4000 images at 2020-04-13 21:34:27.741382
Through 4500 images at 2020-04-13 21:35:09.820997
Through 5000 images at 2020-04-13 21:35:50.967847
Through 5500 images at 2020-04-13 21:36:32.217871
Through 6000 images at 2020-04-13 21:37:14.870121
Through 6500 images at 2020-04-13 21:37:56.492761
Through 7000 images at 2020-04-13 21:38:38.536993
2020-04-13 21:38:59.984937


In [21]:
# Keep only the rows whose source key is not in the failure list, save them,
# and summarize the counts per split and folder label.
was_copied = ~df_transfer['original_image_path'].isin(cannot_move_paths)
df_can_move = df_transfer.loc[was_copied]
df_can_move.to_csv('projectsidewalk-crops-successful-move.csv', index=False)
(
    df_can_move
    .groupby(['train/val/test', 'folder_label'])['jpg_name']
    .count()
    .reset_index()
)

Unnamed: 0,train/val/test,folder_label,jpg_name
0,train,0_missing,78
1,train,1_null,2000
2,train,2_present,1865
3,valid,0_missing,60
4,valid,1_null,2622
5,valid,2_present,630
