In [1]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 

from datetime import datetime

import matplotlib.pyplot as plt

# Show full cell contents instead of truncating long strings (the S3 key
# columns built later in this notebook are long).
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)


In [2]:
import sys

# Put the repo's mini-crops folder on the import path so the local helper
# module `mini_utils` can be imported; the append must run before the import.
# NOTE(review): absolute path is specific to this SageMaker notebook instance.
sys.path.append("/home/ec2-user/SageMaker/classify-streetview/mini-crops")
import mini_utils

In [3]:
# Root of the repository checkout on the notebook instance.
SAGEMAKER_REPO_PATH = r'/home/ec2-user/SageMaker/classify-streetview'

# CSV listing every crop with its ground-truth label and train/val/test split.
split_csv_relpath = 'mini-crops/20200413-groundtruth-images-split.csv'
TO_MOVE_PATH = os.path.join(SAGEMAKER_REPO_PATH, split_csv_relpath)

df_all = pd.read_csv(TO_MOVE_PATH)
df_all.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10076,135,A,10076_135_A.jpg,0,0,0,False,1_null,train
1,10076,225,A,10076_225_A.jpg,1,0,1,False,present,train
2,10076,315,A,10076_315_A.jpg,0,1,1,False,only_missing,train
3,10076,45,A,10076_45_A.jpg,0,0,0,False,1_null,train
4,10076,135,B,10076_135_B.jpg,0,0,0,False,1_null,train


In [4]:
# Number of crops per (split, ground-truth category); count() only counts
# rows whose jpg_name is not missing.
split_label_groups = df_all.groupby(['train/val/test', 'ground_truth'])
split_label_groups['jpg_name'].count().reset_index()

Unnamed: 0,train/val/test,ground_truth,jpg_name
0,test,1_null,1311
1,test,multiple_present,21
2,test,only_missing,31
3,test,present,293
4,train,1_null,6961
5,train,multiple_present,133
6,train,only_missing,78
7,train,present,1732
8,validation,1_null,1311
9,validation,multiple_present,14


In [5]:
# Folder label -> list of ground-truth categories grouped under that label.
# Every ramp-related category is folded into '0_crossing'; '1_null' stays as is.
ground_truth_dict = {
    '0_crossing': ['only_missing', 'present', 'multiple_present'],
    '1_null': ['1_null'],
}

In [8]:
# Keep only the rows assigned to the training split.
df_train = df_all.loc[df_all['train/val/test'].eq('train')]

# NOTE(review): 3987 looks like a per-label cap on sampled rows -- confirm
# against mini_utils.sample_rows. The returned frame carries 'folder_label'.
df_train_sample = mini_utils.sample_rows(df_train, 3987, ground_truth_dict)

# Rows per folder label in the training sample.
df_train_sample['folder_label'].value_counts()

0_crossing -> ['only_missing', 'present', 'multiple_present']
1_null -> ['1_null']


1_null        3987
0_crossing    1943
Name: folder_label, dtype: int64

In [9]:
# Pool the 'validation' and 'test' rows into a single hold-out set.
holdout_splits = ['validation', 'test']
df_val = df_all.loc[df_all['train/val/test'].isin(holdout_splits)]

df_val_sample = mini_utils.sample_rows(df_val, 3000, ground_truth_dict)

# Both source splits land in one destination folder, so relabel them 'valid'.
df_val_sample['train/val/test'] = 'valid'

# Rows per folder label in the hold-out sample.
df_val_sample['folder_label'].value_counts()

0_crossing -> ['only_missing', 'present', 'multiple_present']
1_null -> ['1_null']


1_null        2622
0_crossing    690 
Name: folder_label, dtype: int64

In [10]:
# Stack the training and hold-out samples into the full list of crops to
# transfer; each frame's existing row labels are kept as-is here.
frames_to_transfer = [df_train_sample, df_val_sample]
df_transfer = pd.concat(frames_to_transfer)
print(df_transfer.shape)
df_transfer.head()

(9242, 11)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,folder_label
1,10076,225,A,10076_225_A.jpg,1,0,1,False,present,train,0_crossing
2,10076,315,A,10076_315_A.jpg,0,1,1,False,only_missing,train,0_crossing
5,10076,225,B,10076_225_B.jpg,1,0,1,False,present,train,0_crossing
6,10076,315,B,10076_315_B.jpg,1,1,2,True,present,train,0_crossing
10,10076,315,C,10076_315_C.jpg,2,0,2,False,multiple_present,train,0_crossing


In [11]:
# Sanity check: number of rows headed to each destination split.
split_sizes = df_transfer['train/val/test'].value_counts()
split_sizes

train    5930
valid    3312
Name: train/val/test, dtype: int64

# Specify the destination path
The destination key for each image is built from these columns:
* jpg_name
* train/val/test
* folder_label

In [12]:
# Source key: each crop is addressed as 'mini-crops/<jpg_name>'.
df_transfer['original_image_path'] = 'mini-crops/' + df_transfer['jpg_name']

# Destination key layout: <dataset prefix>/<split>/<folder label>/<jpg_name>.
df_transfer['new_image_path'] = (
    'mini-crops-labeled/0413-crossing-null/'
    + df_transfer['train/val/test']
    + '/'
    + df_transfer['folder_label']
    + '/'
    + df_transfer['jpg_name']
)

# Columns worth eyeballing to verify the generated keys.
show_transfer_cols = [
    'jpg_name',
    'train/val/test',
    'folder_label',
    'original_image_path',
    'new_image_path',
]
df_transfer[show_transfer_cols].head()

Unnamed: 0,jpg_name,train/val/test,folder_label,original_image_path,new_image_path
1,10076_225_A.jpg,train,0_crossing,mini-crops/10076_225_A.jpg,mini-crops-labeled/0413-crossing-null/train/0_crossing/10076_225_A.jpg
2,10076_315_A.jpg,train,0_crossing,mini-crops/10076_315_A.jpg,mini-crops-labeled/0413-crossing-null/train/0_crossing/10076_315_A.jpg
5,10076_225_B.jpg,train,0_crossing,mini-crops/10076_225_B.jpg,mini-crops-labeled/0413-crossing-null/train/0_crossing/10076_225_B.jpg
6,10076_315_B.jpg,train,0_crossing,mini-crops/10076_315_B.jpg,mini-crops-labeled/0413-crossing-null/train/0_crossing/10076_315_B.jpg
10,10076_315_C.jpg,train,0_crossing,mini-crops/10076_315_C.jpg,mini-crops-labeled/0413-crossing-null/train/0_crossing/10076_315_C.jpg


In [13]:
# Swap the row labels carried over from the source frames for a fresh
# 0..n-1 range; the old labels are discarded rather than kept as a column.
df_transfer = df_transfer.reset_index(drop=True)

In [14]:
# Copy each selected crop to its labelled destination key inside the same bucket.
# NOTE: this is a copy, not a move -- the source objects are left in place.
bucket_name = 'gsv-crops2'
s3 = boto3.resource('s3')
s3_client = s3.meta.client  # loop-invariant, so look it up once

start = datetime.now()
print(start)

# Source keys whose copy failed; used afterwards to filter df_transfer.
cannot_move_paths = []

key_pairs = zip(
    df_transfer['original_image_path'].astype(str),
    df_transfer['new_image_path'].astype(str),
)
# enumerate() yields a positional counter, so the progress log does not depend
# on df_transfer having a clean 0..n-1 index.
for index, (old_name, new_name) in enumerate(key_pairs):
    if index % 500 == 0:
        print(f'Through {index} images at {datetime.now()}')
    copy_source = {'Bucket': bucket_name, 'Key': old_name}
    try:
        s3_client.copy(copy_source, bucket_name, new_name)
    except Exception as err:
        # Best-effort: record the failure and carry on, but report the reason.
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt / SystemExit stop a long-running transfer.
        print(f'Could not move {old_name} at {datetime.now()}: {err!r}')
        cannot_move_paths.append(old_name)

end = datetime.now()
print(end)
print(f'Elapsed: {end - start}; failed copies: {len(cannot_move_paths)}')

2020-04-13 22:38:49.763479
Through 0 images at 2020-04-13 22:38:49.769642
Through 500 images at 2020-04-13 22:39:30.128833
Through 1000 images at 2020-04-13 22:40:10.967707
Through 1500 images at 2020-04-13 22:40:50.920723
Through 2000 images at 2020-04-13 22:41:30.417181
Through 2500 images at 2020-04-13 22:42:11.203686
Through 3000 images at 2020-04-13 22:42:51.537521
Through 3500 images at 2020-04-13 22:43:33.086069
Through 4000 images at 2020-04-13 22:44:13.665040
Through 4500 images at 2020-04-13 22:44:54.932127
Through 5000 images at 2020-04-13 22:45:37.681167
Through 5500 images at 2020-04-13 22:46:20.216155
Through 6000 images at 2020-04-13 22:47:00.428858
Through 6500 images at 2020-04-13 22:47:40.944976
Through 7000 images at 2020-04-13 22:48:22.407334
Through 7500 images at 2020-04-13 22:49:04.180303
Through 8000 images at 2020-04-13 22:49:44.945098
Through 8500 images at 2020-04-13 22:50:26.961839
Through 9000 images at 2020-04-13 22:51:08.066927
2020-04-13 22:51:27.559199


In [15]:
# Keep only rows whose source key is absent from the failed-copy list.
copied_ok = ~df_transfer['original_image_path'].isin(cannot_move_paths)
df_can_move = df_transfer.loc[copied_ok]

# Persist the list of transferred crops (written to the working directory).
df_can_move.to_csv('0413-crossing-null-crops-successful-move.csv', index=False)

# Final tally per destination split and folder label.
df_can_move.groupby(['train/val/test', 'folder_label'])['jpg_name'].count().reset_index()

Unnamed: 0,train/val/test,folder_label,jpg_name
0,train,0_crossing,1943
1,train,1_null,3987
2,valid,0_crossing,690
3,valid,1_null,2622
