# Sample for a relatively balanced Data Set

* Keep all Test Images
* About 35 per class for validation
* About 100 per class for training

In [35]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 
from datetime import datetime

import matplotlib.pyplot as plt


pd.set_option('max_colwidth', -1)

# Load Image CSV Info 

In [4]:
df_all = pd.read_csv('imgid_groundtruth_trainvaltest.csv')
df_all.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,1_null,train
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,1_null,train
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,1_null,validation


In [23]:
ground_truth_dict = {'0_missing' : ['only_missing'],
    '1_null' : ['1_null'],
    '2_present' : ['present', 'multiple_present']  
}

def sample_rows(df_split_set, num_examples, ground_truth_dict):
    dfs_list = []
    for key, gt_list in ground_truth_dict.items():
        print(f'{key} -> {gt_list}')
        df_part = df_split_set.loc[df_split_set['ground_truth'].isin(gt_list)]
        if num_examples > df_part.shape[0]:
            df_keep_part = df_part.copy()
        else:
            df_keep_part = df_part.sample(n = num_examples).copy()

        df_keep_part['folder_label'] = key
        dfs_list.append(df_keep_part)
    df_sample = pd.concat(dfs_list)
    return df_sample

# List for Training

In [24]:
df_train = df_all.loc[df_all['train/val/test'] == 'train']
df_train_sample = sample_rows(df_train, 100, ground_truth_dict)
df_train_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_null -> ['1_null']
2_present -> ['present', 'multiple_present']


2_present    100
1_null       100
0_missing    75 
Name: folder_label, dtype: int64

In [25]:
df_val = df_all.loc[df_all['train/val/test'] == 'validation']
df_val_sample = sample_rows(df_val, 30, ground_truth_dict)
df_val_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_null -> ['1_null']
2_present -> ['present', 'multiple_present']


2_present    30
1_null       30
0_missing    25
Name: folder_label, dtype: int64

In [26]:
df_test = df_all.loc[df_all['train/val/test'] == 'test']
df_test_sample = sample_rows(df_test, 500, ground_truth_dict)
df_test_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_null -> ['1_null']
2_present -> ['present', 'multiple_present']


1_null       500
2_present    308
0_missing    29 
Name: folder_label, dtype: int64

In [29]:
df_transfer = pd.concat([df_train_sample, df_val_sample, df_test_sample])
print(df_transfer.shape)
df_transfer.head()

(1197, 11)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,folder_label
209,12578,315,C,12578_315_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing
211,12578,135,C,12578_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing
479,16258,135,C,16258_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing
503,1663,135,C,1663_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing
808,18158,225,C,18158_225_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing


In [30]:
df_transfer.to_csv('balanced-split-img-details.csv', index = False)

# Specify the destination Path
* jpg_name
* train/val/test
* folder_label

In [42]:
df_transfer['original_image_path'] = 'mini-crops/' + df_transfer['jpg_name']
df_transfer['new_image_path'] = 'mini-crops-labeled/balanced/' + df_transfer['train/val/test'] + '/' + df_transfer['folder_label'] + '/' + df_transfer['jpg_name']
show_transfer_cols = ['jpg_name', 'train/val/test', 'folder_label', 'original_image_path', 'new_image_path']
df_transfer[show_transfer_cols].head()

Unnamed: 0,jpg_name,train/val/test,folder_label,original_image_path,new_image_path
209,12578_315_C.jpg,train,0_missing,mini-crops/12578_315_C.jpg,mini-crops-labeled/balanced/train/0_missing/12578_315_C.jpg
211,12578_135_C.jpg,train,0_missing,mini-crops/12578_135_C.jpg,mini-crops-labeled/balanced/train/0_missing/12578_135_C.jpg
479,16258_135_C.jpg,train,0_missing,mini-crops/16258_135_C.jpg,mini-crops-labeled/balanced/train/0_missing/16258_135_C.jpg
503,1663_135_C.jpg,train,0_missing,mini-crops/1663_135_C.jpg,mini-crops-labeled/balanced/train/0_missing/1663_135_C.jpg
808,18158_225_C.jpg,train,0_missing,mini-crops/18158_225_C.jpg,mini-crops-labeled/balanced/train/0_missing/18158_225_C.jpg


In [43]:
df_transfer.shape

(1197, 13)

In [None]:
# Move files from outer folder to gsv folder
bucket_name = 'gsv-crops2'
s3 = boto3.resource('s3')

start = datetime.now()
print(start)
cannot_move_paths = []
for index, row in df_transfer.iterrows():
    if index % 500 == 0:
        print(f'Through {index} images at {datetime.now()}')
    # Copy to gsv folder
    old_name = str(row['original_image_path'])
    new_name = str(row['new_image_path'])
    copy_source = {
        'Bucket': bucket_name,
        'Key': old_name
    }
    try:
        s3.meta.client.copy(copy_source, bucket_name, new_name)
    except:
        print(f'Could not move {old_name} at {datetime.now()}')
        cannot_move_paths.append(old_name)
end = datetime.now()
print(end)

2020-04-06 00:31:47.457935
Could not move mini-crops/1205_225_F.jpg at 2020-04-06 00:31:54.759166
Could not move mini-crops/945_45_B.jpg at 2020-04-06 00:31:54.898525
Could not move mini-crops/15629_45_F.jpg at 2020-04-06 00:31:55.665150
Could not move mini-crops/15976_315_F.jpg at 2020-04-06 00:31:56.185182
Could not move mini-crops/15629_225_F.jpg at 2020-04-06 00:32:02.610340
Could not move mini-crops/680_135_B.jpg at 2020-04-06 00:32:03.263815
Could not move mini-crops/15629_315_D.jpg at 2020-04-06 00:32:06.206032
Through 8000 images at 2020-04-06 00:32:09.000005
Through 2000 images at 2020-04-06 00:32:36.985590


In [None]:
cannot_move_paths

In [None]:
df_cannot_move = df_transfer.loc[df_transfer['original_image_path'].isin(cannot_move_paths)]
df_cannot_move

In [None]:
df_can_move = df_transfer.loc[~df_transfer['original_image_path'].isin(cannot_move_paths)]
df_can_move.to_csv('balanced-crops-successful-move.csv', index = False)