# Frame as a 2nd Stage of Binary Classification - Missing vs Present
* The first model filters down to the "interesting" images - locations where we would expect there may be a crosswalk
* Then this model can focus on differentiating between missing, present, or not sure
* Then humans can do some manual review of the images

In [3]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 
import mini_utils
from datetime import datetime

import matplotlib.pyplot as plt


# Show full cell contents when DataFrames are displayed, so long values such
# as image paths are not truncated. `None` means "no limit"; the older `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
# NOTE(review): pandas < 1.0 only accepts the `-1` form -- confirm the
# pandas version this notebook is run with.
pd.set_option('display.max_colwidth', None)

# Load Image CSV Info 

In [4]:
# Read the per-crop ground-truth table (it also carries each crop's
# train/val/test assignment) and preview the first rows.
df_all = pd.read_csv(filepath_or_buffer='imgid_groundtruth_trainvaltest.csv')
df_all.head(5)

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,,train
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,,train
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,,validation


In [5]:
# Count crops per (split, ground-truth label) and flatten the result back
# into ordinary columns.
(
    df_all
    .groupby(['train/val/test', 'ground_truth'])['jpg_name']
    .count()
    .reset_index()
)

Unnamed: 0,train/val/test,ground_truth,jpg_name
0,test,multiple_present,21
1,test,only_missing,29
2,test,present,287
3,train,multiple_present,136
4,train,only_missing,75
5,train,present,1712
6,validation,multiple_present,14
7,validation,only_missing,25
8,validation,present,297


In [6]:
# Folder label -> list of ground_truth values that are grouped under it.
ground_truth_dict = {
    '0_missing': ['only_missing'],
    '1_present': ['present', 'multiple_present'],
}

# List for Training

In [23]:
# Training pool = the original 'train' and 'validation' splits combined for
# more data (the original 'test' split is used separately for validation).
mask_train = df_all['train/val/test'].isin(['train', 'validation'])
# .copy() gives an independent frame instead of a slice of df_all, so any
# column added to it later does not target a view of the source table.
df_train = df_all.loc[mask_train].copy()
# NOTE(review): mini_utils.sample_rows is not visible here; judging by the
# printed counts it appears to take up to 500 rows per folder label and to
# add a 'folder_label' column -- confirm against mini_utils.
df_train_sample = mini_utils.sample_rows(df_train, 500, ground_truth_dict)
df_train_sample['train/valid'] = 'train'
df_train_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_present -> ['present', 'multiple_present']


1_present    500
0_missing    100
Name: folder_label, dtype: int64

In [24]:
# Validation examples are taken from the original 'test' split
# (author's note: about 15% of examples).
is_test_split = df_all['train/val/test'].eq('test')
df_val = df_all.loc[is_test_split]
df_val_sample = mini_utils.sample_rows(df_val, 400, ground_truth_dict)
# Tag every sampled row as belonging to the validation side of the split.
df_val_sample['train/valid'] = 'valid'
df_val_sample['folder_label'].value_counts()

0_missing -> ['only_missing']
1_present -> ['present', 'multiple_present']


1_present    308
0_missing    29 
Name: folder_label, dtype: int64

In [25]:
# Stack the train and validation samples into a single table (the original
# row labels are kept), report its size, and preview it.
df_transfer = pd.concat(objs=[df_train_sample, df_val_sample], axis=0)
print(df_transfer.shape)
df_transfer.head(5)

(937, 12)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,folder_label,train/valid
209,12578,315,C,12578_315_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing,train
211,12578,135,C,12578_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing,train
479,16258,135,C,16258_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing,train
503,1663,135,C,1663_135_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing,train
808,18158,225,C,18158_225_C.jpg,0.0,1.0,1.0,False,only_missing,train,0_missing,train


In [26]:
# Count sampled crops per (train/valid, folder label) as a sanity check.
(
    df_transfer
    .groupby(['train/valid', 'folder_label'])['jpg_name']
    .count()
    .reset_index()
)

Unnamed: 0,train/valid,folder_label,jpg_name
0,train,0_missing,100
1,train,1_present,500
2,valid,0_missing,29
3,valid,1_present,308


In [27]:
# Persist the sampled rows and their labels; the row index is not written.
df_transfer.to_csv(
    path_or_buf='missing-present-2split-img-details.csv',
    index=False,
)

# Specify the destination Path
* jpg_name
* train/valid
* folder_label

In [29]:
# Source key of each crop.
df_transfer['original_image_path'] = 'mini-crops/' + df_transfer['jpg_name']
# Destination key layout: <root>/<train|valid>/<folder_label>/<jpg_name>
df_transfer['new_image_path'] = (
    'mini-crops-labeled/missing-present-2split/'
    + df_transfer['train/valid']
    + '/'
    + df_transfer['folder_label']
    + '/'
    + df_transfer['jpg_name']
)
# Columns worth eyeballing to check the source -> destination mapping.
show_transfer_cols = [
    'jpg_name',
    'train/valid',
    'folder_label',
    'original_image_path',
    'new_image_path',
]
df_transfer[show_transfer_cols].head(5)

Unnamed: 0,jpg_name,train/valid,folder_label,original_image_path,new_image_path
209,12578_315_C.jpg,train,0_missing,mini-crops/12578_315_C.jpg,mini-crops-labeled/missing-present-2split/train/0_missing/12578_315_C.jpg
211,12578_135_C.jpg,train,0_missing,mini-crops/12578_135_C.jpg,mini-crops-labeled/missing-present-2split/train/0_missing/12578_135_C.jpg
479,16258_135_C.jpg,train,0_missing,mini-crops/16258_135_C.jpg,mini-crops-labeled/missing-present-2split/train/0_missing/16258_135_C.jpg
503,1663_135_C.jpg,train,0_missing,mini-crops/1663_135_C.jpg,mini-crops-labeled/missing-present-2split/train/0_missing/1663_135_C.jpg
808,18158_225_C.jpg,train,0_missing,mini-crops/18158_225_C.jpg,mini-crops-labeled/missing-present-2split/train/0_missing/18158_225_C.jpg


In [30]:
# Replace the row labels carried over from the source table with a fresh
# 0..n-1 index; the old labels are discarded rather than kept as a column.
df_transfer = df_transfer.reset_index(drop=True)

In [31]:
# Copy each sampled crop to its labeled destination key inside the same
# bucket. The source objects are left in place (this is a copy, not a move).
bucket_name = 'gsv-crops2'
s3 = boto3.resource('s3')
s3_client = s3.meta.client  # looked up once instead of on every iteration

start = datetime.now()
print(start)
cannot_move_paths = []  # source keys whose copy failed
transfer_pairs = zip(
    map(str, df_transfer['original_image_path']),
    map(str, df_transfer['new_image_path']),
)
# enumerate() drives the progress counter, so it no longer depends on
# df_transfer having a 0..n-1 index.
for count, (old_name, new_name) in enumerate(transfer_pairs):
    if count % 500 == 0:
        print(f'Through {count} images at {datetime.now()}')
    copy_source = {'Bucket': bucket_name, 'Key': old_name}
    try:
        s3_client.copy(copy_source, bucket_name, new_name)
    except Exception as err:
        # Best effort: record the failure and carry on with the next crop.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt / SystemExit stop the loop, and printing the
        # error shows why the copy failed.
        print(f'Could not move {old_name} at {datetime.now()}: {err!r}')
        cannot_move_paths.append(old_name)
end = datetime.now()
print(end)

2020-04-10 22:01:29.101216
Through 0 images at 2020-04-10 22:01:29.102744
Could not move mini-crops/1205_135_A.jpg at 2020-04-10 22:01:44.444366
Could not move mini-crops/1195_45_C.jpg at 2020-04-10 22:01:45.649331
Could not move mini-crops/15629_225_E.jpg at 2020-04-10 22:01:50.038696
Could not move mini-crops/15629_225_B.jpg at 2020-04-10 22:01:51.427027
Could not move mini-crops/15629_315_A.jpg at 2020-04-10 22:01:53.197871
Could not move mini-crops/15629_315_F.jpg at 2020-04-10 22:01:55.851058
Could not move mini-crops/1205_135_B.jpg at 2020-04-10 22:01:59.147780
Could not move mini-crops/680_135_C.jpg at 2020-04-10 22:02:04.924548
Could not move mini-crops/15629_45_B.jpg at 2020-04-10 22:02:06.044799
Could not move mini-crops/15629_315_E.jpg at 2020-04-10 22:02:06.525505
Through 500 images at 2020-04-10 22:02:08.887854
Could not move mini-crops/680_45_A.jpg at 2020-04-10 22:02:09.228895
Could not move mini-crops/15619_315_D.jpg at 2020-04-10 22:02:09.698157
Could not move mini-cro

In [33]:
# Keep only the rows whose source key is not in the failed-copy list, save
# them, and recount per split / label.
# NOTE(review): the output filename says 'crossingnull' although this
# notebook builds the missing/present split -- confirm the name is intended.
copy_failed = df_transfer['original_image_path'].isin(cannot_move_paths)
df_can_move = df_transfer.loc[~copy_failed]
df_can_move.to_csv('crossingnull-crops-successful-move.csv', index=False)
(
    df_can_move
    .groupby(['train/valid', 'folder_label'])['jpg_name']
    .count()
    .reset_index()
)

Unnamed: 0,train/valid,folder_label,jpg_name
0,train,0_missing,100
1,train,1_present,487
2,valid,0_missing,29
3,valid,1_present,308
