# Frame as a Binary Classification - Crossing vs Null

* The idea is to first curate the images that contain something "interesting" (a crossing) as opposed to nothing (null)
* Then we can try another model that chooses between missing vs. present on the curated images
* Alternatively, we can have humans do that second step by hand

In [1]:
import pandas as pd
import numpy as np
import os
import boto3
import s3fs # for reading from S3FileSystem
import json # for working with JSON files 
import mini_utils
from datetime import datetime

import matplotlib.pyplot as plt


# Show full cell contents when DataFrames are displayed; the S3 key columns
# built later in this notebook are longer than pandas' default column width.
# `None` means "no limit". The older `-1` spelling was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Load Image CSV Info 

In [2]:
# Load the ground-truth table. Per the recorded output below, each row carries
# a crop's jpg_name, its ramp counts, a ground_truth label and its
# train/val/test split assignment.
df_all = pd.read_csv('imgid_groundtruth_trainvaltest.csv')
# Preview the first rows as the cell output.
df_all.head()

Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train
2,10007,45,C,10007_45_C.jpg,0.0,0.0,0.0,False,1_null,train
3,10007,135,C,10007_135_C.jpg,0.0,0.0,0.0,False,1_null,train
4,10013,225,C,10013_225_C.jpg,0.0,0.0,0.0,False,1_null,validation


In [3]:
# Count images for every (split, ground_truth) combination to see how
# imbalanced the raw labels are before sampling.
split_and_truth = ['train/val/test', 'ground_truth']
images_per_group = df_all.groupby(split_and_truth)['jpg_name'].count()
images_per_group.reset_index()

Unnamed: 0,train/val/test,ground_truth,jpg_name
0,test,1_null,1151
1,test,multiple_present,21
2,test,only_missing,29
3,test,present,287
4,train,1_null,6621
5,train,multiple_present,136
6,train,only_missing,75
7,train,present,1712
8,validation,1_null,1152
9,validation,multiple_present,14


In [4]:
# Binary folder label -> the fine-grained ground_truth values folded into it.
# Every kind of crossing collapses into '0_crossing'; '1_null' stays as is.
crossing_ground_truths = ['only_missing', 'present', 'multiple_present']
null_ground_truths = ['1_null']
ground_truth_dict = {
    '0_crossing': crossing_ground_truths,
    '1_null': null_ground_truths,
}

# List for Training

In [8]:
# Training split: isolate the rows tagged 'train', then hand them to
# mini_utils.sample_rows together with the label mapping.
# NOTE(review): judging by the recorded output (2200 null vs. 1923 crossing),
# 2200 looks like a per-label cap - confirm against mini_utils.sample_rows.
is_train_row = df_all['train/val/test'] == 'train'
df_train = df_all.loc[is_train_row]
df_train_sample = mini_utils.sample_rows(df_train, 2200, ground_truth_dict)
# Show how many sampled rows landed under each folder label.
train_folder_labels = df_train_sample['folder_label']
train_folder_labels.value_counts()

0_crossing -> ['only_missing', 'present', 'multiple_present']
1_null -> ['1_null']


1_null        2200
0_crossing    1923
Name: folder_label, dtype: int64

In [9]:
# Validation split: isolate the rows tagged 'validation', then sample them
# through mini_utils.sample_rows with the same label mapping.
# NOTE(review): 400 appears to act as a per-label cap (see recorded output) -
# confirm against mini_utils.sample_rows.
is_val_row = df_all['train/val/test'] == 'validation'
df_val = df_all.loc[is_val_row]
df_val_sample = mini_utils.sample_rows(df_val, 400, ground_truth_dict)
# Show how many sampled rows landed under each folder label.
val_folder_labels = df_val_sample['folder_label']
val_folder_labels.value_counts()

0_crossing -> ['only_missing', 'present', 'multiple_present']
1_null -> ['1_null']


1_null        400
0_crossing    336
Name: folder_label, dtype: int64

In [12]:
# Test split: isolate the rows tagged 'test', then sample them through
# mini_utils.sample_rows with the same label mapping.
# NOTE(review): 400 appears to act as a per-label cap (see recorded output) -
# confirm against mini_utils.sample_rows.
is_test_row = df_all['train/val/test'] == 'test'
df_test = df_all.loc[is_test_row]
df_test_sample = mini_utils.sample_rows(df_test, 400, ground_truth_dict)
# Show how many sampled rows landed under each folder label.
test_folder_labels = df_test_sample['folder_label']
test_folder_labels.value_counts()

0_crossing -> ['only_missing', 'present', 'multiple_present']
1_null -> ['1_null']


1_null        400
0_crossing    337
Name: folder_label, dtype: int64

In [13]:
# Stack the three per-split samples (train, validation, test - in that order)
# into the single table of images to transfer. Row labels from df_all are kept.
sampled_splits = [df_train_sample, df_val_sample, df_test_sample]
df_transfer = pd.concat(sampled_splits)
print(df_transfer.shape)
df_transfer.head()

(5596, 11)


Unnamed: 0,img_id,heading,crop_num,jpg_name,present_ramp,missing_ramp,total_count,includes_both,ground_truth,train/val/test,folder_label
0,10007,225,C,10007_225_C.jpg,1.0,0.0,1.0,False,present,train,0_crossing
1,10007,315,C,10007_315_C.jpg,1.0,0.0,1.0,False,present,train,0_crossing
11,10018,135,C,10018_135_C.jpg,1.0,0.0,1.0,False,present,train,0_crossing
15,10039,135,C,10039_135_C.jpg,1.0,0.0,1.0,False,present,train,0_crossing
17,10076,315,C,10076_315_C.jpg,2.0,0.0,2.0,False,multiple_present,train,0_crossing


In [16]:
# Sanity check: image count for every (split, folder_label) pair after sampling.
split_and_folder = ['train/val/test', 'folder_label']
sampled_per_group = df_transfer.groupby(split_and_folder)['jpg_name'].count()
sampled_per_group.reset_index()

Unnamed: 0,train/val/test,folder_label,jpg_name
0,test,0_crossing,337
1,test,1_null,400
2,train,0_crossing,1923
3,train,1_null,2200
4,validation,0_crossing,336
5,validation,1_null,400


In [17]:
# Persist the sampled split assignment. This is written before the
# original_image_path / new_image_path columns are added further down, so the
# CSV holds only the ground-truth columns plus folder_label.
df_transfer.to_csv('crossing-null-split-img-details.csv', index = False)

# Specify the destination Path
* jpg_name
* train/val/test
* folder_label

In [18]:
# Source key: each crop is addressed directly under the mini-crops/ prefix.
source_prefix = 'mini-crops/'
df_transfer['original_image_path'] = source_prefix + df_transfer['jpg_name']

# Destination key layout: <root>/<split>/<folder_label>/<jpg_name>
dest_root = 'mini-crops-labeled/crossing-null/'
split_segment = df_transfer['train/val/test'] + '/'
label_segment = df_transfer['folder_label'] + '/'
df_transfer['new_image_path'] = dest_root + split_segment + label_segment + df_transfer['jpg_name']

# Preview only the columns that determine where each image goes.
show_transfer_cols = [
    'jpg_name',
    'train/val/test',
    'folder_label',
    'original_image_path',
    'new_image_path',
]
df_transfer[show_transfer_cols].head()

Unnamed: 0,jpg_name,train/val/test,folder_label,original_image_path,new_image_path
0,10007_225_C.jpg,train,0_crossing,mini-crops/10007_225_C.jpg,mini-crops-labeled/crossing-null/train/0_crossing/10007_225_C.jpg
1,10007_315_C.jpg,train,0_crossing,mini-crops/10007_315_C.jpg,mini-crops-labeled/crossing-null/train/0_crossing/10007_315_C.jpg
11,10018_135_C.jpg,train,0_crossing,mini-crops/10018_135_C.jpg,mini-crops-labeled/crossing-null/train/0_crossing/10018_135_C.jpg
15,10039_135_C.jpg,train,0_crossing,mini-crops/10039_135_C.jpg,mini-crops-labeled/crossing-null/train/0_crossing/10039_135_C.jpg
17,10076_315_C.jpg,train,0_crossing,mini-crops/10076_315_C.jpg,mini-crops-labeled/crossing-null/train/0_crossing/10076_315_C.jpg


In [19]:
# Replace the row labels inherited from df_all (0, 1, 11, 15, ...) with a
# fresh 0..n-1 index; the copy loop below uses the index value for its
# "every 500 images" progress message.
df_transfer = df_transfer.reset_index(drop = True)

In [20]:
# Copy every sampled crop from its original key to its labeled
# <split>/<folder_label>/ key inside the same bucket.
# NOTE: this is a copy, not a move - nothing here deletes the source objects.
bucket_name = 'gsv-crops2'
s3 = boto3.resource('s3')
s3_client = s3.meta.client  # looked up once instead of on every iteration

start = datetime.now()
print(start)

# Source keys whose copy failed; the next cell uses this list to filter
# df_transfer down to the images that were actually transferred.
cannot_move_paths = []

path_pairs = zip(df_transfer['original_image_path'], df_transfer['new_image_path'])
for position, (old_path, new_path) in enumerate(path_pairs):
    # Count by position rather than by index label so the progress message is
    # correct even if the DataFrame index is not a clean 0..n-1 range.
    if position % 500 == 0:
        print(f'Through {position} images at {datetime.now()}')

    old_name = str(old_path)
    new_name = str(new_path)
    copy_source = {
        'Bucket': bucket_name,
        'Key': old_name,
    }
    try:
        s3_client.copy(copy_source, bucket_name, new_name)
    except Exception as err:
        # Best-effort transfer: record the failure and keep going. Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupt still
        # stop the loop, and the reason is printed instead of being discarded.
        print(f'Could not move {old_name} at {datetime.now()}: {err!r}')
        cannot_move_paths.append(old_name)

end = datetime.now()
print(end)

2020-04-06 20:37:43.535112
Through 0 images at 2020-04-06 20:37:43.539733
Could not move mini-crops/1195_45_C.jpg at 2020-04-06 20:37:46.416377
Could not move mini-crops/1205_135_C.jpg at 2020-04-06 20:37:47.061602
Could not move mini-crops/15629_225_C.jpg at 2020-04-06 20:37:52.084178
Could not move mini-crops/15629_135_C.jpg at 2020-04-06 20:37:52.097048
Could not move mini-crops/680_315_C.jpg at 2020-04-06 20:38:15.713695
Could not move mini-crops/680_135_C.jpg at 2020-04-06 20:38:15.728894
Through 500 images at 2020-04-06 20:38:26.356108
Could not move mini-crops/1205_135_D.jpg at 2020-04-06 20:38:27.737608
Could not move mini-crops/15619_315_D.jpg at 2020-04-06 20:38:33.728908
Could not move mini-crops/15629_315_D.jpg at 2020-04-06 20:38:33.741023
Could not move mini-crops/15629_135_D.jpg at 2020-04-06 20:38:33.752650
Could not move mini-crops/15976_315_D.jpg at 2020-04-06 20:38:33.831574
Could not move mini-crops/15976_45_D.jpg at 2020-04-06 20:38:33.843583
Could not move mini-cr

In [21]:
# Keep only the rows whose source key is not in the failed-copy list, save
# that table, and recount images per (split, folder_label).
copy_failed = df_transfer['original_image_path'].isin(cannot_move_paths)
df_can_move = df_transfer.loc[~copy_failed]
df_can_move.to_csv('crossingnull-crops-successful-move.csv', index=False)

moved_group_keys = ['train/val/test', 'folder_label']
moved_per_group = df_can_move.groupby(moved_group_keys)['jpg_name'].count()
moved_per_group.reset_index()

Unnamed: 0,train/val/test,folder_label,jpg_name
0,test,0_crossing,337
1,test,1_null,400
2,train,0_crossing,1886
3,train,1_null,2125
4,validation,0_crossing,336
5,validation,1_null,400
