In [None]:
import os
import cv2
from src.preprocessing import make_square
from tqdm.notebook import tqdm
import random
import json

## Build the shuffled filename map (one-time step; no need to run again once `original_shuffled_map.json` exists)

In [None]:
# os.listdir() returns entries in arbitrary, platform-dependent order, so sort
# first; otherwise the seeded shuffle below is not reproducible across machines.
# NOTE(review): sorting changes the permutation relative to a map that was built
# from an unsorted listing; regenerate downstream outputs if this cell is re-run.
filenames = sorted(os.listdir("data/cfp"))

# Fixed seed so the shuffled order (and therefore the new names) is repeatable.
random.seed(42)
random.shuffle(filenames)
filenames[:10]

In [None]:
# Give every original filename the alias "SHUF" followed by its position in
# the shuffled list, zero-padded to five digits.
original_shuffled_map = {
    name: f"SHUF{position:05d}"
    for position, name in enumerate(filenames)
}

# Persist the mapping so it can be reloaded later without reshuffling.
with open("original_shuffled_map.json", "w") as fp:
    json.dump(original_shuffled_map, fp)

# Spot-check a single entry.
original_shuffled_map['DEV13781.jpg']

## Run this: write squared, re-encoded copies of the images under their shuffled names

In [None]:
# Threshold forwarded to make_square() for every image.
CUTOFF_THRESHOLD = 10
# Directory that receives the processed images and the img_info.csv metadata file.
OUT_PATH = 'data/shuffled_square_75'

In [None]:
# Reload the original-name -> shuffled-name mapping from its JSON file.
with open('original_shuffled_map.json') as fp:
    original_shuffled_map = json.loads(fp.read())

# Spot-check one known entry.
original_shuffled_map['DEV13781.jpg']

In [None]:
# Announce when the output directory is about to be created.
if not os.path.isdir(OUT_PATH):
    print(f'{OUT_PATH} does not exist, creating dir')
# makedirs also creates missing parent directories, and exist_ok=True removes
# the check-then-create race that isdir() followed by mkdir() has.
os.makedirs(OUT_PATH, exist_ok=True)

In [None]:
from multiprocessing import Pool

def _make_shuffled_square_img(filename):
    """Square one image from ``data/cfp`` and save it under its shuffled name.

    The image is read with OpenCV, passed through ``make_square`` together with
    ``CUTOFF_THRESHOLD``, and written into ``OUT_PATH`` as a JPEG (quality 75)
    named after ``original_shuffled_map[filename]``.

    Args:
        filename: Name of a file inside ``data/cfp`` (no directory part).

    Returns:
        A tuple ``(filename, new_name, cutting, padding, side, orig_shape)``,
        where ``cutting`` and ``padding`` are the values returned by
        ``make_square``, ``side`` is ``square_img.shape[0]`` and ``orig_shape``
        is the shape of the image as loaded. Returns ``None`` (after printing
        the filename and the reason) when the file cannot be processed, so a
        single bad file does not abort the whole run.
    """
    try:
        file = f"data/cfp/{filename}"
        img = cv2.imread(file)
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than deeper in make_square.
        if img is None:
            raise ValueError(f"could not read image: {file}")

        square_img, cutting, padding = make_square(img, CUTOFF_THRESHOLD)
        new_name = original_shuffled_map[filename]

        out_file = f"{OUT_PATH}/{new_name}.jpg"
        written = cv2.imwrite(out_file, square_img, [int(cv2.IMWRITE_JPEG_QUALITY), 75])
        # cv2.imwrite returns False when nothing was saved; do not report
        # metadata for an output file that does not exist.
        if not written:
            raise OSError(f"could not write image: {out_file}")

        return (filename, new_name, cutting, padding, square_img.shape[0], img.shape)

    except Exception as e:
        # Best-effort processing: report the failure and let the caller skip it.
        print(filename, e)
        return None


# Work list: every entry currently present in the source image directory.
l_files = os.listdir("data/cfp")

In [None]:
# Set to False to process the files one at a time in this process instead of
# distributing them over a multiprocessing pool.
USE_POOL = True

if USE_POOL:
    with Pool() as pool:
        # imap is lazy, so the results must be consumed while the pool is open.
        results = pool.imap(_make_shuffled_square_img, l_files)
        op_metadata = list(tqdm(results, total=len(l_files)))
else:
    op_metadata = [_make_shuffled_square_img(name) for name in tqdm(l_files)]

print('Finished.')

In [None]:
import pandas as pd

# Column layout of the metadata table. Passing it explicitly keeps the frame
# well-formed even when no file was processed successfully; without it an empty
# result has no 'new_file' column and the sort below raises KeyError.
IMG_INFO_COLUMNS = [
    'orig_file',
    'new_file',
    'delta_x',
    'delta_y',
    'orig_crop_side',
    'side',
    'scaling',
]

# One record per successfully processed image (failed files come back as None).
# NOTE(review): entries 0 and 2 of make_square's cutting/padding values are
# presumably the x and y components -- confirm against src.preprocessing.
img_records = [
    {
        'orig_file': orig_file,
        'new_file': new_file,
        'delta_x': padding[0] - cutting[0],
        'delta_y': padding[2] - cutting[2],
        'orig_crop_side': side,
        'side': side,
        'scaling': 1.0,
    }
    for orig_file, new_file, cutting, padding, side, _orig_shape in (
        entry for entry in op_metadata if entry is not None
    )
]

df_img_info = pd.DataFrame(img_records, columns=IMG_INFO_COLUMNS)

# The CSV is written sorted by shuffled name; the in-memory frame keeps processing order.
df_img_info.sort_values(by='new_file', ascending=True).to_csv(os.path.join(OUT_PATH, 'img_info.csv'), index=False)
df_img_info