In [23]:
import git
from pathlib import Path
import os

ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

DATASET = "standardTesting"
FINAL_DATA_NAME = 'standardTesting-full-wavelet'
CONSTANT_SAMPLE_SIZE = int(1e5)
RAW_DATA_SUFFIX = "standardTesting-full"
SAVE_DF = False

data_dir = os.path.join(ROOT_DIR, 'raw-data',DATASET)
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(data_dir)
data_dir
BATCH_NUM = None

In [24]:
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))
freq_df = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", f"master-frequency-map.csv"), index_col= ["dataset", "transform", "group"])


In [25]:
file_names = [
    filename
    for filename in os.listdir(os.path.join(data_dir, f"{RAW_DATA_SUFFIX}"))
    if not filename.startswith(".")
]

file_list = [
    os.path.join(data_dir, f"{RAW_DATA_SUFFIX}", filename)
    for filename in file_names
]

file_names[:5]


['jetplane.tif',
 'house.tif',
 'cameraman.tif',
 'pirate.tif',
 'peppers_gray.tif']

In [26]:
'''To split large dataset into many batches, only needs to be run once'''
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [27]:
'''Show all subsets of data in raw data folder that have already been created'''
print(''.join([x+"\n" for x in os.listdir(data_dir) if x.__contains__(RAW_DATA_SUFFIX)]))


standardTesting-full-barbara
standardTesting-full



In [28]:
import numpy as np
import cv2
np.random.seed(42)  
def tiff_opener(path):
    
    # Apply jitter
    arr = np.array(cv2.imread(path)).astype(np.float64)
    jitter = np.random.uniform(-0.5, 0.5, arr.shape)
    arr += jitter
    arr = arr - np.mean(arr)
    arr = arr / np.std(arr)

    
    return arr


# Wavelet

In [29]:
FINAL_DATA_NAME = 'standardTesting-full-wavelet'
if BATCH_NUM is None:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", DATASET, f"{RAW_DATA_SUFFIX}")
else:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", DATASET, f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")

In [30]:
TRANSFORM = "wavelet"
channel = "gray"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=tiff_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-outdoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


10 layers being used


  0%|          | 0/9 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-33.9662  ,  48.336037,  65.68538 ,  68.77648 , 103.016335,
       124.74218 , 150.18173 , 160.42543 , 201.2891  ], dtype=float32), np.int64(3): array([-132.7916   , -123.289406 ,  -84.92582  ,  -80.37053  ,
        -71.754524 ,  -66.41612  ,  -66.17113  ,  -58.66343  ,
        -54.996128 ,  -43.389317 ,  -42.406918 ,  -33.893204 ,
        -31.785091 ,  -30.034956 ,  -22.183554 ,  -17.903448 ,
        -17.78679  ,  -12.356176 ,   -1.6964386,   -0.8541296,
          3.5976214,    4.317175 ,    8.269512 ,   13.277905 ,
         16.737595 ,   23.757196 ,   33.111347 ,   35.319996 ,
         36.845028 ,   44.542156 ,   63.852352 ,   72.802925 ,
         75.450096 ,   82.0963   ,  167.4028   ,  201.21217  ],
      dtype=float32), np.int64(4): array([-8.36305313e+01, -7.68041611e+01, -6.57795334e+01, -6.43960342e+01,
       -6.43154907e+01, -6.13138275e+01, -5.65682793e+01, -5.54977188e+01,
       -5.47841568e+01, -4.69635620e+01, -4.66350937e+01, -4.55316391e

  0%|          | 0/9 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-172.41353  , -155.3703   ,  -82.62119  ,  -50.704956 ,
        -23.496094 ,   -6.5191536,   -4.3832026,   85.85616  ,
        206.21333  ], dtype=float32), np.int64(3): array([-201.72472 ,  -78.722694,  -69.15314 ,  -62.59758 ,  -58.00989 ,
        -51.02484 ,  -43.80495 ,  -43.16307 ,  -32.074657,  -30.206495,
        -28.58929 ,  -25.284489,  -22.169146,  -14.292671,  -13.85688 ,
        -12.941895,   -2.959414,   11.905325,   13.446888,   21.440649,
         26.414818,   29.908503,   48.828697,   52.51713 ,   53.222034,
         54.501465,   59.51164 ,   64.46367 ,   73.00539 ,   79.40249 ,
         80.0892  ,   83.51529 ,  100.73119 ,  109.66394 ,  145.0629  ,
        164.0631  ], dtype=float32), np.int64(4): array([-9.00240784e+01, -7.96827698e+01, -7.36621552e+01, -6.75479202e+01,
       -6.66438446e+01, -6.22730484e+01, -5.99551201e+01, -5.66631012e+01,
       -5.40363426e+01, -4.30263786e+01, -4.28015404e+01, -4.17501793e+01,
       -4.08265648e

  0%|          | 0/9 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-73.63902 , -72.512794, -49.01686 , -44.58629 , -24.41544 ,
       -17.321236, -15.666371,  12.909259,  32.954193], dtype=float32), np.int64(3): array([-106.83552  ,  -96.50274  ,  -87.95242  ,  -81.47705  ,
        -78.55877  ,  -70.69959  ,  -45.4146   ,  -32.802902 ,
        -31.243631 ,  -30.940664 ,  -17.811537 ,  -17.31129  ,
        -11.7450695,  -11.130182 ,   -8.873164 ,   -6.643087 ,
         -5.570327 ,   -2.6020575,    4.3112297,    6.9840307,
         10.105682 ,   10.280994 ,   11.741862 ,   14.1189375,
         16.524164 ,   22.895872 ,   24.33784  ,   28.178432 ,
         32.141327 ,   36.813015 ,   44.456375 ,   44.890762 ,
         53.16552  ,   55.375473 ,   70.199646 ,  124.10091  ],
      dtype=float32), np.int64(4): array([-5.86203690e+01, -5.32288170e+01, -5.13546219e+01, -5.03352928e+01,
       -4.45057068e+01, -4.04889221e+01, -3.58514290e+01, -3.48390999e+01,
       -3.46843681e+01, -3.45852852e+01, -3.35098419e+01, -3.08029060e