In [10]:
import git
from pathlib import Path
import os

# Repository root: the working tree of the git repo that contains the current
# directory (parent directories are searched).
ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

# --- Tunable configuration -------------------------------------------------
DATASET = "coco"                         # prefix of the dataset key written into the frequency map
FINAL_DATA_NAME = 'coco-indoor-wavelet'  # base name of the pickles written by the wavelet cells
CONSTANT_SAMPLE_SIZE = int(1e5)          # max number of sorted coefficients kept per layer
RAW_DATA_SUFFIX = "coco-cropped-Indoor"  # suffix of the raw-data sub-folder names
SAVE_DF = False                          # True -> wavelet cells also pickle the intermediate DataFrame
BATCH_NUM = None                         # None -> toy subset; otherwise the batch{BATCH_NUM}-... folder

# --- Raw-data location -----------------------------------------------------
data_dir = os.path.join(ROOT_DIR, 'sandbox', 'raw-data','coco')

# List the directory once and derive the full paths from that single listing,
# so `file_names` and `file_list` are guaranteed to line up entry for entry.
file_names = os.listdir(data_dir)
file_list = [os.path.join(data_dir, filename) for filename in file_names]

In [2]:
# Switch into <repo>/utilities before the star import, then move to
# <repo>/dataset-preparation, which stays the working directory afterwards.
# NOTE(review): presumably `pd` (and other helpers used in later cells, e.g.
# `Image`, `convert_to_wavelet_basis`) come from this star import -- they are
# not imported explicitly in the visible cells; confirm against transform.py.
utilities_dir = os.path.join(ROOT_DIR, "utilities")
os.chdir(utilities_dir)
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))

# Master frequency map, indexed by (dataset, transform, group).
freq_map_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
freq_df = pd.read_csv(freq_map_path, index_col=["dataset", "transform", "group"])


In [3]:
# Folder holding the toy subset of the raw images.
toy_dir = os.path.join(data_dir, f"toy-{RAW_DATA_SUFFIX}")

# Skip dot-files (names starting with ".").
file_names = [entry for entry in os.listdir(toy_dir) if not entry.startswith(".")]

# Same entries, as full paths, in the same order as `file_names`.
file_list = [os.path.join(toy_dir, entry) for entry in file_names]

file_names[:5]


['000000001296.jpg',
 '000000000139.jpg',
 '000000000632.jpg',
 '000000000802.jpg',
 '000000001993.jpg']

In [4]:
'''Assuming No batching is required. Not applicable for agriVision'''

# NOTE: everything below is disabled reference code; the string literal above is
# the only thing this cell evaluates. The commented-out code builds, per colour
# channel, a Fourier-basis frame and then per-band sorted samples capped at
# CONSTANT_SAMPLE_SIZE values, mirroring the wavelet cells further down.

# data_dir = os.path.join(ROOT_DIR, "raw-data", "agriVision", "full-agriVision-RGB-cleaned")

# for channel in ['red', 'blue', 'green', 'gray', 'infrared']:

#     channel_fr = convert_to_fourier_basis(data_dir, channel, debug = True)
#     pd.to_pickle(channel_fr, os.path.join(ROOT_DIR, "transformed-data", f"full-agriVision-fourier-{channel}-df.pickle"))

#     min_group, max_group = 2, max(channel_fr['band'])
#     group_data_map = dict()
#     group_data_map_size = dict()
#     for group in np.arange(min_group, max_group + 1):
#         data = channel_fr[(channel_fr['band'] == group)]['data'].iloc[0]
#         group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)] 
#         group_data_map_size[group] = data.size
    
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}.pickle"))
#     # (corrected in this comment: the "-size" file should hold group_data_map_size, not group_data_map)
#     pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}-size.pickle"))
    

'Assuming No batching is required. Not applicable for agriVision'

In [5]:
'''To split large dataset into many batches, only needs to be run once'''
# NOTE: disabled one-off step, kept for reference; only the string literal above
# is evaluated by this cell.
# NOTE(review): `len(file_names)//k` is a floor division, so a final partial
# batch would not be counted in the message -- verify against directorySplit
# before re-enabling.
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [6]:
'''List every entry of the raw-data folder whose name contains RAW_DATA_SUFFIX'''
# One name per line; dot-files are NOT filtered out here, so hidden entries
# that contain the suffix are listed as well.
print(''.join(entry + "\n" for entry in os.listdir(data_dir) if RAW_DATA_SUFFIX in entry))


toy-coco-cropped-Indoor
._toy-coco-cropped-Indoor



In [7]:
import numpy as np

def npz_opener_pickle(path):
    """Load an image as a float32 RGB array with uniform jitter applied.

    Despite the name, no ``.npz`` or pickle file is involved: the file is
    opened with ``Image.open`` and converted to RGB.
    NOTE(review): ``Image`` is presumably ``PIL.Image`` brought in by the
    ``from transform import *`` cell -- confirm.

    Parameters
    ----------
    path : path-like
        Location of the image file, passed straight to ``Image.open``.

    Returns
    -------
    numpy.ndarray
        float32 pixel values with independent noise drawn from
        U(-0.5, 0.5) added to every element, clipped to [0, 255].
        The noise comes from NumPy's global random state, so results are
        only reproducible if that state is seeded by the caller.
    """
    pixels = np.array(Image.open(path).convert('RGB'), dtype=np.float32)
    noise = np.random.uniform(-0.5, 0.5, pixels.shape)
    # In-place add keeps the result in float32 (the float64 noise is cast down).
    np.add(pixels, noise, out=pixels, casting="same_kind")
    return np.clip(pixels, 0, 255)

# Wavelet

In [8]:
FINAL_DATA_NAME = 'coco-indoor-wavelet'

# Input folder for the wavelet cells: the toy subset when no batch number is
# configured, otherwise the numbered batch folder.
# NOTE(review): the toy path sits under "sandbox/raw-data" while the batch path
# sits under "raw-data" (no "sandbox") -- confirm this asymmetry is intended.
batch_dir = (
    os.path.join(ROOT_DIR, "sandbox", "raw-data", "coco", f"toy-{RAW_DATA_SUFFIX}")
    if BATCH_NUM is None
    else os.path.join(ROOT_DIR, "raw-data", "coco", f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")
)

In [None]:
import gc

TRANSFORM = "wavelet"
channel = "red"

# Transform the whole batch once for this channel; the resulting frame is then
# split by orientation below. The columns used here are 'orientation', 'layer',
# 'data' and 'frequency'.
channel_wv_full = convert_to_wavelet_basis(
    batch_dir,
    channel,
    debug=True,
    image_opener=npz_opener_pickle,
)
# Make sure every coefficient array is stored as float32.
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda coeffs: coeffs.astype(np.float32))

# Filename prefix ('' for the toy subset, 'batch<N>' otherwise) and output folder.
batch_prefix = '' if BATCH_NUM is None else f'batch{BATCH_NUM}'
transformed_dir = os.path.join(ROOT_DIR, "transformed-data")

for orientation_label, orientation_code in (('horizontal', 'H'), ('vertical', 'V')):
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"

    # Rows of the current orientation only (copied so the full frame is untouched).
    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            transformed_dir,
            f"dataframes/{batch_prefix}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle",
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group = 2
    max_group = max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        layer_rows = channel_wv[channel_wv['layer'] == group]
        if layer_rows.empty:
            # No row for this layer in this orientation: nothing to record.
            continue

        # Only the first matching row is used.
        data = layer_rows['data'].iloc[0]

        # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
        # positions of the sorted coefficients.
        n_keep = min(data.size, CONSTANT_SAMPLE_SIZE)
        sample_idx = np.round(np.linspace(0, data.size - 1, n_keep)).astype(int)

        group_data_map[group] = np.sort(data)[sample_idx]
        group_data_map_size[group] = data.size

        # NOTE: the key has no orientation or channel component, so every
        # orientation/channel pass writes to the same (dataset, transform, group) row.
        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = layer_rows['frequency'].iloc[0]

    freq_df.to_csv(os.path.join(transformed_dir, "master-frequency-map.csv"))

    group_save_base = os.path.join(
        transformed_dir,
        f"{batch_prefix}{FINAL_DATA_NAME_ORIENTED}-{channel}",
    )
    pd.to_pickle(group_data_map, group_save_base + ".pickle")
    pd.to_pickle(group_data_map_size, group_save_base + "-size.pickle")

    # Release the per-orientation objects before the next pass.
    del channel_wv, group_data_map, group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

0

In [None]:
import gc

TRANSFORM = "wavelet"
channel = "green"

# Transform the whole batch once for this channel; the resulting frame is then
# split by orientation below. The columns used here are 'orientation', 'layer',
# 'data' and 'frequency'.
channel_wv_full = convert_to_wavelet_basis(
    batch_dir,
    channel,
    debug=True,
    image_opener=npz_opener_pickle,
)
# Make sure every coefficient array is stored as float32.
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda coeffs: coeffs.astype(np.float32))

# Filename prefix ('' for the toy subset, 'batch<N>' otherwise) and output folder.
batch_prefix = '' if BATCH_NUM is None else f'batch{BATCH_NUM}'
transformed_dir = os.path.join(ROOT_DIR, "transformed-data")

for orientation_label, orientation_code in (('horizontal', 'H'), ('vertical', 'V')):
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"

    # Rows of the current orientation only (copied so the full frame is untouched).
    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            transformed_dir,
            f"dataframes/{batch_prefix}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle",
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group = 2
    max_group = max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        layer_rows = channel_wv[channel_wv['layer'] == group]
        if layer_rows.empty:
            # No row for this layer in this orientation: nothing to record.
            continue

        # Only the first matching row is used.
        data = layer_rows['data'].iloc[0]

        # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
        # positions of the sorted coefficients.
        n_keep = min(data.size, CONSTANT_SAMPLE_SIZE)
        sample_idx = np.round(np.linspace(0, data.size - 1, n_keep)).astype(int)

        group_data_map[group] = np.sort(data)[sample_idx]
        group_data_map_size[group] = data.size

        # NOTE: the key has no orientation or channel component, so every
        # orientation/channel pass writes to the same (dataset, transform, group) row.
        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = layer_rows['frequency'].iloc[0]

    freq_df.to_csv(os.path.join(transformed_dir, "master-frequency-map.csv"))

    group_save_base = os.path.join(
        transformed_dir,
        f"{batch_prefix}{FINAL_DATA_NAME_ORIENTED}-{channel}",
    )
    pd.to_pickle(group_data_map, group_save_base + ".pickle")
    pd.to_pickle(group_data_map_size, group_save_base + "-size.pickle")

    # Release the per-orientation objects before the next pass.
    del channel_wv, group_data_map, group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

0

In [None]:
TRANSFORM = "wavelet"
channel = "blue"

# Run the wavelet transform ONCE for this channel and reuse the result for both
# orientations. The call does not depend on the orientation, and because the
# image opener adds random jitter, calling it once per orientation would also
# derive the horizontal and vertical results from different noise draws.
# Columns used below: 'orientation', 'layer', 'data', 'frequency'.
channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
# Store every coefficient array as float32.
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    # Rows of the current orientation only (copied so the full frame is untouched).
    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        # The file name carries the orientation, so the orientation-filtered
        # frame is what gets pickled (same as the red/green cells).
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            # No row for this layer in this orientation: skip it instead of
            # failing on `.iloc[0]` (same guard as the red/green cells).
            continue

        # Only the first matching row is used.
        data = filtered['data'].iloc[0]

        # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
        # positions of the sorted coefficients.
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        # NOTE: the key has no orientation or channel component, so every
        # orientation/channel pass writes to the same (dataset, transform, group) row.
        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = filtered['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    # Release the per-orientation objects before the next pass.
    del channel_wv
    del group_data_map
    del group_data_map_size

# The full frame is no longer needed once both orientations are written.
del channel_wv_full


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-18656.088, -13917.058, -13473.333, ...,  18541.072,  18542.385,
        19473.973], shape=(1604,), dtype=float32), np.int64(3): array([-11489.305, -10831.345, -10765.515, ...,  10922.157,  11048.591,
        11585.475], shape=(6416,), dtype=float32), np.int64(4): array([-6686.0264, -6631.2876, -6623.013 , ...,  6230.6436,  7057.608 ,
        7155.0044], shape=(25664,), dtype=float32), np.int64(5): array([-3415.0684, -3306.3215, -3272.6975, ...,  3212.463 ,  3222.9172,
        3244.3333], shape=(100000,), dtype=float32), np.int64(6): array([-1752.1951, -1712.0693, -1647.8092, ...,  1586.6711,  1642.568 ,
        1718.9742], shape=(100000,), dtype=float32), np.int64(7): array([-904.8463 , -814.9612 , -786.8406 , ...,  777.67   ,  811.8638 ,
        889.41144], shape=(100000,), dtype=float32), np.int64(8): array([-468.88113, -414.365  , -398.99982, ...,  392.37805,  408.50107,
        484.63342], shape=(100000,), dtype=float32), np.int64(9): array([-250.79

  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-18437.377, -15845.146, -15433.076, ...,  15697.648,  16373.958,
        19876.428], shape=(1604,), dtype=float32), np.int64(3): array([-12940.436, -12440.522, -11031.544, ...,   9259.625,   9426.131,
         9618.772], shape=(6416,), dtype=float32), np.int64(4): array([-6277.2563, -5966.409 , -5947.389 , ...,  5699.646 ,  6210.4897,
        6453.772 ], shape=(25664,), dtype=float32), np.int64(5): array([-3379.3005, -3200.5396, -3187.8923, ...,  3353.6882,  3424.1716,
        3624.729 ], shape=(100000,), dtype=float32), np.int64(6): array([-1765.1881, -1690.8456, -1643.9956, ...,  1623.593 ,  1659.1816,
        1802.0055], shape=(100000,), dtype=float32), np.int64(7): array([-896.9412 , -812.8974 , -785.7233 , ...,  784.42816,  808.7185 ,
        917.4385 ], shape=(100000,), dtype=float32), np.int64(8): array([-461.11572, -411.02817, -393.9123 , ...,  396.0226 ,  411.9547 ,
        472.73605], shape=(100000,), dtype=float32), np.int64(9): array([-252.16

In [None]:
TRANSFORM = "wavelet"
channel = "gray"

# Run the wavelet transform ONCE for this channel and reuse the result for both
# orientations. The call does not depend on the orientation, and because the
# image opener adds random jitter, calling it once per orientation would also
# derive the horizontal and vertical results from different noise draws.
# Columns used below: 'orientation', 'layer', 'data', 'frequency'.
channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
# Store every coefficient array as float32.
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    # Rows of the current orientation only (copied so the full frame is untouched).
    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        # The file name carries the orientation, so the orientation-filtered
        # frame is what gets pickled (same as the red/green cells).
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            # No row for this layer in this orientation: skip it instead of
            # failing on `.iloc[0]` (same guard as the red/green cells).
            continue

        # Only the first matching row is used.
        data = filtered['data'].iloc[0]

        # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
        # positions of the sorted coefficients.
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        # NOTE: the key has no orientation or channel component, so every
        # orientation/channel pass writes to the same (dataset, transform, group) row.
        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = filtered['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    # Release the per-orientation objects before the next pass.
    del channel_wv
    del group_data_map
    del group_data_map_size

# The full frame is no longer needed once both orientations are written.
del channel_wv_full


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-18046.613, -15119.559, -13620.447, ...,  17981.896,  18238.174,
        20481.959], shape=(1604,), dtype=float32), np.int64(3): array([-10890.692 ,  -9433.987 ,  -9411.006 , ...,   9435.8125,
         9521.042 ,  11994.982 ], shape=(6416,), dtype=float32), np.int64(4): array([-6685.554 , -6635.5825, -6432.157 , ...,  6132.727 ,  6484.183 ,
        6842.812 ], shape=(25664,), dtype=float32), np.int64(5): array([-3341.0396, -3243.3457, -3209.3042, ...,  3124.2651,  3183.786 ,
        3244.1199], shape=(100000,), dtype=float32), np.int64(6): array([-1725.7256, -1684.8843, -1659.4983, ...,  1560.9254,  1632.7987,
        1700.2706], shape=(100000,), dtype=float32), np.int64(7): array([-906.2212 , -805.41956, -782.49506, ...,  797.06744,  815.0722 ,
        869.3485 ], shape=(100000,), dtype=float32), np.int64(8): array([-468.23703, -414.70465, -400.29276, ...,  394.684  ,  408.23486,
        465.38712], shape=(100000,), dtype=float32), np.int64(9): array([-

  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-20034.842, -18218.154, -14094.517, ...,  14483.253,  17928.504,
        20527.865], shape=(1604,), dtype=float32), np.int64(3): array([-10809.566, -10508.74 , -10174.616, ...,   9099.924,   9140.568,
         9511.425], shape=(6416,), dtype=float32), np.int64(4): array([-5983.3457, -5945.2744, -5802.392 , ...,  5222.1807,  6135.6943,
        6481.74  ], shape=(25664,), dtype=float32), np.int64(5): array([-3350.8206, -3325.016 , -3287.9976, ...,  2848.9258,  2931.7576,
        2964.2046], shape=(100000,), dtype=float32), np.int64(6): array([-1739.7202, -1615.5826, -1606.6655, ...,  1571.6953,  1660.6096,
        1747.1045], shape=(100000,), dtype=float32), np.int64(7): array([-895.99713, -804.96387, -779.21564, ...,  771.47577,  800.4011 ,
        971.67914], shape=(100000,), dtype=float32), np.int64(8): array([-457.8002 , -411.50778, -395.4494 , ...,  395.5403 ,  408.8486 ,
        480.00577], shape=(100000,), dtype=float32), np.int64(9): array([-251.90