In [1]:
import git
from pathlib import Path
import os

ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

DATASET = "coco"
FINAL_DATA_NAME = 'coco-outdoor-wavelet'
CONSTANT_SAMPLE_SIZE = int(1e5)
RAW_DATA_SUFFIX = "coco-outdoor-cropped-normalized"
SAVE_DF = False

data_dir = os.path.join(ROOT_DIR, 'raw-data','coco')
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(data_dir)
data_dir
BATCH_NUM = None

In [2]:
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))
freq_df = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", f"master-frequency-map.csv"), index_col= ["dataset", "transform", "group"])


In [3]:
file_names = [
    filename
    for filename in os.listdir(os.path.join(data_dir, f"{RAW_DATA_SUFFIX}"))
    if not filename.startswith(".")
]

file_list = [
    os.path.join(data_dir, f"{RAW_DATA_SUFFIX}", filename)
    for filename in file_names
]

file_names[:5]


['outdoor_000000000285.npz',
 'outdoor_000000000724.npz',
 'outdoor_000000000785.npz',
 'outdoor_000000000872.npz',
 'outdoor_000000000885.npz']

In [4]:
'''Assuming No batching is required. Not applicable for agriVision'''

# data_dir = os.path.join(ROOT_DIR, "raw-data", "agriVision", "full-agriVision-RGB-cleaned")

# for channel in ['red', 'blue', 'green', 'gray', 'infrared']:

#     channel_fr = convert_to_fourier_basis(data_dir, channel, debug = True)
#     pd.to_pickle(channel_fr, os.path.join(ROOT_DIR, "transformed-data", f"full-agriVision-fourier-{channel}-df.pickle"))

#     min_group, max_group = 2, max(channel_fr['band'])
#     group_data_map = dict()
#     group_data_map_size = dict()
#     for group in np.arange(min_group, max_group + 1):
#         data = channel_fr[(channel_fr['band'] == group)]['data'].iloc[0]
#         group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)] 
#         group_data_map_size[group] = data.size
    
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}.pickle"))
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}-size.pickle"))
    

'Assuming No batching is required. Not applicable for agriVision'

In [5]:
'''To split large dataset into many batches, only needs to be run once'''
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [6]:
'''Show all subsets of data in raw data folder that have already been created'''
print(''.join([x+"\n" for x in os.listdir(data_dir) if x.__contains__(RAW_DATA_SUFFIX)]))


coco-outdoor-cropped-normalized



In [7]:
import numpy as np

def npz_opener_pickle(path):
    # Load the .npz file
    with np.load(path) as data:
        arr = data['image']  # Default key if saved without naming the array

    arr = arr.astype(np.float32)
    
    # Apply jitter
    jitter = np.random.uniform(-0.5, 0.5, arr.shape)
    arr += jitter
    arr = np.clip(arr, 0, 255)
    
    return arr


# Wavelet

In [8]:
FINAL_DATA_NAME = 'coco-outdoor-wavelet'
if BATCH_NUM is None:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"{RAW_DATA_SUFFIX}")
else:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")

In [9]:
import gc

TRANSFORM = "wavelet"
channel = "red"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-outdoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

0

In [10]:
import gc

TRANSFORM = "wavelet"
channel = "green"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-outdoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

0

In [11]:
TRANSFORM = "wavelet"
channel = "blue"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-outdoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-422.43558, -296.65897, -284.92206, ...,  336.5459 ,  354.8944 ,
        398.1622 ], dtype=float32), np.int64(3): array([-210.37883, -175.50592, -175.06528, ...,  191.41302,  194.44191,
        199.10428], dtype=float32), np.int64(4): array([-111.24049, -103.54764, -102.33232, ...,  106.91356,  107.54587,
        111.87244], dtype=float32), np.int64(5): array([-54.95793 , -54.50914 , -53.489067, ...,  54.204308,  56.107887,
        57.162865], dtype=float32), np.int64(6): array([-29.474308, -28.347984, -27.541843, ...,  27.370152,  28.186024,
        30.114313], dtype=float32), np.int64(7): array([-15.403011, -14.298624, -14.034199, ...,  14.887789,  15.069923,
        15.51539 ], dtype=float32), np.int64(8): array([-8.070071 , -7.4080553, -7.2497315, ...,  7.1290865,  7.2992315,
        7.824546 ], dtype=float32), np.int64(9): array([-4.3054457, -4.0546207, -3.9765482, ...,  3.9377124,  4.0202265,
        4.3197994], dtype=float32)}
9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-292.85895, -218.22467, -214.47041, ...,  265.03024,  286.28748,
        288.63464], dtype=float32), np.int64(3): array([-165.75829, -162.91586, -157.63864, ...,  165.98592,  170.98074,
        184.14436], dtype=float32), np.int64(4): array([-102.07339,  -97.34307,  -97.15141, ...,   93.38152,   94.39914,
         95.41106], dtype=float32), np.int64(5): array([-57.293587, -53.673096, -52.482975, ...,  52.198414,  56.077602,
        58.741753], dtype=float32), np.int64(6): array([-28.227037, -27.498676, -26.865114, ...,  26.55831 ,  27.460215,
        28.40338 ], dtype=float32), np.int64(7): array([-15.051987, -14.184348, -13.789672, ...,  13.853134,  14.150279,
        15.24598 ], dtype=float32), np.int64(8): array([-7.9758153, -7.3700895, -7.191359 , ...,  7.085859 ,  7.274251 ,
        7.9877253], dtype=float32), np.int64(9): array([-4.324072 , -4.0582585, -3.9774332, ...,  3.9645524,  4.04367  ,
        4.3177786], dtype=float32)}
9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-210.16107, -169.18207, -156.10913, ...,  154.16127,  165.36575,
        167.46085], dtype=float32), np.int64(3): array([-96.7817  , -88.30004 , -85.82272 , ...,  91.648415,  93.03665 ,
       100.59108 ], dtype=float32), np.int64(4): array([-62.194435, -56.748814, -55.60083 , ...,  57.402298,  57.422585,
        58.65851 ], dtype=float32), np.int64(5): array([-31.845484, -30.097033, -29.282936, ...,  29.409954,  31.86261 ,
        32.173138], dtype=float32), np.int64(6): array([-18.102413, -16.702972, -15.706913, ...,  15.257603,  15.717578,
        17.326885], dtype=float32), np.int64(7): array([-9.841684 , -8.304268 , -7.9268475, ...,  7.6713943,  8.2335205,
        9.957454 ], dtype=float32), np.int64(8): array([-5.666997 , -4.7185035, -4.4592824, ...,  4.4285717,  4.6269674,
        5.5622253], dtype=float32), np.int64(9): array([-4.2041283, -3.2990875, -3.1167479, ...,  3.130167 ,  3.3436477,
        4.162753 ], dtype=float32)}


In [12]:
TRANSFORM = "wavelet"
channel = "gray"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-outdoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-415.14264, -320.7126 , -304.59628, ...,  350.46967,  357.34518,
        387.3649 ], dtype=float32), np.int64(3): array([-208.8021 , -157.6937 , -156.72385, ...,  194.46358,  197.00255,
        205.25179], dtype=float32), np.int64(4): array([-100.74018,  -90.40927,  -88.71211, ...,  104.64858,  107.72949,
        108.12534], dtype=float32), np.int64(5): array([-52.56305 , -51.017727, -49.807095, ...,  53.81147 ,  54.65492 ,
        54.657352], dtype=float32), np.int64(6): array([-28.531221, -27.89729 , -27.05191 , ...,  26.50343 ,  26.830542,
        28.871508], dtype=float32), np.int64(7): array([-14.552585, -13.902477, -13.49639 , ...,  14.317672,  14.796275,
        15.029884], dtype=float32), np.int64(8): array([-7.595953 , -7.101404 , -6.96275  , ...,  6.8406796,  7.0135164,
        7.4805017], dtype=float32), np.int64(9): array([-4.074268 , -3.801346 , -3.7373464, ...,  3.7251546,  3.7947822,
        4.1058774], dtype=float32)}
9 layers being used


  0%|          | 0/2006 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-288.72623, -251.18277, -243.74817, ...,  246.75299,  264.31674,
        328.78088], dtype=float32), np.int64(3): array([-176.78624, -166.6656 , -156.78073, ...,  174.71056,  177.87167,
        182.39937], dtype=float32), np.int64(4): array([-99.25209 , -97.07921 , -92.95105 , ...,  91.16574 ,  94.126564,
        94.944786], dtype=float32), np.int64(5): array([-54.929985, -51.6843  , -50.743706, ...,  52.3684  ,  54.32162 ,
        56.154957], dtype=float32), np.int64(6): array([-27.281096, -26.798365, -26.552948, ...,  25.425812,  26.295454,
        27.096546], dtype=float32), np.int64(7): array([-14.6117935, -13.536695 , -13.240603 , ...,  13.250528 ,
        13.717231 ,  14.624173 ], dtype=float32), np.int64(8): array([-7.5894923, -7.095876 , -6.9061656, ...,  6.84876  ,  7.021058 ,
        7.5134   ], dtype=float32), np.int64(9): array([-4.088404 , -3.816437 , -3.757285 , ...,  3.734187 ,  3.798826 ,
        4.0970984], dtype=float32)}
9 layers being

  0%|          | 0/2006 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-196.70172, -153.94421, -139.84595, ...,  141.78932,  154.55484,
        157.39832], dtype=float32), np.int64(3): array([-98.546776, -82.49111 , -81.20995 , ...,  74.220924,  78.78026 ,
        80.984474], dtype=float32), np.int64(4): array([-59.911884, -46.01485 , -45.02544 , ...,  50.858856,  52.777786,
        60.17029 ], dtype=float32), np.int64(5): array([-35.781895, -30.53305 , -25.9118  , ...,  26.547308,  28.786488,
        28.87988 ], dtype=float32), np.int64(6): array([-16.799593 , -15.157442 , -14.0464525, ...,  14.892853 ,
        15.595773 ,  17.145672 ], dtype=float32), np.int64(7): array([-9.325859 , -7.8124566, -7.3611956, ...,  7.274867 ,  7.756201 ,
        9.4278965], dtype=float32), np.int64(8): array([-5.2367806, -4.423679 , -4.2083635, ...,  4.195293 ,  4.408547 ,
        5.2995944], dtype=float32), np.int64(9): array([-3.8839817, -3.093039 , -2.9087927, ...,  2.9530442,  3.136388 ,
        3.926236 ], dtype=float32)}
