In [1]:
import git
from pathlib import Path
import os

ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

DATASET = "coco"
FINAL_DATA_NAME = 'coco-indoor-wavelet'
CONSTANT_SAMPLE_SIZE = int(1e5)
RAW_DATA_SUFFIX = "coco-indoor-cropped-normalized"
SAVE_DF = False

data_dir = os.path.join(ROOT_DIR, 'raw-data','coco')
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(data_dir)
data_dir
BATCH_NUM = None



In [2]:
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))
freq_df = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", f"master-frequency-map.csv"), index_col= ["dataset", "transform", "group"])


In [3]:
file_names = [
    filename
    for filename in os.listdir(os.path.join(data_dir, f"{RAW_DATA_SUFFIX}"))
    if not filename.startswith(".")
]

file_list = [
    os.path.join(data_dir, f"{RAW_DATA_SUFFIX}", filename)
    for filename in file_names
]

file_names[:5]


['indoor_000000000139.npz',
 'indoor_000000000632.npz',
 'indoor_000000000776.npz',
 'indoor_000000000802.npz',
 'indoor_000000001296.npz']

In [4]:
'''Assuming No batching is required. Not applicable for agriVision'''

# data_dir = os.path.join(ROOT_DIR, "raw-data", "agriVision", "full-agriVision-RGB-cleaned")

# for channel in ['red', 'blue', 'green', 'gray', 'infrared']:

#     channel_fr = convert_to_fourier_basis(data_dir, channel, debug = True)
#     pd.to_pickle(channel_fr, os.path.join(ROOT_DIR, "transformed-data", f"full-agriVision-fourier-{channel}-df.pickle"))

#     min_group, max_group = 2, max(channel_fr['band'])
#     group_data_map = dict()
#     group_data_map_size = dict()
#     for group in np.arange(min_group, max_group + 1):
#         data = channel_fr[(channel_fr['band'] == group)]['data'].iloc[0]
#         group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)] 
#         group_data_map_size[group] = data.size
    
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}.pickle"))
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}-size.pickle"))
    

'Assuming No batching is required. Not applicable for agriVision'

In [5]:
'''To split large dataset into many batches, only needs to be run once'''
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [6]:
'''Show all subsets of data in raw data folder that have already been created'''
print(''.join([x+"\n" for x in os.listdir(data_dir) if x.__contains__(RAW_DATA_SUFFIX)]))


coco-indoor-cropped-normalized



In [7]:
import numpy as np

def npz_opener_pickle(path):
    # Load the .npz file
    with np.load(path) as data:
        arr = data['image']  # Default key if saved without naming the array

    arr = arr.astype(np.float32)
    
    # Apply jitter
    jitter = np.random.uniform(-0.5, 0.5, arr.shape)
    arr += jitter
    arr = np.clip(arr, 0, 255)
    
    return arr


# Wavelet

In [8]:
FINAL_DATA_NAME = 'coco-indoor-wavelet'
if BATCH_NUM is None:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"{RAW_DATA_SUFFIX}")
else:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")

In [9]:
import gc

TRANSFORM = "wavelet"
channel = "red"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

: 

In [10]:
import gc

TRANSFORM = "wavelet"
channel = "green"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

0

In [11]:
TRANSFORM = "wavelet"
channel = "blue"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-251.82898, -228.83157, -226.35397, ...,  227.44359,  230.37117,
        319.68768], dtype=float32), np.int64(3): array([-159.69775, -150.33289, -144.18079, ...,  162.3812 ,  167.34256,
        200.5624 ], dtype=float32), np.int64(4): array([-101.61334,  -96.66931,  -93.65338, ...,   94.35974,   94.73074,
        100.94613], dtype=float32), np.int64(5): array([-49.256405, -48.917606, -48.912422, ...,  46.403366,  46.682564,
        46.89617 ], dtype=float32), np.int64(6): array([-26.573685, -25.482971, -24.83669 , ...,  24.831057,  25.2959  ,
        26.291546], dtype=float32), np.int64(7): array([-13.683502, -12.996016, -12.564633, ...,  12.405188,  12.694693,
        13.530101], dtype=float32), np.int64(8): array([-7.227001 , -6.669417 , -6.493014 , ...,  6.4137053,  6.571144 ,
        7.1398487], dtype=float32), np.int64(9): array([-3.8906963, -3.616777 , -3.533586 , ...,  3.526249 ,  3.6064172,
        3.896512 ], dtype=float32)}
9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-287.62152, -262.60492, -256.65277, ...,  277.90027,  288.77338,
        303.28226], dtype=float32), np.int64(3): array([-154.17052, -153.50735, -150.00531, ...,  145.39716,  154.25925,
        155.35533], dtype=float32), np.int64(4): array([-95.79519 , -93.29342 , -88.4875  , ...,  89.495415,  91.6702  ,
        92.66308 ], dtype=float32), np.int64(5): array([-51.94559 , -49.654755, -49.446083, ...,  47.89184 ,  49.26031 ,
        49.470646], dtype=float32), np.int64(6): array([-27.17032 , -25.409565, -24.804907, ...,  24.18994 ,  24.997122,
        26.313366], dtype=float32), np.int64(7): array([-13.2053795, -12.598123 , -12.19962  , ...,  12.404572 ,
        12.721636 ,  13.503652 ], dtype=float32), np.int64(8): array([-7.3426266, -6.565537 , -6.4021273, ...,  6.4031553,  6.578902 ,
        7.1682835], dtype=float32), np.int64(9): array([-3.8558877, -3.5851855, -3.5078993, ...,  3.4832351,  3.5825667,
        3.8980029], dtype=float32)}
9 layers being

  0%|          | 0/1604 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-184.46024, -158.34308, -140.92801, ...,  138.97397,  146.44958,
        149.96622], dtype=float32), np.int64(3): array([-112.76155 , -101.693726,  -88.605385, ...,   81.93246 ,
         88.39295 ,   92.710724], dtype=float32), np.int64(4): array([-53.570312, -52.00907 , -49.8506  , ...,  44.713432,  49.031773,
        49.129326], dtype=float32), np.int64(5): array([-28.592522, -26.437925, -26.01212 , ...,  27.82864 ,  29.973656,
        36.32309 ], dtype=float32), np.int64(6): array([-15.536121, -14.815262, -13.300306, ...,  12.785673,  13.723051,
        15.125103], dtype=float32), np.int64(7): array([-8.513971 , -6.9113674, -6.5949607, ...,  6.317406 ,  6.8700237,
        8.172414 ], dtype=float32), np.int64(8): array([-5.566472 , -4.0937486, -3.8067608, ...,  3.7603827,  3.9599862,
        5.5112076], dtype=float32), np.int64(9): array([-3.7129433, -2.6747708, -2.4762883, ...,  2.5015645,  2.6860256,
        3.6789432], dtype=float32)}


In [12]:
TRANSFORM = "wavelet"
channel = "gray"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener_pickle)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-304.87573, -282.91327, -268.65845, ...,  286.0272 ,  290.2645 ,
        378.17932], dtype=float32), np.int64(3): array([-204.46742, -179.72356, -175.6449 , ...,  194.51266,  203.67924,
        235.22299], dtype=float32), np.int64(4): array([-120.30176 , -111.84119 , -106.71191 , ...,  116.12464 ,
        117.05783 ,  119.482956], dtype=float32), np.int64(5): array([-58.908924, -58.043015, -57.81529 , ...,  53.71816 ,  53.75927 ,
        55.994316], dtype=float32), np.int64(6): array([-30.558205, -30.059416, -29.381016, ...,  29.107008,  29.729864,
        31.23067 ], dtype=float32), np.int64(7): array([-15.604708, -14.68929 , -14.320933, ...,  14.355756,  14.744203,
        15.265714], dtype=float32), np.int64(8): array([-8.098208 , -7.5935025, -7.4320602, ...,  7.3458686,  7.583283 ,
        8.257872 ], dtype=float32), np.int64(9): array([-4.3100977, -4.019562 , -3.935918 , ...,  3.9213746,  4.0075493,
        4.4201946], dtype=float32)}
9 layers being

  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-313.286  , -279.3526 , -274.76392, ...,  281.72278,  315.69327,
        323.6844 ], dtype=float32), np.int64(3): array([-215.61388, -187.73459, -177.39432, ...,  170.73833,  177.18047,
        177.59546], dtype=float32), np.int64(4): array([-113.5234 , -113.09408, -111.69312, ...,  108.30007,  112.46825,
        116.31825], dtype=float32), np.int64(5): array([-60.599964, -60.264446, -60.160618, ...,  56.843315,  58.423985,
        58.523373], dtype=float32), np.int64(6): array([-31.809727, -29.70343 , -29.257376, ...,  29.002195,  29.278301,
        31.501616], dtype=float32), np.int64(7): array([-15.769814, -15.026432, -14.585868, ...,  14.237423,  14.676295,
        15.78852 ], dtype=float32), np.int64(8): array([-8.061981 , -7.564883 , -7.3301725, ...,  7.4161124,  7.6157727,
        8.154937 ], dtype=float32), np.int64(9): array([-4.4229026, -4.0197654, -3.935263 , ...,  3.897871 ,  3.9935   ,
        4.3497276], dtype=float32)}
9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-194.80089, -170.71121, -153.52812, ...,  165.51677,  167.30078,
        207.76955], dtype=float32), np.int64(3): array([-138.83203, -105.87509, -102.42499, ...,   85.75875,   87.54909,
         91.08873], dtype=float32), np.int64(4): array([-62.78834 , -61.81336 , -60.459106, ...,  48.492683,  51.83682 ,
        58.99973 ], dtype=float32), np.int64(5): array([-32.930264, -30.863234, -28.471476, ...,  28.52134 ,  31.315874,
        31.394108], dtype=float32), np.int64(6): array([-17.703634, -15.017699, -14.841733, ...,  14.846697,  15.507093,
        16.645191], dtype=float32), np.int64(7): array([-9.670558 , -7.5887337, -7.184431 , ...,  7.109892 ,  7.694028 ,
        9.369997 ], dtype=float32), np.int64(8): array([-5.656046 , -4.532997 , -4.215484 , ...,  4.1565433,  4.4757423,
        6.00015  ], dtype=float32), np.int64(9): array([-4.016829 , -2.8986893, -2.6669817, ...,  2.6846423,  2.8904102,
        4.0925107], dtype=float32)}
