In [23]:
import git
from pathlib import Path
import os

ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

DATASET = "coco"
FINAL_DATA_NAME = 'coco-indoor-wavelet'
CONSTANT_SAMPLE_SIZE = int(1e5)
RAW_DATA_SUFFIX = "coco-indoor-cropped"
SAVE_DF = False

data_dir = os.path.join(ROOT_DIR, 'raw-data','coco')
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(data_dir)
data_dir
BATCH_NUM = None



In [24]:
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))
freq_df = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", f"master-frequency-map.csv"), index_col= ["dataset", "transform", "group"])


In [25]:
file_names = [
    filename
    for filename in os.listdir(os.path.join(data_dir, f"{RAW_DATA_SUFFIX}"))
    if not filename.startswith(".")
]

file_list = [
    os.path.join(data_dir, f"{RAW_DATA_SUFFIX}", filename)
    for filename in file_names
]

file_names[:5]


['000000182611.jpg',
 '000000479126.jpg',
 '000000304396.jpg',
 '000000231339.jpg',
 '000000377393.jpg']

In [26]:
'''Assuming No batching is required. Not applicable for agriVision'''

# data_dir = os.path.join(ROOT_DIR, "raw-data", "agriVision", "full-agriVision-RGB-cleaned")

# for channel in ['red', 'blue', 'green', 'gray', 'infrared']:

#     channel_fr = convert_to_fourier_basis(data_dir, channel, debug = True)
#     pd.to_pickle(channel_fr, os.path.join(ROOT_DIR, "transformed-data", f"full-agriVision-fourier-{channel}-df.pickle"))

#     min_group, max_group = 2, max(channel_fr['band'])
#     group_data_map = dict()
#     group_data_map_size = dict()
#     for group in np.arange(min_group, max_group + 1):
#         data = channel_fr[(channel_fr['band'] == group)]['data'].iloc[0]
#         group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)] 
#         group_data_map_size[group] = data.size
    
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}.pickle"))
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}-size.pickle"))
    

'Assuming No batching is required. Not applicable for agriVision'

In [27]:
'''To split large dataset into many batches, only needs to be run once'''
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [28]:
'''Show all subsets of data in raw data folder that have already been created'''
print(''.join([x+"\n" for x in os.listdir(data_dir) if x.__contains__(RAW_DATA_SUFFIX)]))


coco-indoor-cropped



In [36]:
import numpy as np
np.random.seed(42)  
def jpg_opener(path):
    
    # Apply jitter
    image = np.array(Image.open(path).convert('RGB'))
    arr = image.astype(np.float64)
    jitter = np.random.uniform(-0.5, 0.5, arr.shape)
    arr += jitter
    arr = arr - np.mean(arr)
    arr = arr / np.std(arr)

    
    return arr


# Wavelet

In [37]:
FINAL_DATA_NAME = 'coco-indoor-wavelet'
if BATCH_NUM is None:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"{RAW_DATA_SUFFIX}")
else:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "coco", f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")

In [38]:
import gc

TRANSFORM = "wavelet"
channel = "red"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=jpg_opener)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

0

In [39]:
import gc

TRANSFORM = "wavelet"
channel = "green"

channel_wv_full = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=jpg_opener)
channel_wv_full['data'] = channel_wv_full['data'].apply(lambda x: x.astype(np.float32))  # if needed, or skip if you jittered already

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = channel_wv_full[channel_wv_full['orientation'] == orientation_code].copy()

    if SAVE_DF:
        df_save_path = os.path.join(
            ROOT_DIR,
            "transformed-data",
            f"dataframes/{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"
        )
        pd.to_pickle(channel_wv, df_save_path)

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = {}
    group_data_map_size = {}

    for group in np.arange(min_group, max_group + 1):
        filtered = channel_wv[channel_wv['layer'] == group]
        if filtered.empty:
            continue

        data = filtered['data'].iloc[0]
        sampled = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]

        group_data_map[group] = sampled
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    freq_df_save_path = os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")
    freq_df.to_csv(freq_df_save_path)

    group_save_base = os.path.join(
        ROOT_DIR,
        "transformed-data",
        f"{'' if BATCH_NUM is None else f'batch{BATCH_NUM}'}{FINAL_DATA_NAME_ORIENTED}-{channel}"
    )
    pd.to_pickle(group_data_map, f"{group_save_base}.pickle")
    pd.to_pickle(group_data_map_size, f"{group_save_base}-size.pickle")

    del channel_wv
    del group_data_map
    del group_data_map_size
    gc.collect()

del channel_wv_full
gc.collect()

9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

0

In [40]:
TRANSFORM = "wavelet"
channel = "blue"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True,  image_opener=jpg_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-209.14372, -205.32481, -193.03166, ...,  232.02325,  234.02238,
        242.3119 ], dtype=float32), np.int64(3): array([-165.42207, -156.1896 , -144.4635 , ...,  154.56436,  155.71172,
        172.385  ], dtype=float32), np.int64(4): array([-111.452065, -107.44388 , -105.83926 , ...,   94.76358 ,
         96.86688 ,   97.78854 ], dtype=float32), np.int64(5): array([-118.35611 ,  -65.309845,  -64.08341 , ...,   52.69262 ,
         54.47514 ,   62.05505 ], dtype=float32), np.int64(6): array([-39.600914, -31.569727, -30.676874, ...,  26.044779,  29.158464,
        43.438374], dtype=float32), np.int64(7): array([-23.702944, -17.20051 , -15.413932, ...,  14.36641 ,  16.539337,
        25.011766], dtype=float32), np.int64(8): array([-12.406773 ,  -7.963714 ,  -7.288097 , ...,   7.1700425,
         7.766318 ,  15.295829 ], dtype=float32), np.int64(9): array([-8.357713 , -4.2758417, -3.908142 , ...,  3.874356 ,  4.279595 ,
        8.046624 ], dtype=float32)}
9 

  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-218.33394, -215.0107 , -212.84485, ...,  201.65616,  207.27568,
        215.39304], dtype=float32), np.int64(3): array([-180.0537 , -158.98653, -158.16452, ...,  146.1452 ,  153.20169,
        168.32726], dtype=float32), np.int64(4): array([-102.99306,  -91.85751,  -87.70578, ...,  100.92873,  104.5578 ,
        106.58811], dtype=float32), np.int64(5): array([-129.50208 ,  -59.56608 ,  -58.537685, ...,   54.168423,
         58.979385,   70.06082 ], dtype=float32), np.int64(6): array([-58.520107, -29.56545 , -27.445852, ...,  28.582026,  30.015251,
        37.879807], dtype=float32), np.int64(7): array([-23.577524, -15.236538, -13.679176, ...,  13.424546,  14.596525,
        21.304434], dtype=float32), np.int64(8): array([-15.727743,  -7.759533,  -7.125235, ...,   7.242122,   8.122948,
        13.292139], dtype=float32), np.int64(9): array([-8.440437 , -4.201765 , -3.8236911, ...,  3.8760722,  4.2587643,
        7.7269664], dtype=float32)}
9 layers being

  0%|          | 0/1604 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-156.43665, -118.24111, -113.1887 , ...,  142.31451,  144.11356,
        155.46272], dtype=float32), np.int64(3): array([-99.79311 , -83.372055, -78.645485, ...,  84.35932 ,  86.46507 ,
       106.43826 ], dtype=float32), np.int64(4): array([-62.71122 , -55.69331 , -52.485653, ...,  51.026512,  54.72633 ,
        59.338837], dtype=float32), np.int64(5): array([-31.095133, -28.894058, -27.022274, ...,  27.416943,  28.295507,
        33.379116], dtype=float32), np.int64(6): array([-19.000494 , -15.330213 , -13.88764  , ...,  13.9801235,
        14.931963 ,  19.386864 ], dtype=float32), np.int64(7): array([-9.014478 , -7.0760965, -6.6735764, ...,  6.8456564,  7.6017165,
       13.582847 ], dtype=float32), np.int64(8): array([-7.6805754, -4.3493037, -3.9175847, ...,  3.992868 ,  4.449641 ,
        7.887759 ], dtype=float32), np.int64(9): array([-4.7546062, -2.537647 , -2.2908483, ...,  2.314405 ,  2.5833018,
        5.0077767], dtype=float32)}


In [41]:
TRANSFORM = "wavelet"
channel = "gray"

for orientation_label in ['horizontal', 'vertical', 'diagonal']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else ('V' if orientation_label == 'vertical' else 'D')

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True,  image_opener=jpg_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[f"{DATASET}-indoor", TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


9 layers being used


  0%|          | 0/1604 [00:00<?, ?it/s]

printing H
{np.int64(2): array([-206.10165, -192.92024, -186.0746 , ...,  200.80084,  215.35817,
        217.34395], dtype=float32), np.int64(3): array([-148.06543, -148.06104, -135.29985, ...,  140.23227,  143.75026,
        146.01054], dtype=float32), np.int64(4): array([-130.72935 ,  -91.3175  ,  -90.01253 , ...,   91.906975,
         93.89082 ,  104.94781 ], dtype=float32), np.int64(5): array([-78.2476  , -72.741066, -64.75424 , ...,  51.633064,  51.7586  ,
        52.63364 ], dtype=float32), np.int64(6): array([-43.232445, -31.849564, -29.387934, ...,  25.423326,  29.361069,
        46.52628 ], dtype=float32), np.int64(7): array([-23.685347 , -15.736735 , -14.66332  , ...,  13.967749 ,
        15.7094555,  23.056986 ], dtype=float32), np.int64(8): array([-11.189963 ,  -7.9330096,  -7.299925 , ...,   7.1044416,
         7.8651967,  15.002885 ], dtype=float32), np.int64(9): array([-8.355759 , -4.2710223, -3.923322 , ...,  3.9044545,  4.317322 ,
        8.037834 ], dtype=float32)}
9 

  0%|          | 0/1604 [00:00<?, ?it/s]

printing V
{np.int64(2): array([-243.15321, -216.73358, -196.74812, ...,  191.52583,  217.33353,
        222.44739], dtype=float32), np.int64(3): array([-151.15465, -138.45146, -133.64568, ...,  128.03108,  129.03917,
        137.53003], dtype=float32), np.int64(4): array([-102.97861 ,  -96.22225 ,  -92.56291 , ...,   96.078476,
        101.126015,  101.367424], dtype=float32), np.int64(5): array([-129.46701 ,  -56.241383,  -53.217384, ...,   61.65741 ,
         61.83803 ,   68.97361 ], dtype=float32), np.int64(6): array([-58.52675 , -29.792486, -26.348934, ...,  28.695293,  30.78247 ,
        35.11588 ], dtype=float32), np.int64(7): array([-23.589542, -16.829092, -14.110415, ...,  13.347393,  14.501694,
        18.151808], dtype=float32), np.int64(8): array([-15.755593 ,  -7.9046116,  -7.1506686, ...,   7.339482 ,
         8.073767 ,  13.31218  ], dtype=float32), np.int64(9): array([-8.604647 , -4.245255 , -3.8652053, ...,  3.891601 ,  4.3025064,
        7.495508 ], dtype=float32)}
9 

  0%|          | 0/1604 [00:00<?, ?it/s]

printing D
{np.int64(2): array([-147.31705, -145.00305, -113.18277, ...,  134.92162,  136.119  ,
        139.80762], dtype=float32), np.int64(3): array([-81.5915  , -78.98904 , -78.333015, ...,  74.32497 ,  79.13725 ,
        97.02891 ], dtype=float32), np.int64(4): array([-53.876114, -49.39669 , -49.212963, ...,  48.89883 ,  51.026356,
        58.1894  ], dtype=float32), np.int64(5): array([-31.268505, -24.796583, -24.455898, ...,  26.798277,  27.416666,
        29.794231], dtype=float32), np.int64(6): array([-19.375076, -16.42993 , -14.185225, ...,  13.978925,  14.941851,
        15.991492], dtype=float32), np.int64(7): array([-9.71762  , -7.09104  , -6.729268 , ...,  6.9316406,  7.7380195,
       10.644024 ], dtype=float32), np.int64(8): array([-7.6706586, -4.385363 , -3.9817915, ...,  4.0233674,  4.543739 ,
        7.714522 ], dtype=float32), np.int64(9): array([-4.6744328, -2.5577075, -2.309638 , ...,  2.3304358,  2.594105 ,
        5.0701323], dtype=float32)}
