In [1]:
import git
from pathlib import Path
import os

ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

DATASET = "segmentAnything"
FINAL_DATA_NAME = 'segmentAnything-toy-wavelet-horizontal' # + channel
CONSTANT_SAMPLE_SIZE = int(1e5)
RAW_DATA_SUFFIX = "segmentAnything-resizedBlurred-normalized"
SAVE_DF = False

data_dir = os.path.join(ROOT_DIR, 'raw-data','segmentAnything')
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(data_dir)
data_dir
BATCH_NUM = None

In [2]:
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))
freq_df = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", f"master-frequency-map.csv"), index_col= ["dataset", "transform", "group"])


In [3]:
file_list = [os.path.join(data_dir, f"mini-toy-{RAW_DATA_SUFFIX}", filename) for filename in os.listdir(data_dir)]
file_names = os.listdir(os.path.join(data_dir, f"mini-toy-{RAW_DATA_SUFFIX}"))
file_names[:5]

['sa_1192.npz', 'sa_3565.npz', 'sa_3922.npz', 'sa_7865.npz', 'sa_3799.npz']

In [4]:
'''Assuming No batching is required. Not applicable for agriVision'''

# data_dir = os.path.join(ROOT_DIR, "raw-data", "agriVision", "full-agriVision-RGB-cleaned")

# for channel in ['red', 'blue', 'green', 'gray', 'infrared']:

#     channel_fr = convert_to_fourier_basis(data_dir, channel, debug = True)
#     pd.to_pickle(channel_fr, os.path.join(ROOT_DIR, "transformed-data", f"full-agriVision-fourier-{channel}-df.pickle"))

#     min_group, max_group = 2, max(channel_fr['band'])
#     group_data_map = dict()
#     group_data_map_size = dict()
#     for group in np.arange(min_group, max_group + 1):
#         data = channel_fr[(channel_fr['band'] == group)]['data'].iloc[0]
#         group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)] 
#         group_data_map_size[group] = data.size
    
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}.pickle"))
#     pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME}-{channel}-size.pickle"))
    

'Assuming No batching is required. Not applicable for agriVision'

In [5]:
'''To split large dataset into many batches, only needs to be run once'''
# k = 10000
# target_dir = os.path.join(ROOT_DIR, 'raw-data', 'agriVision') # Where the batch{i} folders will be created
# directorySplit(folder_dir = data_dir, target_dir = target_dir, name = RAW_DATA_SUFFIX, k = k)
# print(f"{len(file_names)//k} batches created" )

'To split large dataset into many batches, only needs to be run once'

In [6]:
'''Show all subsets of data in raw data folder that have already been created'''
print(''.join([x+"\n" for x in os.listdir(data_dir) if x.__contains__(RAW_DATA_SUFFIX)]))


toy-segmentAnything-resizedBlurred-normalized
._toy-segmentAnything-resizedBlurred-normalized
mini4k-toy-segmentAnything-resizedBlurred-normalized
mini-toy-segmentAnything-resizedBlurred-normalized



In [6]:
def npz_opener_pickle(path):
    image = Image.open(path).convert('RGB')
    return np.array(image)

# Wavelet

In [7]:
FINAL_DATA_NAME = 'segmentAnything-toy-wavelet'
if BATCH_NUM is None:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "segmentAnything", f"mini-toy-{RAW_DATA_SUFFIX}")
else:
    batch_dir = os.path.join(ROOT_DIR, "raw-data", "segmentAnything", f"batch{BATCH_NUM}-{RAW_DATA_SUFFIX}")

In [None]:
TRANSFORM = "wavelet"
channel = "red"

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[DATASET, TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


10 layers being used


100%|██████████| 2000/2000 [01:06<00:00, 30.05it/s]


printing H
{np.int64(2): array([-828.55334, -777.9856 , -768.7299 , ...,  775.6144 ,  790.81   ,
        936.8076 ], shape=(2000,), dtype=float32), np.int64(3): array([-491.78625, -487.4613 , -476.7782 , ...,  476.96347,  485.96426,
        486.88596], shape=(8000,), dtype=float32), np.int64(4): array([-317.74973, -277.5523 , -277.29834, ...,  283.5554 ,  312.4923 ,
        320.78683], shape=(32000,), dtype=float32), np.int64(5): array([-166.25873, -166.1253 , -164.81894, ...,  171.96962,  177.49973,
        177.79565], shape=(100000,), dtype=float32), np.int64(6): array([-98.31728 , -81.374115, -79.26432 , ...,  78.6924  ,  82.86543 ,
        90.43092 ], shape=(100000,), dtype=float32), np.int64(7): array([-45.28487 , -40.257282, -38.95217 , ...,  40.623047,  43.125202,
        50.056427], shape=(100000,), dtype=float32), np.int64(8): array([-24.556046, -20.54927 , -19.61491 , ...,  19.727964,  20.585846,
        24.0473  ], shape=(100000,), dtype=float32), np.int64(9): array([-12.795

100%|██████████| 2000/2000 [01:03<00:00, 31.29it/s]


printing V
{np.int64(2): array([-617.8318 , -617.3472 , -563.61774, ...,  669.82025,  680.73285,
        705.4555 ], shape=(2000,), dtype=float32), np.int64(3): array([-422.30756, -382.0876 , -377.53787, ...,  407.73395,  509.1729 ,
        521.9833 ], shape=(8000,), dtype=float32), np.int64(4): array([-308.50195, -268.4269 , -241.42255, ...,  245.71071,  249.75987,
        251.7161 ], shape=(32000,), dtype=float32), np.int64(5): array([-166.88718, -166.1066 , -156.64787, ...,  160.39569,  161.56781,
        161.84462], shape=(100000,), dtype=float32), np.int64(6): array([-81.427315, -78.0972  , -77.0581  , ...,  78.83871 ,  81.60604 ,
        88.538925], shape=(100000,), dtype=float32), np.int64(7): array([-46.267445, -40.75439 , -39.206543, ...,  38.987087,  41.48758 ,
        47.637398], shape=(100000,), dtype=float32), np.int64(8): array([-23.372301, -20.509369, -19.45863 , ...,  19.428703,  20.299885,
        24.492868], shape=(100000,), dtype=float32), np.int64(9): array([-12.748

In [None]:
TRANSFORM = "wavelet"
channel = "green"

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[DATASET, TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


10 layers being used


100%|██████████| 2000/2000 [01:10<00:00, 28.27it/s]


printing H
{np.int64(2): array([-887.8562 , -843.32776, -758.5118 , ...,  774.7783 ,  927.0954 ,
        974.49615], shape=(2000,), dtype=float32), np.int64(3): array([-491.3666 , -490.6369 , -466.63266, ...,  541.56323,  541.74774,
        584.66364], shape=(8000,), dtype=float32), np.int64(4): array([-367.69183, -327.91757, -324.7943 , ...,  334.65918,  344.1061 ,
        344.50693], shape=(32000,), dtype=float32), np.int64(5): array([-196.94052, -194.46771, -182.09634, ...,  188.79276,  192.39694,
        194.32458], shape=(100000,), dtype=float32), np.int64(6): array([-112.50255 ,  -95.018   ,  -93.69904 , ...,   92.26968 ,
         93.802605,  106.2917  ], shape=(100000,), dtype=float32), np.int64(7): array([-53.776512, -48.33653 , -46.452854, ...,  48.197792,  49.676983,
        59.04454 ], shape=(100000,), dtype=float32), np.int64(8): array([-28.938997, -24.925446, -23.557632, ...,  23.54591 ,  24.51901 ,
        28.868652], shape=(100000,), dtype=float32), np.int64(9): array([-

100%|██████████| 2000/2000 [01:03<00:00, 31.70it/s]


printing V
{np.int64(2): array([-650.8697 , -647.53217, -629.22845, ...,  714.1395 ,  733.45496,
        740.5868 ], shape=(2000,), dtype=float32), np.int64(3): array([-468.45355, -455.6865 , -427.29608, ...,  493.72253,  510.34195,
        588.08167], shape=(8000,), dtype=float32), np.int64(4): array([-348.16266, -290.35428, -283.76385, ...,  287.36072,  299.31564,
        302.46576], shape=(32000,), dtype=float32), np.int64(5): array([-182.70355, -180.40074, -176.26262, ...,  193.09697,  197.5746 ,
        205.2451 ], shape=(100000,), dtype=float32), np.int64(6): array([-99.6754  , -93.12749 , -91.18324 , ...,  94.663345,  98.400406,
       108.13532 ], shape=(100000,), dtype=float32), np.int64(7): array([-54.58352 , -48.287678, -46.128487, ...,  46.0855  ,  48.62572 ,
        56.463284], shape=(100000,), dtype=float32), np.int64(8): array([-28.759228, -24.343147, -23.291887, ...,  23.08476 ,  23.975792,
        28.946814], shape=(100000,), dtype=float32), np.int64(9): array([-15.468

In [None]:
TRANSFORM = "wavelet"
channel = "blue"

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[DATASET, TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


10 layers being used


100%|██████████| 2000/2000 [01:06<00:00, 29.99it/s]


printing H
{np.int64(2): array([-772.83  , -765.4451, -763.1141, ...,  717.9474,  782.2642,
       1035.2745], shape=(2000,), dtype=float32), np.int64(3): array([-563.2518 , -504.9972 , -469.08127, ...,  478.35974,  531.23755,
        554.7824 ], shape=(8000,), dtype=float32), np.int64(4): array([-379.6117 , -321.90048, -315.689  , ...,  293.42908,  301.57712,
        337.38736], shape=(32000,), dtype=float32), np.int64(5): array([-179.7652 , -175.84215, -165.52469, ...,  166.15033,  172.78447,
        189.34677], shape=(100000,), dtype=float32), np.int64(6): array([-97.23192, -90.2482 , -88.22941, ...,  83.32602,  87.67813,
        94.908  ], shape=(100000,), dtype=float32), np.int64(7): array([-46.656143, -43.53821 , -42.61172 , ...,  43.721115,  44.876663,
        50.541588], shape=(100000,), dtype=float32), np.int64(8): array([-25.227205, -22.471666, -21.357141, ...,  20.998169,  21.86654 ,
        25.299   ], shape=(100000,), dtype=float32), np.int64(9): array([-13.415307 , -10.91

100%|██████████| 2000/2000 [01:09<00:00, 28.68it/s]


printing V
{np.int64(2): array([-848.7576 , -667.5582 , -613.92584, ...,  697.5296 ,  709.3108 ,
        733.74457], shape=(2000,), dtype=float32), np.int64(3): array([-487.10007, -454.3274 , -406.25726, ...,  452.57452,  501.36975,
        501.43793], shape=(8000,), dtype=float32), np.int64(4): array([-329.99106, -327.27502, -279.76526, ...,  318.29196,  325.396  ,
        337.48993], shape=(32000,), dtype=float32), np.int64(5): array([-159.90677, -156.11534, -154.56407, ...,  167.5875 ,  171.84175,
        174.56865], shape=(100000,), dtype=float32), np.int64(6): array([-94.57467 , -88.40889 , -86.109764, ...,  86.0619  ,  89.745636,
        99.992584], shape=(100000,), dtype=float32), np.int64(7): array([-48.31938 , -44.02026 , -41.890358, ...,  42.237362,  43.85616 ,
        47.615112], shape=(100000,), dtype=float32), np.int64(8): array([-24.379347, -21.534918, -20.615267, ...,  20.225525,  21.162271,
        25.206692], shape=(100000,), dtype=float32), np.int64(9): array([-13.545

In [None]:
TRANSFORM = "wavelet"
channel = "gray"

for orientation_label in ['horizontal', 'vertical']:
    FINAL_DATA_NAME_ORIENTED = f"{FINAL_DATA_NAME}-{orientation_label}"
    orientation_code = 'H' if orientation_label == 'horizontal' else 'V'

    channel_wv = convert_to_wavelet_basis(batch_dir, channel, debug=True, image_opener=npz_opener)
    channel_wv['data'] = channel_wv['data'].apply(lambda x: x.astype(np.float32))

    if SAVE_DF:
        if BATCH_NUM is None:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))
        else:
            pd.to_pickle(channel_wv, os.path.join(ROOT_DIR, "transformed-data", f"dataframes/batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-df.pickle"))

    min_group, max_group = 2, max(channel_wv['layer'])
    group_data_map = dict()
    group_data_map_size = dict()

    for group in np.arange(min_group, max_group + 1):
        data = channel_wv[(channel_wv['orientation'] == orientation_code) & (channel_wv['layer'] == group)]['data'].iloc[0]
        group_data_map[group] = np.sort(data)[np.round(np.linspace(0, data.size - 1, min(data.size, CONSTANT_SAMPLE_SIZE))).astype(int)]
        group_data_map_size[group] = data.size

        freq_df.loc[DATASET, TRANSFORM, group] = channel_wv[(channel_wv['layer'] == group) & (channel_wv['orientation'] == orientation_code)]['frequency'].iloc[0]

    print(f"printing {orientation_code}")
    freq_df.to_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv"))

    if BATCH_NUM is None:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))
    else:
        pd.to_pickle(group_data_map, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}.pickle"))
        pd.to_pickle(group_data_map_size, os.path.join(ROOT_DIR, "transformed-data", f"batch{BATCH_NUM}{FINAL_DATA_NAME_ORIENTED}-{channel}-size.pickle"))

    print(group_data_map)

    del channel_wv
    del group_data_map
    del group_data_map_size


10 layers being used


100%|██████████| 2000/2000 [01:01<00:00, 32.52it/s]


printing H
{np.int64(2): array([-829.9487 , -812.94025, -707.7685 , ...,  729.36505,  903.2977 ,
        926.42365], shape=(2000,), dtype=float32), np.int64(3): array([-472.2292 , -454.1876 , -436.49988, ...,  487.21234,  511.92654,
        549.0133 ], shape=(8000,), dtype=float32), np.int64(4): array([-347.50717, -305.07553, -298.34656, ...,  304.11325,  319.56082,
        328.84515], shape=(32000,), dtype=float32), np.int64(5): array([-175.93819, -168.60172, -168.18925, ...,  181.03627,  182.42055,
        182.53221], shape=(100000,), dtype=float32), np.int64(6): array([-104.40869,  -88.57438,  -86.35828, ...,   85.33793,   87.22171,
         99.93777], shape=(100000,), dtype=float32), np.int64(7): array([-49.880054, -44.1313  , -42.472702, ...,  44.51042 ,  46.495   ,
        55.38275 ], shape=(100000,), dtype=float32), np.int64(8): array([-27.105087, -23.092379, -21.80031 , ...,  21.828737,  22.740433,
        26.60471 ], shape=(100000,), dtype=float32), np.int64(9): array([-14.153

 67%|██████▋   | 1338/2000 [00:58<00:30, 21.64it/s]