## **Metadata Creation**
This notebook will be used to create the metadata files for training and validation on the reduced dataset

In [1]:
import sys
sys.path.append("/home/jupyter-group3/reconstruction/reconstruction-deep-network")

In [2]:
import numpy as np
import os
import torch 

import reconstruction_deep_network
from reconstruction_deep_network.data_loader.custom_loader import CustomDataLoader


In [3]:
module_path = reconstruction_deep_network.__path__[0]
root_dir = os.path.dirname(module_path)
data_path = os.path.join(root_dir, "data", "v1")
scans_dir = os.path.join(data_path, "scans")
all_scans = [scan for scan in os.listdir(scans_dir) if not scan.endswith(".ipynb_checkpoints")]

In [4]:
all_scans

['8WUmhLawc2A',
 '29hnd4uzFmX',
 'EDJbREhghzL',
 'EU6Fwq7SyZv',
 'D7N2EKCX4Sj',
 'dhjEzFoUFzH',
 'e9zR4mvMWw7',
 '1pXnuDYAj8r',
 'E9uDoFAP3SH',
 'ARNzJeq3xxb',
 '8194nk5LbLH',
 'b8cTxDM8gDG',
 'B6ByNegPMKs',
 '2azQ1b91cZZ',
 '2t7WUuJeko7',
 'fzynW3qQPVF',
 '1LXtFkjw3qL',
 '5ZKStnWn8Zo',
 '5LpN3gDmAk7',
 '17DRP5sb8fy',
 '759xd9YjKW5',
 'ac26ZMwG7aT',
 '7y3sRwLe3Va',
 'D7G3Y4RVNrH',
 'aayBHfsNo7d',
 'cV4RVeZvu5T',
 'GdvgFV5R1Z5',
 '5q7pvUzZiYa',
 '2n8kARJN3HM',
 '82sE5b5pLXE']

In [5]:
dataset = CustomDataLoader(mode = "train", debug = False)

In [6]:
metadata_file = dataset.metadata
print(f"Length of metadata: {len(metadata_file)}")

Length of metadata: 9820


In [7]:
def find_scan_id(file_name: str):
    scan_id = file_name.split("/")[0]
    return scan_id
metadata_tuples = [(find_scan_id(file_list[0]), file_list) for file_list in metadata_file]
print(f"Length of metadata: {len(metadata_tuples)}")

Length of metadata: 9820


In [8]:
metadata_tuples[-1]

('sT4fr6TAbpF',
 ['sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox0_sami.jpg',
  'sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox1_sami.jpg',
  'sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox2_sami.jpg',
  'sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox3_sami.jpg',
  'sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox4_sami.jpg',
  'sT4fr6TAbpF/matterport_skybox_images/7d41ce8de085471f9d97bbff6b0c1831_skybox5_sami.jpg'])

In [9]:
reduced_metadata = [metadata_tuple[1] for metadata_tuple in metadata_tuples if metadata_tuple[0] in all_scans]

In [10]:
print(f"Length of reduced metadata: {len(reduced_metadata)}")

Length of reduced metadata: 3781


In [11]:
# save reduced dataset
reduced_file_name = os.path.join(module_path, "data_loader", "reduced.npy")
with open(reduced_file_name, 'wb') as f:
    np.save(f, reduced_metadata)

## **Split into Train and Validation**

In [12]:
# seed random values
np.random.seed(42)

In [13]:
index_list = np.arange(len(reduced_metadata))
index_list.max()

3780

In [14]:
print(f"Number of scans: {len(all_scans)}")

Number of scans: 30


In [15]:
n_train_scans = 27
n_val_scans = 3

choice_array = np.array(all_scans)
train_scans = np.random.choice(choice_array, n_train_scans, replace=False).tolist()
choice_list = choice_array.tolist()
val_scans = list(set(choice_list) - set(train_scans))

In [16]:
train_scans

['5q7pvUzZiYa',
 'fzynW3qQPVF',
 'D7G3Y4RVNrH',
 '5ZKStnWn8Zo',
 'E9uDoFAP3SH',
 'ARNzJeq3xxb',
 '2n8kARJN3HM',
 'aayBHfsNo7d',
 'B6ByNegPMKs',
 '8WUmhLawc2A',
 'D7N2EKCX4Sj',
 '1LXtFkjw3qL',
 'dhjEzFoUFzH',
 '2azQ1b91cZZ',
 'b8cTxDM8gDG',
 '7y3sRwLe3Va',
 '29hnd4uzFmX',
 'EDJbREhghzL',
 'cV4RVeZvu5T',
 'EU6Fwq7SyZv',
 'ac26ZMwG7aT',
 'GdvgFV5R1Z5',
 '5LpN3gDmAk7',
 '82sE5b5pLXE',
 '759xd9YjKW5',
 '1pXnuDYAj8r',
 '8194nk5LbLH']

In [17]:
val_scans

['e9zR4mvMWw7', '2t7WUuJeko7', '17DRP5sb8fy']

In [18]:
## verify intersection
train_set = set(train_scans)
val_set = set(val_scans)

intersection = train_set.intersection(val_set)
print(intersection)

set()


In [19]:
train_dataset = [metadata_tuple[1] for metadata_tuple in metadata_tuples if metadata_tuple[0] in train_scans]
val_dataset = [metadata_tuple[1] for metadata_tuple in metadata_tuples if metadata_tuple[0] in val_scans]

In [20]:
print(f"Length of train dataset: {len(train_dataset)}")
print(f"Length of val dataset: {len(val_dataset)}")

Length of train dataset: 3605
Length of val dataset: 176


In [21]:
val_dataset[-1]

['e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox0_sami.jpg',
 'e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox1_sami.jpg',
 'e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox2_sami.jpg',
 'e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox3_sami.jpg',
 'e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox4_sami.jpg',
 'e9zR4mvMWw7/matterport_skybox_images/fa7122ef2199445485623708ea54d018_skybox5_sami.jpg']

In [22]:
## save dataset
train_set_file_name = os.path.join(module_path, "data_loader", "ir-20231129-train-split.npy")
with open(train_set_file_name, 'wb') as f:
    np.save(f, train_dataset)

val_set_file_name = os.path.join(module_path, "data_loader", "ir-20231129-val-split.npy")
with open(val_set_file_name, 'wb') as f:
    np.save(f, val_dataset)

In [23]:
## load to verify
tr_set = np.load(train_set_file_name).astype(str)
vl_set = np.load(val_set_file_name).astype(str)

print(f"Train: {len(tr_set)}")
print(f"Val: {len(vl_set)}")

Train: 3605
Val: 176
