# Dataset construction

- Split into train and test set
- For each example, store the paired embeddings as a two-element list, in this order:
  ```
  [
      point_cloud_embeds,
      image_embeds
  ]
  ```

In [1]:
import pickle
import pandas as pd
import numpy as np
import torch

## Load the embedding files

In [5]:
# Load the point-cloud embeddings, keyed by example name.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("/kaggle/input/modelnet-minimal/pointnet_global_features_full.pkl", "rb") as pcl_file:
    pcl_data = pickle.load(pcl_file)

print(f"Number of samples: {len(pcl_data)}")

Number of samples: 2468


In [6]:
# Load the image embeddings, keyed by example name.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("/kaggle/input/modelnet-minimal/clip_image_global_features_full.pkl", "rb") as img_file:
    img_data = pickle.load(img_file)

print(f"Number of samples: {len(img_data)}")

Number of samples: 2468


## Merge the embeddings into the required structure

In [9]:
# Pair each example's point-cloud embedding with its image embedding.
# Example order follows the key order of `pcl_data`.
all_example_names = list(pcl_data.keys())

# Fail early, naming the offending ids, if an example has no image embedding,
# instead of hitting a bare KeyError part-way through the merge.
missing_in_img = [name for name in all_example_names if name not in img_data]
if missing_in_img:
    raise KeyError(
        f"{len(missing_in_img)} example(s) have no image embedding, "
        f"e.g. {missing_in_img[:5]}"
    )

# Each data point is a two-element list: [point_cloud_embeds, image_embeds].
consolidated_data = [[pcl_data[name], img_data[name]] for name in all_example_names]

In [10]:
# Sanity check: the merged list should hold one entry per example.
n_consolidated = len(consolidated_data)
print(f"Length of consolidated dataset: {n_consolidated}")

Length of consolidated dataset: 2468


In [11]:
# Preview the first ten [point_cloud_embeds, image_embeds] pairs.
consolidated_data[0:10]

[[tensor([[1.4797, 0.7004, 0.3281,  ..., 4.4995, 2.2713, 4.2037]]),
  tensor([[0.1345, 0.3158, 0.4696,  ..., 0.2732, 0.6880, 0.1298]])],
 [tensor([[2.6007, 1.2874, 0.3578,  ..., 3.4704, 1.9024, 4.6927]]),
  tensor([[ 0.2190,  0.2551,  0.3379,  ..., -0.1564,  0.6762,  0.2228]])],
 [tensor([[2.4903, 1.2754, 0.3442,  ..., 3.6346, 2.1892, 4.5531]]),
  tensor([[0.2147, 0.3281, 0.2672,  ..., 0.0413, 0.7094, 0.3101]])],
 [tensor([[2.8172, 1.3345, 0.3876,  ..., 3.8463, 2.4627, 2.9878]]),
  tensor([[0.0358, 0.1614, 0.2263,  ..., 0.2339, 0.6726, 0.1809]])],
 [tensor([[2.1773, 1.2184, 0.3115,  ..., 3.6429, 2.4430, 4.3618]]),
  tensor([[ 0.1880,  0.0195,  0.3718,  ...,  0.3678,  0.7224, -0.1704]])],
 [tensor([[3.0451, 1.2214, 0.3677,  ..., 3.7142, 2.6362, 4.5630]]),
  tensor([[ 0.1096,  0.0705,  0.6350,  ..., -0.5381,  0.9787, -0.3175]])],
 [tensor([[2.9300, 1.1066, 0.3730,  ..., 3.9388, 1.7414, 5.4242]]),
  tensor([[-0.8032, -0.2264,  0.4207,  ..., -0.0085, -0.1298, -0.1435]])],
 [tensor([[2.3923

## Split into train and test sections and store as pickle files

In [12]:
# Number of leading examples assigned to the training split; everything after
# that index forms the test split. Kept in one place so the two slices cannot drift.
TRAIN_SIZE = 2048

# NOTE(review): the split is positional and un-shuffled, so it inherits the key
# order of the source pickle. If that order groups examples (e.g. by class), the
# two splits will not be identically distributed -- TODO confirm the ordering.
train_data = consolidated_data[:TRAIN_SIZE]
test_data = consolidated_data[TRAIN_SIZE:]
print(f"Train/Test split: {len(train_data)}/{len(test_data)}")

Train/Test split: 2048/420


In [13]:
# Persist the training split to the working directory.
with open("consolidated_feat_train.pkl", "wb") as train_file:
    pickle.dump(train_data, train_file)

In [14]:
# Persist the test split to the working directory.
with open("consolidated_feat_test.pkl", "wb") as test_file:
    pickle.dump(test_data, test_file)

## Inspect the value ranges of the embeddings

In [24]:
def _value_range(embeds):
    """Return ``(lowest, highest)`` element value across an iterable of tensors.

    The running bounds start at +inf / -inf so the result is correct whatever
    the data's actual range; for an empty iterable they are returned as-is.
    """
    lowest, highest = float("inf"), float("-inf")
    for emb in embeds:
        lowest = min(lowest, torch.min(emb))
        highest = max(highest, torch.max(emb))
    return lowest, highest


# Each train_data entry is [point_cloud_embeds, image_embeds].
min_value_pcl, max_value_pcl = _value_range(dp[0] for dp in train_data)
min_value_img, max_value_img = _value_range(dp[1] for dp in train_data)

In [25]:
# Report the observed value range for each modality.
img_range_msg = f"Range of img embeds: {min_value_img} to {max_value_img}"
pcl_range_msg = f"Range of pcl embeds: {min_value_pcl} to {max_value_pcl}"
print(img_range_msg)
print(pcl_range_msg)

Range of img embeds: -7.734566688537598 to 8.66004753112793
Range of pcl embeds: -0.23877529799938202 to 7.327340126037598
