<a href="https://colab.research.google.com/github/takayama-rado/trado_samples/blob/main/colab_files/gislr_create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Download original GISLR dataset

In [1]:
# Mount Google Drive at `/content/drive` so later cells can read files stored there.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Copy the dataset archive from the mounted Drive into the current working directory.
# The relative `drive/...` path assumes the working directory is `/content`,
# which is where Drive is mounted (`/content/drive`).
!cp drive/MyDrive/Datasets/gislr_dataset_orig.zip gislr_orig.zip

In [3]:
# Extract the archive; `-o` overwrites existing files without prompting, so the cell can be re-run.
!unzip -o gislr_orig.zip

Archive:  gislr_orig.zip
   creating: inputs/google_islr/
  inflating: inputs/google_islr/sign_to_prediction_index_map.json  
  inflating: inputs/google_islr/train.csv  
   creating: inputs/google_islr/train_landmark_files/
   creating: inputs/google_islr/train_landmark_files/16069/
  inflating: inputs/google_islr/train_landmark_files/16069/100015657.parquet  
   creating: inputs/google_islr/train_landmark_files/18796/
  inflating: inputs/google_islr/train_landmark_files/18796/1001373962.parquet  
   creating: inputs/google_islr/train_landmark_files/2044/
  inflating: inputs/google_islr/train_landmark_files/2044/1001950812.parquet  
   creating: inputs/google_islr/train_landmark_files/22343/
  inflating: inputs/google_islr/train_landmark_files/22343/1000638205.parquet  
   creating: inputs/google_islr/train_landmark_files/25571/
  inflating: inputs/google_islr/train_landmark_files/25571/1000210073.parquet  
   creating: inputs/google_islr/train_landmark_files/26734/
  inflating: inputs

In [4]:
# List the top level of the extracted dataset.
!ls inputs/google_islr

sign_to_prediction_index_map.json  train.csv  train_landmark_files


In [5]:
# Show the JSON file that maps each sign name to its integer class index.
!cat inputs/google_islr/sign_to_prediction_index_map.json

{"TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8, "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17, "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26, "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35, "car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43, "chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51, "cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60, "down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69, "empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78, "

In [6]:
# Show the header and first rows of the CSV that describes every landmark file.
!cat inputs/google_islr/train.csv | head

path,participant_id,sequence_id,sign
train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie
train_landmark_files/26734/1000241583.parquet,26734,1000241583,duck
train_landmark_files/26734/1000255522.parquet,26734,1000255522,minemy
train_landmark_files/32319/1000278229.parquet,32319,1000278229,lips
train_landmark_files/37055/100035691.parquet,37055,100035691,flower


In [7]:
# List the landmark directories; there is one directory per participant id.
!ls inputs/google_islr/train_landmark_files

16069  2044   25571  27610  29302  32319  36257  37779	49445  55372  62590
18796  22343  26734  28656  30680  34503  37055  4718	53618  61333


In [8]:
# List the landmark parquet files available for one participant.
!ls inputs/google_islr/train_landmark_files/16069

100015657.parquet


# 2. Convert to HDF5 dataset

## 2.1 Import modules

In [9]:
# Standard modules.
import os
import json

# Third-party modules.
import numpy as np
import h5py
import pandas as pd

## 2.2 Define loading function

In [10]:
ROWS_PER_FRAME = 543  # Number of landmarks per frame.

def load_relevant_data_subset(pq_path):
    """Load the x/y/z landmark coordinates of one sequence from a parquet file.

    Args:
        pq_path: Path to a parquet file that has `x`, `y` and `z` columns and
            stores `ROWS_PER_FRAME` consecutive rows for every frame.

    Returns:
        A `float32` array of shape `[n_frames, ROWS_PER_FRAME, 3]`, where the
        last axis holds `(x, y, z)`.

    Raises:
        ValueError: If the number of rows is not a multiple of `ROWS_PER_FRAME`.
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    # Exact integer arithmetic: a truncated float division would hide a
    # malformed file until `reshape` fails with an unrelated-looking message.
    n_frames, remainder = divmod(len(data), ROWS_PER_FRAME)
    if remainder != 0:
        raise ValueError(
            f"{pq_path}: {len(data)} rows is not a multiple of "
            f"ROWS_PER_FRAME={ROWS_PER_FRAME}")
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

## 2.3 Main process

In [11]:
# Read the CSV table that lists every landmark file together with its
# participant id, sequence id and sign.
root_dir = "inputs/google_islr"
train_csv_path = os.path.join(root_dir, "train.csv")
track_info = pd.read_csv(train_csv_path)
print(track_info)

                                                path  participant_id  \
0      train_landmark_files/26734/1000035562.parquet           26734   
1      train_landmark_files/28656/1000106739.parquet           28656   
2       train_landmark_files/16069/100015657.parquet           16069   
3      train_landmark_files/25571/1000210073.parquet           25571   
4      train_landmark_files/62590/1000240708.parquet           62590   
...                                              ...             ...   
94472   train_landmark_files/53618/999786174.parquet           53618   
94473   train_landmark_files/26734/999799849.parquet           26734   
94474   train_landmark_files/25571/999833418.parquet           25571   
94475   train_landmark_files/29302/999895257.parquet           29302   
94476   train_landmark_files/36257/999962374.parquet           36257   

       sequence_id    sign  
0       1000035562    blow  
1       1000106739    wait  
2        100015657   cloud  
3       1000210073 

In [12]:
# Collect the participant id of every row, then reduce them to the sorted
# set of distinct ids.
pids = np.array(track_info["participant_id"].values)
upids = np.unique(pids)
print(upids)

[ 2044  4718 16069 18796 22343 25571 26734 27610 28656 29302 30680 32319
 34503 36257 37055 37779 49445 53618 55372 61333 62590]


In [13]:
# Load sign dictionary (sign name -> integer class index).
dictfile = os.path.join(root_dir, "sign_to_prediction_index_map.json")
# Read as UTF-8 explicitly instead of relying on the platform's default
# encoding, which differs between environments.
with open(dictfile, "r", encoding="utf-8") as fread:
    dictionary = json.load(fread)
print(dictionary)

{'TV': 0, 'after': 1, 'airplane': 2, 'all': 3, 'alligator': 4, 'animal': 5, 'another': 6, 'any': 7, 'apple': 8, 'arm': 9, 'aunt': 10, 'awake': 11, 'backyard': 12, 'bad': 13, 'balloon': 14, 'bath': 15, 'because': 16, 'bed': 17, 'bedroom': 18, 'bee': 19, 'before': 20, 'beside': 21, 'better': 22, 'bird': 23, 'black': 24, 'blow': 25, 'blue': 26, 'boat': 27, 'book': 28, 'boy': 29, 'brother': 30, 'brown': 31, 'bug': 32, 'bye': 33, 'callonphone': 34, 'can': 35, 'car': 36, 'carrot': 37, 'cat': 38, 'cereal': 39, 'chair': 40, 'cheek': 41, 'child': 42, 'chin': 43, 'chocolate': 44, 'clean': 45, 'close': 46, 'closet': 47, 'cloud': 48, 'clown': 49, 'cow': 50, 'cowboy': 51, 'cry': 52, 'cut': 53, 'cute': 54, 'dad': 55, 'dance': 56, 'dirty': 57, 'dog': 58, 'doll': 59, 'donkey': 60, 'down': 61, 'drawer': 62, 'drink': 63, 'drop': 64, 'dry': 65, 'dryer': 66, 'duck': 67, 'ear': 68, 'elephant': 69, 'empty': 70, 'every': 71, 'eye': 72, 'face': 73, 'fall': 74, 'farm': 75, 'fast': 76, 'feet': 77, 'find': 78, '

In [14]:
# Conversion: write one HDF5 file per participant. Inside a file, every
# sequence becomes a group named after its sequence id, holding a `feature`
# dataset (landmark coordinates) and a `token` dataset (sign class index).
convert_to_channel_first = True
outdir = "dataset"
os.makedirs(outdir, exist_ok=True)

n_missing = 0  # Sequences listed in the CSV whose landmark file is absent.
for upid in upids:
    temp_info = track_info[track_info["participant_id"] == upid]
    outpath = os.path.join(outdir, str(upid) + ".hdf5")
    with h5py.File(outpath, "w") as fwrite:
        # Read fields by column name so the loop does not depend on the
        # column order of the CSV.
        for info in temp_info.itertuples(index=False):
            path = info.path
            pid = info.participant_id
            sid = info.sequence_id
            token = np.array([dictionary[info.sign]])
            assert pid == upid, f"{pid}:{upid}"
            track_path = os.path.join(root_dir, path)
            if not os.path.exists(track_path):
                # Best effort: keep converting, but count the gap so it is
                # reported instead of passing unnoticed.
                n_missing += 1
                continue
            track = load_relevant_data_subset(track_path)
            # `[T, J, C] -> [C, T, J]`
            if convert_to_channel_first:
                track = track.transpose([2, 0, 1])

            # Create group.
            grp = fwrite.create_group(str(sid))
            grp.create_dataset("feature", data=track)
            grp.create_dataset("token", data=token)

if n_missing > 0:
    print(f"Skipped {n_missing} of {len(track_info)} sequences: "
          "landmark file not found.")

# 3. Check output files

In [15]:
# List the generated HDF5 files; there is one file per participant id.
!ls dataset

16069.hdf5  22343.hdf5	27610.hdf5  30680.hdf5	36257.hdf5  4718.hdf5	55372.hdf5
18796.hdf5  25571.hdf5	28656.hdf5  32319.hdf5	37055.hdf5  49445.hdf5	61333.hdf5
2044.hdf5   26734.hdf5	29302.hdf5  34503.hdf5	37779.hdf5  53618.hdf5	62590.hdf5


In [16]:
# Sanity check: reopen one converted file and print, for every group it
# contains, the dataset names, the feature shape and the token value.
with h5py.File("dataset/16069.hdf5", "r") as fread:
    keys = fread.keys()
    print("Groups:", keys)
    for key, data in fread.items():
        print("Data in a group:", data.keys())
        # `[...]` reads the whole dataset into a NumPy array.
        feature, token = data["feature"][...], data["token"][...]
        print(feature.shape)
        print(token)

Groups: <KeysViewHDF5 ['100015657']>
Data in a group: <KeysViewHDF5 ['feature', 'token']>
(3, 105, 543)
[48]
