In [None]:
import numpy as np
from dvclive import Live
from dvclive.keras import DVCLiveCallback
import tensorflow as tf
import pandas as pd
import os

# Sanity check: list the GPUs TensorFlow can see (an empty list means CPU only)
visible_gpus = tf.config.list_physical_devices('GPU')
print(visible_gpus)

2025-03-14 20:04:59.796897: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 20:04:59.826535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741982699.847050   54087 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741982699.853451   54087 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1741982699.873203   54087 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# ---- Hyper-parameters -------------------------------------------------------
IMG_SIZE = 299          # side length frames are resized to before the CNN
BATCH_SIZE = 64         # batch size used when training the sequence model
EPOCHS = 500            # upper bound on training epochs

MAX_SEQ_LENGTH = 100    # every video is padded / truncated to this many frames
NUM_FEATURES = 2048     # length of the per-frame feature vector fed to the sequence model

# ---- Project paths ----------------------------------------------------------
from pathlib import Path

# The notebook is assumed to live in a sub-directory of the project root
ROOT_DIR = Path().resolve().parent

ORI_DATA_PATH = str(ROOT_DIR / 'data' / 'raw')
PROCESSED_DATA_PATH = str(ROOT_DIR / "data" / "interim" / "processed_ds_random")

# Where the trained sequence model is written
model_save_path = str(ROOT_DIR / 'models' / 'seq_model.keras')

# ---- Video decoding ---------------------------------------------------------
import decord

# Ask decord to hand back TensorFlow tensors
decord.bridge.set_bridge('tensorflow')

## Load video

In [16]:
from datasets import load_dataset, DatasetDict, Video, Features

# 8:1:1 split: hold out 20 % of the videos first, then cut that hold-out in
# half to obtain the validation and test sets. Both splits use seed 42.
ds = load_dataset("videofolder", data_dir=ORI_DATA_PATH)
ds_train_devtest = ds['train'].train_test_split(test_size=0.2, seed=42)
ds_devtest = ds_train_devtest['test'].train_test_split(test_size=0.5, seed=42)

ds = DatasetDict(
    train=ds_train_devtest['train'],
    valid=ds_devtest['train'],
    test=ds_devtest['test'],
)

ds

DatasetDict({
    train: Dataset({
        features: ['video', 'label'],
        num_rows: 593
    })
    valid: Dataset({
        features: ['video', 'label'],
        num_rows: 74
    })
    test: Dataset({
        features: ['video', 'label'],
        num_rows: 75
    })
})

In [17]:
# Build an {integer id -> class name} lookup from the dataset's label feature
label_feature = ds['train'].features['label']
label_names = label_feature.names
label_dict = dict(enumerate(label_names))

print(label_dict)

{0: 'apa kabar', 1: 'ayo jalan-jalan', 2: 'jaga kesehatan', 3: 'kamu mau kemana', 4: 'kamu tinggal dimana', 5: 'mau pesan apa', 6: 'nama kamu siapa', 7: 'salam kenal', 8: 'sama-sama', 9: 'sampai jumpa lagi', 10: 'saya minta maaf', 11: 'sekarang jam berapa', 12: 'selamat malam', 13: 'selamat pagi', 14: 'selamat siang', 15: 'terima kasih'}


## Extract features with a CNN

In [9]:
class CenterSquareCrop(tf.keras.layers.Layer):
    """Keras layer that crops the largest centred square out of its input.

    The input is treated as a batch whose axis 1 is height and axis 2 is
    width; the square's side is the smaller of the two, read at run time.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return `inputs` cropped to a centred square of side min(height, width)."""
        # Shape is read dynamically so the layer works with any frame size
        dims = tf.shape(inputs)
        img_h, img_w = dims[1], dims[2]

        # Side of the square and the top-left corner that centres it
        side = tf.minimum(img_h, img_w)
        top = (img_h - side) // 2
        left = (img_w - side) // 2

        return tf.image.crop_to_bounding_box(
            inputs,
            offset_height=top,
            offset_width=left,
            target_height=side,
            target_width=side,
        )

In [10]:
def build_feature_extractor(training=False):
    """Build the per-frame feature extractor around an ImageNet InceptionV3.

    Pipeline: (optional brightness/contrast augmentation) -> centre square
    crop -> resize to IMG_SIZE x IMG_SIZE -> InceptionV3 preprocessing ->
    InceptionV3 without its classification head, global-average pooled.

    Args:
        training: when True, a RandomBrightness/RandomContrast stage is
            inserted directly after the input.

    Returns:
        A `tf.keras.Model` named "feature_extractor" accepting images of
        arbitrary height and width with 3 channels.
    """
    backbone = tf.keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling='avg',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    square_crop = CenterSquareCrop()
    to_model_size = tf.keras.layers.Resizing(IMG_SIZE, IMG_SIZE)
    scale_pixels = tf.keras.applications.inception_v3.preprocess_input

    # Height and width are left unspecified; cropping and resizing normalise them
    image_in = tf.keras.Input(shape=(None, None, 3))

    x = image_in
    if training:
        augment = tf.keras.Sequential(
            [
                tf.keras.layers.RandomBrightness(0.2),
                tf.keras.layers.RandomContrast(0.2),
            ],
            name="augmentation",
        )
        x = augment(x)

    x = square_crop(x)
    x = to_model_size(x)
    x = scale_pixels(x)

    frame_embedding = backbone(x)
    return tf.keras.Model(image_in, frame_embedding, name="feature_extractor")

# One extractor with the augmentation stage (train split) and one without it
feature_extractor = build_feature_extractor(training=True)
feature_extractor_predict = build_feature_extractor(training=False)

In [11]:
# Show the layer summary of the extractor built with training=True
feature_extractor.summary()

In [12]:
# Show the layer summary of the extractor built without the augmentation stage
feature_extractor_predict.summary()

In [18]:
# Reset the global state Keras has accumulated so far.
# NOTE(review): the extractor models built above are still referenced after
# this call — confirm this reset is actually needed at this point.
tf.keras.backend.clear_session()

### Feature extraction frame by frame

In [None]:
def extract_features(example, split):
    """Turn one video example into fixed-length per-frame CNN features.

    The video is truncated or zero-padded to exactly MAX_SEQ_LENGTH frames,
    every frame is passed through the feature extractor, and a boolean mask
    marks which positions hold real (non-padding) frames.

    Args:
        example: a dataset row whose "video" entry can be sliced with `[:]`
            to obtain all frames stacked along axis 0 (presumably a decord
            reader returning TensorFlow tensors — confirm against the loader).
        split: name of the split being processed. 'test' and 'valid' use
            `feature_extractor_predict`; anything else uses
            `feature_extractor`.

    Returns:
        dict with
            "features":    output of the extractor for the MAX_SEQ_LENGTH frames,
            "mask":        bool tensor of length MAX_SEQ_LENGTH, True for real frames,
            "frame_count": number of frames in the source video before any
                           truncation or padding.
    """
    frames = example["video"][:]
    video_length = frames.shape[0]

    # Bring the clip to exactly MAX_SEQ_LENGTH frames
    if video_length >= MAX_SEQ_LENGTH:
        # Too long: keep only the leading frames
        frames = frames[:MAX_SEQ_LENGTH]
    else:
        # Too short: append all-zero frames
        padding = tf.zeros(
            (MAX_SEQ_LENGTH - video_length, *frames.shape[1:]), dtype=frames.dtype
        )
        frames = tf.concat([frames, padding], axis=0)

    # True for the first `video_length` positions (all True when the clip was
    # truncated), False for the zero padding
    mask = tf.sequence_mask(video_length, maxlen=MAX_SEQ_LENGTH)

    # NOTE(review): Model.predict runs layers in inference mode, where Keras'
    # random augmentation layers are expected to pass inputs through
    # unchanged — confirm whether augmentation was meant to be active here.
    if split in ('test', 'valid'):
        extractor = feature_extractor_predict
    else:
        extractor = feature_extractor
    batch_features = extractor.predict(frames, verbose=0)

    return {
        "features": batch_features,
        "mask": mask,
        # "video_path": video_path,
        # Original length of the clip; len(frames) would always be
        # MAX_SEQ_LENGTH here because frames has already been padded/truncated
        "frame_count": video_length,
        # "file_name": os.path.basename(video_path)
    }

# Run feature extraction over every split, one example at a time, and persist
# each processed split under PROCESSED_DATA_PATH/<split>.
processed_ds = {}
for split in ds.keys():
    processed_ds[split] = ds[split].map(
        extract_features,
        # The split name must go through fn_kwargs: the second positional
        # parameter of Dataset.map is `with_indices`, so passing `split`
        # positionally made the function receive the row index instead.
        fn_kwargs={"split": split},
        remove_columns=["video"],  # Remove original video data to save space
        batched=False,  # Process one example at a time
        # num_proc=4,     # Parallel processing
    )

    # Save the processed dataset
    processed_ds[split].save_to_disk(os.path.join(PROCESSED_DATA_PATH, split))

Map: 100%|██████████| 593/593 [11:52<00:00,  1.20s/ examples]
Map: 100%|██████████| 74/74 [01:32<00:00,  1.25s/ examples]
Map: 100%|██████████| 75/75 [01:31<00:00,  1.21s/ examples]
Saving the dataset (1/1 shards): 100%|██████████| 75/75 [00:00<00:00, 1111.09 examples/s]


In [None]:
# def prepare_all_videos(ds, split):
#     num_samples = len(ds[split])
#     video_paths = [video['path'] for video in ds[split]['video']]
#     labels = ds[split]['label']

#     # Initialize arrays for features and masks
#     frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
#     frame_features = np.zeros(
#         shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
#     )

#     for idx, path in enumerate(video_paths):
#         # Load and preprocess video frames
#         frames = load_video(path)
#         video_length = frames.shape[0]

#         # Trim or pad frames to MAX_SEQ_LENGTH
#         if video_length > MAX_SEQ_LENGTH:
#             frames = frames[:MAX_SEQ_LENGTH]
#             mask = np.ones(MAX_SEQ_LENGTH, dtype="bool")
#         else:
#             padding = np.zeros((MAX_SEQ_LENGTH - video_length, *frames.shape[1:]), dtype=frames.dtype)
#             frames = np.concatenate([frames, padding], axis=0)
#             mask = np.zeros(MAX_SEQ_LENGTH, dtype="bool")
#             mask[:video_length] = 1  # Mark valid frames

#         # Batch process frames using feature_extractor
#         if split in ['test', 'valid']:
#             batch_features = feature_extractor_predict.predict(frames, verbose=0)
#         else:
#             batch_features = feature_extractor.predict(frames, verbose=0)

#         # Store features and masks
#         frame_features[idx] = batch_features
#         frame_masks[idx] = mask

#     return (frame_features, frame_masks), labels


# # Process datasets
# train_data, train_labels = prepare_all_videos(ds, "train")
# test_data, test_labels = prepare_all_videos(ds, "test")
# valid_data, valid_labels = prepare_all_videos(ds, "valid")

# print(f"Frame features in train set: {train_data[0].shape}")
# print(f"Frame masks in train set: {train_data[1].shape}")

I0000 00:00:1741792648.932406   15421 service.cc:152] XLA service 0x7f8e54003d20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741792648.933017   15421 service.cc:160]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-03-12 15:17:29.050632: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator feature_extractor_1/center_square_crop_1/crop_to_bounding_box/Assert/Assert
2025-03-12 15:17:29.050966: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator feature_extractor_1/center_square_crop_1/crop_to_bounding_box/Assert_1/Assert
2025-03-12 15:17:29.051154: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator feature_extractor_1/center_square_crop_1/crop_to_bounding_box/Assert_2/Assert
2025-03-12 15:17:29.051351: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator feature_extractor_1/center_square_crop_1/crop_to_boundi

KeyboardInterrupt: 

[220, 100, 2048]

In [None]:
# import pickle
# import os

# data_types = ['train', 'test', 'valid']
# save_dir = os.path.join(ROOT_DIR, 'data', 'intermediate')

# for data_type in data_types:
#     data_file = os.path.join(save_dir, f'{data_type}_data.pkl')
#     labels_file = os.path.join(save_dir, f'{data_type}_labels.pkl')

#     with open(data_file, 'wb') as f:
#         pickle.dump(eval(f'{data_type}_data'), f)

#     with open(labels_file, 'wb') as f:
#         pickle.dump(eval(f'{data_type}_labels'), f)

In [None]:
# # prompt: load train,test,valid _data and _labels.pkl from /content/drive/MyDrive/bisindo_data

# import pickle
# import os

# # Load the data and labels from the pickle files
# save_dir = '/content/drive/MyDrive/bisindo_data'

# train_data = pickle.load(open(os.path.join(save_dir, 'train_data.pkl'), 'rb'))
# train_labels = pickle.load(open(os.path.join(save_dir, 'train_labels.pkl'), 'rb'))

# test_data = pickle.load(open(os.path.join(save_dir, 'test_data.pkl'), 'rb'))
# test_labels = pickle.load(open(os.path.join(save_dir, 'test_labels.pkl'), 'rb'))

# valid_data = pickle.load(open(os.path.join(save_dir, 'valid_data.pkl'), 'rb'))
# valid_labels = pickle.load(open(os.path.join(save_dir, 'valid_labels.pkl'), 'rb'))

# print("Data loaded successfully!")
# print(f"Train data shape: {train_data[0].shape}")
# print(f"Train labels shape: {len(train_labels)}")

Data loaded successfully!
Train data shape: (424, 100, 2048)
Train labels shape: 424


# Training the sequence model

In [3]:
# load if needed
from datasets import load_from_disk

def _to_batched_tf_dataset(dataset, shuffle):
    """Convert a processed split into a batched tf.data.Dataset.

    "features" and "mask" are exposed as model inputs and "label" as the
    target; batches contain BATCH_SIZE examples.
    """
    return dataset.to_tf_dataset(
        columns=["features", "mask"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
    )


train_split = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "train"))
valid_split = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "valid"))
test_split = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "test"))

# Class-name lookup, read from the label feature stored with the processed data
label_feature = test_split.features['label']
label_names = label_feature.names
label_dict = dict(enumerate(label_names))

# Only the training data is shuffled; validation and test keep their stored
# order so evaluation is deterministic and predictions line up with examples.
train_ds = _to_batched_tf_dataset(train_split, shuffle=True)
valid_ds = _to_batched_tf_dataset(valid_split, shuffle=False)
test_ds = _to_batched_tf_dataset(test_split, shuffle=False)

  from .autonotebook import tqdm as notebook_tqdm


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
I0000 00:00:1741982711.459225   54087 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13949 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5


In [4]:
def get_sequence_model():
    """Build and compile the recurrent classifier over per-frame features.

    Inputs are a (MAX_SEQ_LENGTH, NUM_FEATURES) feature sequence named
    'features' and a boolean (MAX_SEQ_LENGTH,) tensor named 'mask', which is
    passed as the mask of the first (bidirectional) LSTM. The output is a
    softmax over `len(label_names)` classes.

    Returns:
        A compiled `tf.keras.Model` (sparse categorical cross-entropy,
        Adam with learning rate 1e-4, accuracy metric).
    """
    layers = tf.keras.layers

    features_in = tf.keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES), name='features')
    mask_in = tf.keras.Input((MAX_SEQ_LENGTH,), dtype="bool", name='mask')

    # Recurrent stack: bidirectional LSTM over all frames, then a single
    # LSTM that collapses the sequence to one vector
    bi_lstm = layers.Bidirectional(layers.LSTM(256, return_sequences=True))
    hidden = bi_lstm(features_in, mask=mask_in)
    hidden = layers.Dropout(0.4)(hidden)
    hidden = layers.LSTM(128)(hidden)
    hidden = layers.Dropout(0.3)(hidden)

    # Fully connected head
    for units in (1024, 512):
        hidden = layers.Dense(units, activation="elu")(hidden)
    class_probs = layers.Dense(len(label_names), activation="softmax")(hidden)

    model = tf.keras.Model(inputs=[features_in, mask_in], outputs=class_probs)
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=["accuracy"],
    )
    return model

In [6]:
# Reset Keras' global state before the sequence model is built
tf.keras.backend.clear_session()

In [7]:

# Stop when val_loss has not improved for 50 epochs and roll the model back
# to the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                    patience=50,
                                                    restore_best_weights=True,
                                                    verbose=1)

seq_model = get_sequence_model()

# Train, save and evaluate inside one DVCLive run so that the per-epoch
# metrics and the final test metrics are logged together.
with Live(dir=os.path.join(ROOT_DIR, 'dvclive')) as live:
    history = seq_model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=EPOCHS,
        callbacks=[
            early_stopping,
            DVCLiveCallback(live=live)
        ],
        verbose=2
    )

    # Make sure the target directory exists so a long training run is not
    # lost to a missing folder at save time.
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    seq_model.save(model_save_path)

    loss, accuracy = seq_model.evaluate(test_ds, verbose=0)
    live.log_metric("test_loss", loss)
    live.log_metric("test_accuracy", accuracy)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

Epoch 1/500
10/10 - 6s - 637ms/step - accuracy: 0.0641 - loss: 2.7650 - val_accuracy: 0.0541 - val_loss: 2.7181
Epoch 2/500
10/10 - 3s - 348ms/step - accuracy: 0.1332 - loss: 2.6926 - val_accuracy: 0.1622 - val_loss: 2.6839
Epoch 3/500
10/10 - 5s - 504ms/step - accuracy: 0.2327 - loss: 2.5827 - val_accuracy: 0.1757 - val_loss: 2.5971
Epoch 4/500
10/10 - 5s - 472ms/step - accuracy: 0.2749 - loss: 2.4447 - val_accuracy: 0.2838 - val_loss: 2.4574
Epoch 5/500
10/10 - 3s - 343ms/step - accuracy: 0.3423 - loss: 2.2969 - val_accuracy: 0.3108 - val_loss: 2.2874
Epoch 6/500
10/10 - 5s - 462ms/step - accuracy: 0.3912 - loss: 2.1490 - val_accuracy: 0.3784 - val_loss: 2.1640
Epoch 7/500
10/10 - 3s - 315ms/step - accuracy: 0.4165 - loss: 2.0217 - val_accuracy: 0.4459 - val_loss: 1.9956
Epoch 8/500
10/10 - 3s - 330ms/step - accuracy: 0.4840 - loss: 1.8652 - val_accuracy: 0.4865 - val_loss: 1.8689
Epoch 9/500
10/10 - 3s - 331ms/step - accuracy: 0.5379 - loss: 1.6960 - val_accuracy: 0.5405 - val_loss:

	uv.lock, data/external/.gitkeep, data/interim/.gitkeep, data/interim/processed_ds_random/test/data-00000-of-00001.arrow, data/interim/processed_ds_random/test/dataset_info.json, data/interim/processed_ds_random/test/state.json, data/interim/processed_ds_random/train/data-00000-of-00001.arrow, data/interim/processed_ds_random/train/dataset_info.json, data/interim/processed_ds_random/train/state.json, data/interim/processed_ds_random/valid/data-00000-of-00001.arrow, data/interim/processed_ds_random/valid/dataset_info.json, data/interim/processed_ds_random/valid/state.json, data/processed/.gitkeep, models/seq_model.keras.dvc, models/.gitignore, notebooks/1.0-sam-cnn-lstm-experiment.ipynb, notebooks/0.0.1-sam-copy-data-to-bucket.ipynb
