# Importing Dependencies

In [33]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from sklearn.model_selection import KFold
import tensorflow as tf

# Hyperparameters

In [34]:
class CFG:
    """Run-wide settings: cross-validation setup plus dataset and output locations."""

    # --- Cross-validation ---
    seed = 42      # random_state passed to the shuffling K-Fold splitter
    n_splits = 4   # number of folds the training rows are divided into

    # --- Filesystem locations (named *_url, but these are local paths) ---
    # JSON file mapping each sign name to its integer label
    label_dict_url = '/kaggle/input/asl-signs/sign_to_prediction_index_map.json'
    # CSV listing one training sample per row
    train_csv_url = '/kaggle/input/asl-signs/train.csv'
    # Prefix prepended to each row's relative parquet path
    parquet_base_url = '/kaggle/input/asl-signs/'
    # Prefix for the generated TFRecord files
    output_base_url = '/tmp/'

# Placeholder; overwritten with len(train_df) once train.csv has been read.
N_FILES = None

# Each parquet file stores this many rows per frame.
ROWS_PER_FRAME = 543

# Maximum number of train.csv rows written into a single TFRecord file.
CHUNK_SIZE = 512

# Chunk partitioning: the chunks of every fold are divided into N_PART slices
# and this run handles only slice number `part` (0-based).
N_PART = 1
part = 0

# Total number of folds. Not referenced by the cells shown here, which read
# CFG.n_splits instead; keep the two values in sync.
FOLD = 4

# Loading Dataset

In [35]:
# Sign-name -> integer-label lookup, parsed from the JSON map at CFG.label_dict_url.
with open(CFG.label_dict_url) as label_map_file:
    LABEL_DICT = json.load(label_map_file)

# Training index table; its row count is recorded as the number of files to convert.
train_df = pd.read_csv(CFG.train_csv_url)
N_FILES = train_df.shape[0]

# Dataset Preprocessing

In [36]:
def load_relevant_data_subset(pq_path):
    """Load the x/y/z columns of one parquet file as a per-frame float32 array.

    The file must contain a whole number of frames, each occupying
    ROWS_PER_FRAME consecutive rows.

    Args:
        pq_path: Path of the parquet file to read.

    Returns:
        np.ndarray of dtype float32 with shape (n_frames, ROWS_PER_FRAME, 3).

    Raises:
        ValueError: If the row count is not a multiple of ROWS_PER_FRAME.
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)

    # Exact integer arithmetic for the frame count (no float division + truncation),
    # and a clear error up front instead of an opaque reshape failure.
    n_frames, leftover_rows = divmod(len(data), ROWS_PER_FRAME)
    if leftover_rows:
        raise ValueError(
            f"{pq_path}: {len(data)} rows is not a multiple of "
            f"ROWS_PER_FRAME={ROWS_PER_FRAME}"
        )

    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

def encode_row(row):
    """Serialize one training sample into a tf.train.Example byte string.

    Args:
        row: A train.csv record exposing `path`, `participant_id`,
            `sequence_id` and `sign` as attributes.

    Returns:
        The serialized Example, with features:
          - 'coordinates': raw bytes of the float32 landmark array
          - 'participant_id', 'sequence_id': int64 identifiers
          - 'sign': int64 label looked up in LABEL_DICT
    """
    def _int64_feature(value):
        # Wrap a single integer as an int64 Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

    landmarks = load_relevant_data_subset(f"{CFG.parquet_base_url}{row.path}")
    feature_map = {
        'coordinates': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[landmarks.tobytes()])
        ),
        'participant_id': _int64_feature(row.participant_id),
        'sequence_id': _int64_feature(row.sequence_id),
        'sign': _int64_feature(LABEL_DICT[row.sign]),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()

def process_chunk(chunk, tfrecord_name):
    """Write every row of `chunk` into one GZIP-compressed TFRecord file.

    Args:
        chunk: DataFrame whose rows are each encoded with encode_row.
        tfrecord_name: Destination path of the TFRecord file.
    """
    gzip_options = tf.io.TFRecordOptions(compression_type='GZIP', compression_level=9)
    with tf.io.TFRecordWriter(tfrecord_name, options=gzip_options) as writer:
        progress = tqdm(chunk.iterrows(), total=len(chunk), desc=f"Processing {tfrecord_name}")
        # The index label from iterrows() is not needed, only the row itself.
        for _, sample in progress:
            writer.write(encode_row(sample))

def split_dataframe(df, chunk_size=10000):
    """Split a DataFrame into consecutive row chunks of at most `chunk_size` rows.

    Chunks are taken by row position (not by index label), in order; the last
    chunk holds the remainder. An empty DataFrame yields an empty list.

    Args:
        df: DataFrame to split.
        chunk_size: Maximum number of rows per chunk; must be >= 1.

    Returns:
        List of DataFrame slices covering every row of `df` exactly once.

    Raises:
        ValueError: If `chunk_size` is smaller than 1 (a negative size would
            otherwise silently produce no chunks and drop all rows).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    # .iloc makes the positional slicing explicit, independent of the index type.
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

## K-Fold Cross-Validation

In [37]:
# Work on a copy so that train_df itself is left untouched.
train_folds = train_df.copy()

kfold = KFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
print(f'{CFG.n_splits}-fold training with {len(train_folds)} samples')

# NOTE(review): plain KFold splits rows without regard to participant_id; if
# folds are meant to be participant-disjoint a grouped splitter is needed - confirm intent.

# KFold.split yields *positional* row indices. Collect the assignments in a
# positional NumPy array rather than writing through label-based .loc, which
# is only correct while the frame carries a default RangeIndex.
fold_of_row = np.full(len(train_folds), -1, dtype=np.int64)  # -1 = not yet assigned
for fold_idx, (train_idx, valid_idx) in enumerate(kfold.split(train_folds)):
    fold_of_row[valid_idx] = fold_idx
    print(f'Fold {fold_idx}: Train {len(train_idx)}, Validation {len(valid_idx)}')
train_folds['fold'] = fold_of_row

# Sanity checks: every row belongs to exactly one fold and all folds are used.
assert not (train_folds['fold'] == -1).any(), "Some rows were not assigned to any fold"
assert train_folds['fold'].nunique() == CFG.n_splits, "Mismatch in the number of folds"

# Export every fold as a set of TFRecord files, one file per chunk.
for fold in range(CFG.n_splits):
    # Rows whose validation fold is `fold`, cut into pieces of at most CHUNK_SIZE rows.
    rows = train_folds.loc[train_folds['fold'] == fold]
    chunks = split_dataframe(rows, CHUNK_SIZE)

    # Keep only the slice of chunks owned by partition `part`; the final
    # partition also takes whatever the integer division left over.
    part_size = len(chunks) // N_PART
    if part == N_PART - 1:
        last = len(chunks)
    else:
        last = (part + 1) * part_size
    chunks = chunks[part * part_size:last]

    # One worker task per chunk. Output name pattern:
    #   fold<fold>-<chunk position within this partition>-<rows in chunk>.tfrecords
    N = list(map(len, chunks))
    _ = Parallel(n_jobs=cpu_count())(
        delayed(process_chunk)(x, f"{CFG.output_base_url}fold{fold}-{i}-{n}.tfrecords")
        for i, (x, n) in enumerate(zip(chunks, N))
    )

4-fold training with 94477 samples
Fold 0: Train 70857, Validation 23620
Fold 1: Train 70858, Validation 23619
Fold 2: Train 70858, Validation 23619
Fold 3: Train 70858, Validation 23619


Processing /tmp/fold0-0-512.tfrecords: 100%|██████████| 512/512 [00:17<00:00, 29.78it/s]
Processing /tmp/fold0-4-512.tfrecords:   1%|          | 4/512 [00:00<00:13, 38.88it/s]
Processing /tmp/fold0-5-512.tfrecords:   1%|▏         | 7/512 [00:00<00:07, 65.10it/s]
Processing /tmp/fold0-6-512.tfrecords:   0%|          | 0/512 [00:00<?, ?it/s]
Processing /tmp/fold0-4-512.tfrecords: 100%|██████████| 512/512 [00:16<00:00, 31.16it/s]
Processing /tmp/fold0-8-512.tfrecords:   3%|▎         | 15/512 [00:00<00:12, 41.11it/s]
Processing /tmp/fold0-8-512.tfrecords:   4%|▍         | 20/512 [00:00<00:14, 34.53it/s]
Processing /tmp/fold0-10-512.tfrecords:   0%|          | 0/512 [00:00<?, ?it/s]
Processing /tmp/fold0-11-512.tfrecords:  97%|█████████▋| 496/512 [00:15<00:00, 36.69it/s]
Processing /tmp/fold0-12-512.tfrecords:   0%|          | 0/512 [00:00<?, ?it/s]
Processing /tmp/fold0-10-512.tfrecords:  97%|█████████▋| 495/512 [00:16<00:00, 26.50it/s]
Processing /tmp/fold0-12-512.tfrecords:  12%|█▏      