In [2]:
pip install --upgrade bottleneck

Collecting bottleneck
  Obtaining dependency information for bottleneck from https://files.pythonhosted.org/packages/e1/ab/92c1292d7abcd424936f24470afc70a62601bd61bf95761832dfc88764da/Bottleneck-1.4.1-cp311-cp311-win_amd64.whl.metadata
  Downloading Bottleneck-1.4.1-cp311-cp311-win_amd64.whl.metadata (8.1 kB)
Downloading Bottleneck-1.4.1-cp311-cp311-win_amd64.whl (111 kB)
   ---------------------------------------- 0.0/111.6 kB ? eta -:--:--
   ---------------------------------------- 111.6/111.6 kB 3.3 MB/s eta 0:00:00
Installing collected packages: bottleneck
  Attempting uninstall: bottleneck
    Found existing installation: Bottleneck 1.3.5
    Uninstalling Bottleneck-1.3.5:
      Successfully uninstalled Bottleneck-1.3.5
Successfully installed bottleneck-1.4.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import subprocess
import os

# Function to run the download.py script with the appropriate parameters
def download_yt8m_shards(dataset_type='train', partition='2/frame', mirror='us', shards=''):
    """
    Download YouTube-8M dataset shards by running the local ``download.py`` script.

    The script is started with the current working directory as its location,
    so ``download.py`` must exist there. Its stdout/stderr are captured and
    printed once the process has finished (nothing is shown while it runs).

    Args:
        dataset_type (str): Can be 'train', 'validate', or 'test'.
        partition (str): The partition path ('2/frame' for frame-level features).
        mirror (str): The mirror to use for downloading (e.g., 'us', 'eu', or 'asia').
        shards (str): Shard restriction, e.g. 'shard=1,1000'. An empty string
            adds no restriction.

    Returns:
        None. Progress and errors are reported with ``print``; a failing
        script does not raise.
    """
    # Local imports keep this block self-contained (the cell only imports
    # subprocess and os).
    import shlex
    import sys

    full_partition = f'{partition}/{dataset_type}'

    # Set environment variables for partition and mirror (kept from the
    # original implementation, which exported them process-wide).
    os.environ['partition'] = full_partition
    os.environ['mirror'] = mirror

    shard_tokens = shlex.split(shards) if shards else []

    # NOTE(review): the official YouTube-8M download.py is documented as taking
    # `partition`, `mirror` AND `shard` from environment variables. The original
    # code only passed the shard restriction on the command line, so it was
    # presumably ignored and the whole partition was requested. Export it to the
    # child process only, so it cannot leak into later calls.
    child_env = os.environ.copy()
    for token in shard_tokens:
        key, separator, value = token.partition('=')
        if separator and key == 'shard':
            child_env['shard'] = value

    # Argument list instead of a shell string: no quoting/injection issues, and
    # sys.executable guarantees the same interpreter as the running kernel.
    command = [
        sys.executable or 'python',
        'download.py',
        f'--partition={full_partition}',
        f'--mirror={mirror}',
        *shard_tokens,
    ]

    try:
        print(f"Starting download of {dataset_type} data... (Downloading {shards})")
        result = subprocess.run(command, capture_output=True, text=True, env=child_env)

        # Print the standard output and error
        if result.stdout:
            print(f"Standard Output:\n{result.stdout}")
        if result.stderr:
            print(f"Standard Error:\n{result.stderr}")

        result.check_returncode()  # Raises CalledProcessError on a non-zero exit code
        print(f"Successfully downloaded {dataset_type} data!")

    except subprocess.CalledProcessError as e:
        print(f"Error occurred while downloading {dataset_type} data: {e}")
        print(f"Error Output: {e.stderr}")

# Directory that must contain download.py; the script is run from here
data_dir = os.path.expanduser("~/data/yt8m/frame")
os.makedirs(data_dir, exist_ok=True)

# Fail fast with a clear message: makedirs may just have created an empty
# directory, in which case every download below would fail one after another.
download_script = os.path.join(data_dir, "download.py")
if not os.path.isfile(download_script):
    raise FileNotFoundError(
        f"download.py not found in {data_dir}. Fetch it first "
        "(see the YouTube-8M download instructions) and re-run this cell."
    )

# NOTE: this changes the kernel's working directory for all later cells.
os.chdir(data_dir)

# Download shards 1-3 (of 1000) of the training set ...
for shard_index in (1, 2, 3):
    download_yt8m_shards(
        dataset_type='train',
        partition='2/frame',
        mirror='us',
        shards=f'shard={shard_index},1000',
    )

# ... and shard 1 (of 1000) of the validation and test sets
download_yt8m_shards(dataset_type='validate', partition='2/frame', mirror='us', shards='shard=1,1000')
download_yt8m_shards(dataset_type='test', partition='2/frame', mirror='us', shards='shard=1,1000')


Starting download of train data... (Downloading shard=1,1000)


In [None]:
import subprocess
import os

# Function to run the download.py script with the appropriate parameters
# NOTE: this redefines (and shadows) the function of the same name from the
# previous cell; the only signature difference is the default dataset_type.
def download_yt8m_shards(dataset_type='validate', partition='2/frame', mirror='us', shards=''):
    """
    Download YouTube-8M dataset shards by running the local ``download.py`` script.

    The script is started with the current working directory as its location,
    so ``download.py`` must exist there. Its stdout/stderr are captured and
    printed once the process has finished (nothing is shown while it runs).

    Args:
        dataset_type (str): Can be 'train', 'validate', or 'test'.
        partition (str): The partition path ('2/frame' for frame-level features).
        mirror (str): The mirror to use for downloading (e.g., 'us', 'eu', or 'asia').
        shards (str): Shard restriction, e.g. 'shard=1,1000'. An empty string
            adds no restriction.

    Returns:
        None. Progress and errors are reported with ``print``; a failing
        script does not raise.
    """
    # Local imports keep this block self-contained (the cell only imports
    # subprocess and os).
    import shlex
    import sys

    full_partition = f'{partition}/{dataset_type}'

    # Set environment variables for partition and mirror (kept from the
    # original implementation, which exported them process-wide).
    os.environ['partition'] = full_partition
    os.environ['mirror'] = mirror

    shard_tokens = shlex.split(shards) if shards else []

    # NOTE(review): the official YouTube-8M download.py is documented as taking
    # `partition`, `mirror` AND `shard` from environment variables. The original
    # code only passed the shard restriction on the command line, so it was
    # presumably ignored and the whole partition was requested. Export it to the
    # child process only, so it cannot leak into later calls.
    child_env = os.environ.copy()
    for token in shard_tokens:
        key, separator, value = token.partition('=')
        if separator and key == 'shard':
            child_env['shard'] = value

    # Argument list instead of a shell string: no quoting/injection issues, and
    # sys.executable guarantees the same interpreter as the running kernel.
    command = [
        sys.executable or 'python',
        'download.py',
        f'--partition={full_partition}',
        f'--mirror={mirror}',
        *shard_tokens,
    ]

    try:
        print(f"Starting download of {dataset_type} data... (Downloading {shards})")
        result = subprocess.run(command, capture_output=True, text=True, env=child_env)

        # Print the standard output and error
        if result.stdout:
            print(f"Standard Output:\n{result.stdout}")
        if result.stderr:
            print(f"Standard Error:\n{result.stderr}")

        result.check_returncode()  # Raises CalledProcessError on a non-zero exit code
        print(f"Successfully downloaded {dataset_type} data!")

    except subprocess.CalledProcessError as e:
        print(f"Error occurred while downloading {dataset_type} data: {e}")
        print(f"Error Output: {e.stderr}")

# Directory that must contain download.py; the script is run from here
data_dir = os.path.expanduser("~/data/yt8m/frame")
os.makedirs(data_dir, exist_ok=True)

# Fail fast with a clear message: makedirs may just have created an empty
# directory, in which case every download below would fail one after another.
download_script = os.path.join(data_dir, "download.py")
if not os.path.isfile(download_script):
    raise FileNotFoundError(
        f"download.py not found in {data_dir}. Fetch it first "
        "(see the YouTube-8M download instructions) and re-run this cell."
    )

# NOTE: this changes the kernel's working directory for all later cells.
os.chdir(data_dir)

# Download shards 1-3 (of 1000) of the *test* set.
# (The previous comment said "validation", but the calls request 'test'.)
for shard_index in (1, 2, 3):
    download_yt8m_shards(
        dataset_type='test',
        partition='2/frame',
        mirror='us',
        shards=f'shard={shard_index},1000',
    )


Starting download of test data... (Downloading shard=1,1000)


In [10]:
import tensorflow as tf

# Schema used to decode one serialized Example read from the TFRecord files.
# NOTE(review): the download cells fetch the '2/frame' partition, while the
# keys below ('video_id', 'mean_rgb', 'mean_audio') look like video-level
# feature names - TODO confirm the records actually contain these keys.
# The two float features have all-zero defaults, so a record that lacks them
# parses to zeros instead of raising.
RGB_FEATURE_SIZE = 1024
AUDIO_FEATURE_SIZE = 128

feature_description = {
    # Scalar string; no default, so it is required in every record
    'video_id': tf.io.FixedLenFeature(shape=[], dtype=tf.string),
    # Variable number of int64 label ids per record (parsed as a SparseTensor)
    'labels': tf.io.VarLenFeature(dtype=tf.int64),
    # RGB features
    'mean_rgb': tf.io.FixedLenFeature(
        shape=[RGB_FEATURE_SIZE],
        dtype=tf.float32,
        default_value=[0.0] * RGB_FEATURE_SIZE,
    ),
    # Audio features
    'mean_audio': tf.io.FixedLenFeature(
        shape=[AUDIO_FEATURE_SIZE],
        dtype=tf.float32,
        default_value=[0.0] * AUDIO_FEATURE_SIZE,
    ),
}

# Decode one serialized record with the module-level feature schema
def _parse_function(proto):
    """Parse a single serialized example into a dict keyed by the names in ``feature_description``."""
    parsed_record = tf.io.parse_single_example(
        serialized=proto,
        features=feature_description,
    )
    return parsed_record

# Split a parsed record into (input features, label ids)
def extract_features_labels(record):
    """
    Turn one parsed record into a ``(features, labels)`` pair.

    :param record: Dict with at least the keys 'mean_rgb', 'mean_audio' and
                   'labels' (the latter as a sparse tensor)
    :return: Tuple of a dict holding the 'mean_rgb' and 'mean_audio' tensors,
             and the labels converted to a dense tensor (-1 fills any gaps)
    """
    # Densify the sparse label ids; positions without a value become -1
    dense_labels = tf.sparse.to_dense(record['labels'], default_value=-1)

    # Keep only the two model inputs, in the same order as before
    model_inputs = {key: record[key] for key in ('mean_rgb', 'mean_audio')}
    return model_inputs, dense_labels

# Function to load the dataset
def load_dataset(file_pattern, batch_size=32, limit=None):
    """
    Load and preprocess the dataset.

    :param file_pattern: Path pattern for the dataset (e.g., '*.tfrecord').
                         A leading '~' is expanded to the user's home directory.
    :param batch_size: Batch size for training
    :param limit: Limit number of records (applied before batching) for testing;
                  if None (or 0) all data is used
    :return: TensorFlow dataset yielding (features, labels) batches; label
             vectors shorter than the longest one in a batch are padded with -1
    :raises FileNotFoundError: if no file matches ``file_pattern``
    """
    # Local import: the cell defining this function only imports tensorflow.
    import os

    # tf.io.gfile.glob does not expand '~', so do it explicitly; patterns that
    # do not start with '~' are returned unchanged.
    expanded_pattern = os.path.expanduser(file_pattern)
    filenames = tf.io.gfile.glob(expanded_pattern)
    if not filenames:
        # Fail with a clear message instead of an obscure error (or a silently
        # empty dataset) further down the pipeline.
        raise FileNotFoundError(f"No TFRecord files match pattern: {expanded_pattern!r}")

    raw_dataset = tf.data.TFRecordDataset(filenames)
    parsed_dataset = raw_dataset.map(_parse_function)
    dataset = parsed_dataset.map(extract_features_labels)

    if limit:
        dataset = dataset.take(limit)  # Limit the dataset to a few records

    # Records carry a variable number of labels, which plain .batch() cannot
    # stack. Pad labels with -1 (the "missing label" marker); the fixed-size
    # feature tensors need no padding, their pad value (0) is only required to
    # mirror the element structure.
    feature_spec, label_spec = dataset.element_spec
    padding_values = (
        tf.nest.map_structure(lambda spec: tf.zeros([], dtype=spec.dtype), feature_spec),
        tf.cast(-1, label_spec.dtype),
    )
    dataset = dataset.padded_batch(batch_size, padding_values=padding_values)
    return dataset.prefetch(tf.data.AUTOTUNE)

# Build one small dataset per split (at most 5 records each) for quick experiments.
# NOTE(review): these patterns start with '~'; confirm that load_dataset (or the
# filesystem layer) expands it, otherwise no file will match.
# NOTE(review): the download cells run download.py from ~/data/yt8m/frame;
# confirm the shards really end up in train/, validate/ and test/ sub-directories.
train_dataset, validate_dataset, test_dataset = (
    load_dataset(f'~/data/yt8m/frame/{split_name}/*.tfrecord', batch_size=32, limit=5)
    for split_name in ('train', 'validate', 'test')
)

# Peek at the first training batch: feature shapes and the raw label ids
first_train_batch = train_dataset.take(1)
for features, labels in first_train_batch:
    rgb_batch = features['mean_rgb']
    audio_batch = features['mean_audio']
    print("Mean RGB shape:", rgb_batch.shape)
    print("Mean Audio shape:", audio_batch.shape)
    print("Labels:", labels.numpy())

