## Download training data (class labels and MPCC features extracted from raw audio)
#### Only need to run this code once
The unbalanced_train_segments is probably the most important since the balanced_train_segments and eval_segments only have >59 examples of each class

In [1]:
# !wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
# !wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv
# !wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
# !wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
# !wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/features/features.tar.gz
# !tar xvzf features.tar.gz

### Figuring out what I just downloaded

Hypothesis: Each TFRecord contains data for a set of videos. 

Example: nE.tfrecord contains data for all the youtube videos whose IDs start with "nE"

For each of those videos, there is a sequence example which contains a 128 length feature vector for each second of data (so there are 128x10=1280 features). I also think that for each video there is a set of labels that describe the whole video (i.e. this 10-second clip contains laughter and crying) 

In [2]:
import tensorflow as tf

filename = 'audioset_v1_embeddings/unbal_train/nE.tfrecord'
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

2024-06-30 03:00:19.812048: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-30 03:00:19.812891: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-30 03:00:19.815367: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-30 03:00:19.822661: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 03:00:19.841702: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [3]:
import numpy as np

# for raw_record in raw_dataset.take(1):
for raw_record in raw_dataset.take(100):
    example = tf.train.SequenceExample()
    example.ParseFromString(raw_record.numpy())

    # print(example)
    # for i in range(10):
    #     print(f"{i}: {np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0],np.uint8).astype(np.float32)}")
    # context = [example.context.feature[i] for i in range(4)]
    # print(len(example.feature_lists.feature_list['audio_embedding']))
    if len(example.feature_lists.feature_list['audio_embedding'].feature) != 10:
        print(example)
        break
    # print(len(example.feature_lists.feature_list['audio_embedding'].feature))
    # print(example.context)
    # print(example.context.feature["video_id"].bytes_list.value[0])
    # print(example.context.feature["start_time_seconds"].float_list.value[0])
    # print(example.context.feature["labels"].int64_list.value)
    # print(example.context.feature["end_time_seconds"].float_list.value[0])

context {
  feature {
    key: "video_id"
    value {
      bytes_list {
        value: "nE-qCFeZaCw"
      }
    }
  }
  feature {
    key: "start_time_seconds"
    value {
      float_list {
        value: 0
      }
    }
  }
  feature {
    key: "labels"
    value {
      int64_list {
        value: 0
        value: 300
        value: 316
      }
    }
  }
  feature {
    key: "end_time_seconds"
    value {
      float_list {
        value: 6
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "audio_embedding"
    value {
      feature {
        bytes_list {
          value: "2jt\013\342a\217z\226\231\223\356\247\205\346\233/\210Z\243\250\204\214\207\242\224\260P\230@u}|\377\233.\265\\K\277\246\225\320uZ*\005e\366Nd+{\334NJ8\212\377t\316A\310\231i\2279\254s\247\235U\301\026?\313<^\207{\0107\312eNa\216}\210N$\227\264\260\000 j\307\264j\302\327\301@n\210\264k{\255\377\000M\264\215\207\000\377\377d_4\377p)\377\205\341"
        }
      }
      feature {
        bytes_list {
 

In [4]:
raw_dataset.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=-2>

In [5]:
def parse_feature_data(raw_dataset):
    """
    This function takes the raw dataset and produces a table that contains the raw 128x10 features and a column that is the presence of laughter (0 or 1)

    Note: we will skip videos that aren't 10 seconds long
    """
    laugh_labels = [16, 17, 18, 19, 20, 21]

    extracted_table = []
    
    for raw_record in raw_dataset:
        example = tf.train.SequenceExample()
        example.ParseFromString(raw_record.numpy())

        if len(example.feature_lists.feature_list['audio_embedding'].feature) != 10:
            continue

        audio_features_10s = [np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0],np.uint8).astype(np.float32) for i in range(10)]
        # audio_features = np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[0].bytes_list.value[0],np.uint8).astype(np.float32)
        # video_id = example.context.feature["video_id"].bytes_list.value[0]
        # start_time_secs = example.context.feature["start_time_seconds"].float_list.value[0]
        labels = example.context.feature["labels"].int64_list.value
        # end_time_secs = example.context.feature["end_time_seconds"].float_list.value[0]

        has_laughter = any([label in laugh_labels for label in labels])

        flat_audio_features = [j for sub in audio_features_10s for j in sub]

        extracted_data = flat_audio_features + [1 if has_laughter else 0]
        extracted_table.append(extracted_data)

    return extracted_table
    

In [6]:
extracted_table_nE = parse_feature_data(raw_dataset)

2024-06-30 03:00:22.253740: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
np_table_nE = np.array(extracted_table_nE)

In [8]:
np_table_nE.shape

(381, 1281)

In [9]:
np_table_nE[:, 1280].sum()

2.0

The above cell shows that in the nE.tfrecord (all videos starting with "nE") there are only *two* instances of laughter

### I've successfully extracted a table of data from the nE.tfrecord (shape: 381x1281). Now all I have to do is repeat this process for all of the TFRecords and I will have my whole dataset.

In [10]:
from pathlib import Path

home_path = Path('/home/vedrau/work/laugh_track')
unbal_audio_features_path = home_path / 'audioset_v1_embeddings' / 'unbal_train'

In [11]:
unbal_filenames = [str(e) for e in unbal_audio_features_path.iterdir()]

In [12]:
unbal_filenames

['/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/lO.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Zj.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Yk.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Xg.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/I-.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/1T.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Br.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Ig.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/fU.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/Ma.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/kT.tfrecord',
 '/home/vedrau/work/laugh_track/audioset_v1_embeddings/unbal_train/GL.tfrecord',
 '/home/vedrau/work/laugh_tr

In [13]:
test_dataset = tf.data.TFRecordDataset(unbal_filenames[:2])

In [14]:
test_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [15]:
test_extracted_table = parse_feature_data(test_dataset)

2024-06-30 03:00:22.690566: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [16]:
np_test_table = np.array(test_extracted_table)

In [17]:
np_test_table.shape

(838, 1281)

In [18]:
np_test_table[:,1280].sum()

3.0

## Creating the final data product using all of the TFRecords (starting with bal_train)

In [19]:
bal_audio_features_path = home_path / 'audioset_v1_embeddings' / 'bal_train'

In [20]:
bal_filenames = [str(e) for e in bal_audio_features_path.iterdir()]

In [21]:
all_dataset = tf.data.TFRecordDataset(filenames)

In [22]:
all_extracted_table = parse_feature_data(all_dataset)

2024-06-30 03:00:23.021522: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [23]:
len(bal_filenames), len(unbal_filenames)

(4070, 4096)

The way to load only the balanced train data is to load the balanced_train_segments.csv and only read the videos whose IDs are in the CSV. That data is guaranteed to include at least 59 examples of laughter (although I think more like 59*2.5 examples since there are 5 laugh labels).

In [24]:
import pandas as pd
bal_train_segs = pd.read_csv('balanced_train_segments.csv',header=2, quotechar=r'"',skipinitialspace=True)

In [25]:
bal_train_segs.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels
0,--PJHxphWEs,30.0,40.0,"/m/09x0r,/t/dd00088"
1,--ZhevVpy1s,50.0,60.0,/m/012xff
2,--aE2O5G5WE,0.0,10.0,"/m/03fwl,/m/04rlf,/m/09x0r"
3,--aO5cdqSAg,30.0,40.0,"/t/dd00003,/t/dd00005"
4,--aaILOrkII,200.0,210.0,"/m/032s66,/m/073cg4"


In [39]:
def parse_laughter_feature_data(dataset):
    """
    This function takes the dataset and produces a table of the laughter data (the raw 128x10 features and a column that is the presence of laughter)

    Note: we will skip videos that aren't 10 seconds long
    """
    laugh_labels = [16, 17, 18, 19, 20, 21]

    extracted_table = []
    
    for record in dataset:
        example = tf.train.SequenceExample()
        example.ParseFromString(record.numpy())

        if len(example.feature_lists.feature_list['audio_embedding'].feature) != 10:
            continue

        audio_features_10s = [np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0],np.uint8).astype(np.float32) for i in range(10)]
        # audio_features = np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[0].bytes_list.value[0],np.uint8).astype(np.float32)
        # video_id = example.context.feature["video_id"].bytes_list.value[0]
        # start_time_secs = example.context.feature["start_time_seconds"].float_list.value[0]
        labels = example.context.feature["labels"].int64_list.value
        # end_time_secs = example.context.feature["end_time_seconds"].float_list.value[0]

        flat_audio_features = [j for sub in audio_features_10s for j in sub]
        
        has_laughter: bool = any([label in laugh_labels for label in labels])
        if has_laughter:
            # this should always be appending ones
            extracted_data = flat_audio_features + [1 if has_laughter else 0]
            extracted_table.append(extracted_data)

    return extracted_table


In [27]:
unbal_dataset = tf.data.TFRecordDataset(unbal_filenames)

In [28]:
laughter_features = parse_laughter_feature_data(unbal_dataset)

2024-06-30 03:07:25.371927: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [29]:
np_laughter_features = np.array(laughter_features)

In [30]:
np_laughter_features.shape

(9215, 1281)

In [31]:
np.save('unbal_laughter_features.npy', np_laughter_features)

### Downsample the total laughter features

In [37]:
num_rows = 100
idxs = np.random.choice(list(range(9215)), size=num_rows, replace=False)

In [38]:
np_laughter_features[idxs].shape

(100, 1281)

In [40]:
def parse_non_laughter_feature_data(dataset, num_rows):
    """
    This function takes the dataset and produces a table of the non-laughter data (the raw 128x10 features and a column that is the presence of laughter)

    Note: we will skip videos that aren't 10 seconds long
    """
    laugh_labels = [16, 17, 18, 19, 20, 21]

    extracted_table = []
    
    for record in dataset:
        # we only want to extract num_rows rows
        if len(extracted_table) >= num_rows:
            break
            
        example = tf.train.SequenceExample()
        example.ParseFromString(record.numpy())

        if len(example.feature_lists.feature_list['audio_embedding'].feature) != 10:
            continue

        labels = example.context.feature["labels"].int64_list.value
        has_laughter: bool = any([label in laugh_labels for label in labels])
        if not has_laughter:
            audio_features_10s = [np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0],np.uint8).astype(np.float32) for i in range(10)]
            # audio_features = np.frombuffer(example.feature_lists.feature_list['audio_embedding'].feature[0].bytes_list.value[0],np.uint8).astype(np.float32)
            # video_id = example.context.feature["video_id"].bytes_list.value[0]
            # start_time_secs = example.context.feature["start_time_seconds"].float_list.value[0]
            
            # end_time_secs = example.context.feature["end_time_seconds"].float_list.value[0]
    
            flat_audio_features = [j for sub in audio_features_10s for j in sub]

            # this should always be appending zeroes
            extracted_data = flat_audio_features + [1 if has_laughter else 0]
            extracted_table.append(extracted_data)


    return extracted_table

In [41]:
non_laughter_features = parse_non_laughter_feature_data(unbal_dataset, num_rows)

In [43]:
np_non_laughter_features = np.array(non_laughter_features)

In [44]:
np_non_laughter_features.shape

(100, 1281)

In [45]:
np.save('non_laughter_features_100.npy', np_non_laughter_features)

In [46]:
np_laughter_features_100 = np_laughter_features[idxs]

In [48]:
np.save('laughter_features_100.npy', np_laughter_features_100)

### Create a CSV of the laughter and non-laughter data

In [49]:
np_laughter_features_100.shape

(100, 1281)

In [51]:
np_non_laughter_features.shape

(100, 1281)

In [53]:
all_features = np.concatenate((np_laughter_features_100, np_non_laughter_features))

In [65]:
# I have a 200 x 1281 numpy array. I want to get a CSV with 200 rows and 1281 columns
data_dict = {}
for i in range(1280):
    data_dict[f"x_{i}"] = list(all_features[:, i])
data_dict["y"] = list(all_features[:, 1280])

In [66]:
df = pd.DataFrame.from_dict(data_dict)

In [72]:
df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_1271,x_1272,x_1273,x_1274,x_1275,x_1276,x_1277,x_1278,x_1279,y
0,135.0,124.0,180.0,116.0,238.0,62.0,109.0,31.0,155.0,149.0,...,69.0,166.0,87.0,149.0,25.0,255.0,71.0,101.0,42.0,1.0
1,135.0,103.0,255.0,187.0,137.0,156.0,113.0,30.0,117.0,113.0,...,255.0,200.0,72.0,176.0,99.0,225.0,29.0,0.0,148.0,1.0
2,0.0,133.0,193.0,8.0,69.0,255.0,57.0,141.0,117.0,110.0,...,143.0,91.0,125.0,62.0,177.0,172.0,0.0,243.0,0.0,1.0
3,43.0,63.0,160.0,15.0,77.0,159.0,4.0,91.0,206.0,146.0,...,215.0,129.0,199.0,255.0,113.0,36.0,43.0,0.0,197.0,1.0
4,119.0,78.0,166.0,95.0,175.0,91.0,63.0,61.0,227.0,89.0,...,63.0,195.0,61.0,197.0,70.0,80.0,0.0,0.0,220.0,1.0


In [74]:
df.to_csv('tabular_data_100_features.csv', index=False)