## 1. Importing libraries

In [38]:
import os
import pandas as pd
import pickle

# Root folder that holds the 'train', 'validation' and 'test' video folders.
# The default is the original local checkout; set the FI_DATA_FOLDER environment
# variable to run the notebook against a copy stored somewhere else.
DATA_FOLDER = os.environ.get(
    "FI_DATA_FOLDER",
    "/home/yeray142/Documents/projects/multimodal-exercise/data/dataset",
)

## 2. Reading data

In [39]:
# The age annotations sit next to the dataset folder, in
# <parent of DATA_FOLDER>/annotations/fi_age_labels, so derive the location from
# DATA_FOLDER instead of repeating the absolute path.
AGE_LABELS_FOLDER = os.path.join(
    os.path.dirname(os.path.normpath(DATA_FOLDER)), 'annotations', 'fi_age_labels'
)
age_dev = pd.read_csv(os.path.join(AGE_LABELS_FOLDER, 'age_anno_dev.csv'))
age_test = pd.read_csv(os.path.join(AGE_LABELS_FOLDER, 'age_anno_test.csv'))

In [40]:
# Preview the first rows of the dev annotations to see which columns were loaded.
age_dev.head()

Unnamed: 0,VideoName,YouTubeID,AgeGroup
0,--Ymqszjv54.001.mp4,--Ymqszjv54,5
1,--Ymqszjv54.003.mp4,--Ymqszjv54,5
2,--Ymqszjv54.004.mp4,--Ymqszjv54,5
3,--Ymqszjv54.005.mp4,--Ymqszjv54,5
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2


In [41]:
# Build the path of each split's video folder under DATA_FOLDER.
test_videos, train_videos, validation_videos = (
    os.path.join(DATA_FOLDER, split_folder)
    for split_folder in ('test', 'train', 'validation')
)

In [42]:
# Load the transcription pickle of each split from <parent of DATA_FOLDER>/annotations.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
ANNOTATIONS_FOLDER = os.path.join(os.path.dirname(os.path.normpath(DATA_FOLDER)), 'annotations')


def load_transcriptions(split_suffix):
    """Return the object unpickled from ``transcription_<split_suffix>.pkl``.

    Args:
        split_suffix: Suffix used in the pickle file name
            ('test', 'training' or 'validation').
    """
    pickle_path = os.path.join(ANNOTATIONS_FOLDER, f'transcription_{split_suffix}.pkl')
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)


transcriptions_test = load_transcriptions('test')
transcriptions_train = load_transcriptions('training')
transcriptions_validation = load_transcriptions('validation')

The following function will be used to get the video names from the folders:

In [43]:
# List the files of a folder, optionally restricted to given extensions
def get_video_names(folder, extensions=None):
    """Return the names of the regular files directly inside `folder`.

    Args:
        folder: Path of the directory to scan; sub-directories are skipped.
        extensions: Optional suffix, or iterable of suffixes, such as
            ``'.mp4'`` or ``('.mp4', '.avi')``. When given, only file names
            ending with one of them are returned, which keeps side files such
            as ``metadata.csv`` out of the result. ``None`` (the default)
            returns every file.

    Returns:
        A list of file names (not full paths) in ``os.listdir`` order.
    """
    names = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    if extensions is None:
        return names
    # str.endswith only accepts a str or a tuple of str, so normalise the input.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)
    return [f for f in names if f.endswith(suffixes)]

# List the files of each split straight from its folder under DATA_FOLDER.
# Building the path here, rather than reading it from the variable that is
# about to be overwritten with the list, keeps this cell safe to re-run.
test_videos = get_video_names(os.path.join(DATA_FOLDER, 'test'))
train_videos = get_video_names(os.path.join(DATA_FOLDER, 'train'))
validation_videos = get_video_names(os.path.join(DATA_FOLDER, 'validation'))
print(test_videos[:5])

['VTv4BAYgJpk.000.mp4', 'Lz3hYPF6aIM.002.mp4', 'U-XrzfFzMkg.001.mp4', '53QFyec0uN0.000.mp4', '-N6QKrbnaDs.001.mp4']


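Let's check that every file in the split folders is listed in the `age_dev` and `age_test` dataframes. Each folder's own `metadata.csv` is not a video, so it is the only file we expect to be reported here: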

In [44]:
def check_videos_in_df(videos, df):
    """Print every name in `videos` that is absent from ``df['VideoName']``.

    Args:
        videos: Iterable of file names to look up.
        df: DataFrame with a 'VideoName' column.

    Returns:
        None. One ``"<name> not in df"`` line is printed per missing name, in
        the order the names appear in `videos`.
    """
    # Build the lookup once; testing against the raw column would rescan every
    # row for each video.
    known_names = set(df['VideoName'])
    for video in videos:
        if video not in known_names:
            print(f"{video} not in df")

# Run the membership check for every split against its annotation dataframe.
for split_videos, annotations in (
    (test_videos, age_test),
    (train_videos, age_dev),
    (validation_videos, age_dev),
):
    check_videos_in_df(split_videos, annotations)

metadata.csv not in df
metadata.csv not in df
metadata.csv not in df


## 3. Split dev into train and validation based on the folders

In [45]:
# Label every dev row by folder membership: rows whose video is listed in the
# validation folder become 'validation', all remaining rows become 'train'.
age_dev['split'] = (
    age_dev['VideoName']
    .isin(validation_videos)
    .map({True: 'validation', False: 'train'})
)
age_dev['split'].value_counts()

split
train         6000
validation    2000
Name: count, dtype: int64

In [46]:
# Preview the dev annotations again, now including the new 'split' column.
age_dev.head()

Unnamed: 0,VideoName,YouTubeID,AgeGroup,split
0,--Ymqszjv54.001.mp4,--Ymqszjv54,5,train
1,--Ymqszjv54.003.mp4,--Ymqszjv54,5,train
2,--Ymqszjv54.004.mp4,--Ymqszjv54,5,train
3,--Ymqszjv54.005.mp4,--Ymqszjv54,5,train
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2,train


In [47]:
# Carve the dev annotations into one dataframe per split label.
age_train = age_dev.loc[age_dev['split'].eq('train')]
age_validation = age_dev.loc[age_dev['split'].eq('validation')]

In [48]:
# The helper 'split' column is no longer needed once the frames are separated.
age_train = age_train.drop('split', axis=1)
age_validation = age_validation.drop('split', axis=1)
age_train.head()

Unnamed: 0,VideoName,YouTubeID,AgeGroup
0,--Ymqszjv54.001.mp4,--Ymqszjv54,5
1,--Ymqszjv54.003.mp4,--Ymqszjv54,5
2,--Ymqszjv54.004.mp4,--Ymqszjv54,5
3,--Ymqszjv54.005.mp4,--Ymqszjv54,5
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2


In [49]:
# Function to add transcriptions to a dataframe
def add_transcriptions(df, transcriptions_dict):
    """Return a copy of `df` with a 'transcription' column looked up by video name.

    Args:
        df: DataFrame with a 'VideoName' column.
        transcriptions_dict: Mapping from video name to its transcription.

    Returns:
        A new DataFrame with the same rows and index as `df` plus a
        'transcription' column; `df` itself is left untouched. Rows whose video
        name is missing from `transcriptions_dict` get an empty string, and a
        message naming the video is printed for each of them.
    """
    transcriptions = []
    for video_name in df['VideoName']:
        if video_name in transcriptions_dict:
            transcriptions.append(transcriptions_dict[video_name])
        else:
            # Keep the row, but make the gap visible instead of failing.
            print(f"{video_name} not in transcriptions_dict")
            transcriptions.append('')
    # assign() builds a new frame, so the caller's dataframe is never mutated
    # and duplicate index labels cannot cause rows to be overwritten.
    return df.assign(transcription=transcriptions)

# Add transcriptions to each dataframe
age_train = add_transcriptions(age_train, transcriptions_train)
age_validation = add_transcriptions(age_validation, transcriptions_validation)
age_test = add_transcriptions(age_test, transcriptions_test)

# Check how many rows actually found a transcription. Missing videos are stored
# as '' (never NaN), so notna() would always report every row; count the video
# names that are present in each transcription dict instead.
for split_label, split_df, split_transcriptions in (
    ('Train', age_train, transcriptions_train),
    ('Validation', age_validation, transcriptions_validation),
    ('Test', age_test, transcriptions_test),
):
    matched = split_df['VideoName'].isin(set(split_transcriptions)).sum()
    print(f"{split_label} transcriptions added: {matched}/{len(split_df)}")

Train transcriptions added: 6000/6000
Validation transcriptions added: 2000/2000
Test transcriptions added: 2000/2000


In [50]:
# Preview the training rows to confirm the 'transcription' column was filled in.
age_train.head()

Unnamed: 0,VideoName,YouTubeID,AgeGroup,transcription
0,--Ymqszjv54.001.mp4,--Ymqszjv54,5,I like Tabasco sauce. I like Louisiana Hot Sau...
1,--Ymqszjv54.003.mp4,--Ymqszjv54,5,Lot more things there. Then the menus are a lo...
2,--Ymqszjv54.004.mp4,--Ymqszjv54,5,There you go and red. See? [yelling 00:00:04]....
3,--Ymqszjv54.005.mp4,--Ymqszjv54,5,This doesn't have a name on it. This is from O...
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2,I'm thinking with how much time and energy I'm...


## 4. Mapping column names

We rename the columns to the names used in the Hugging Face `metadata.csv` files; in particular, `file_name` is the column that links each row to its video file:

In [51]:
# Original annotation column name -> column name used in metadata.csv.
rename_mapping = dict(
    VideoName='file_name',
    YouTubeID='video_id',
    AgeGroup='age_group',
)

age_train = age_train.rename(columns=rename_mapping)
age_validation = age_validation.rename(columns=rename_mapping)
age_test = age_test.rename(columns=rename_mapping)
age_train.head()

Unnamed: 0,file_name,video_id,age_group,transcription
0,--Ymqszjv54.001.mp4,--Ymqszjv54,5,I like Tabasco sauce. I like Louisiana Hot Sau...
1,--Ymqszjv54.003.mp4,--Ymqszjv54,5,Lot more things there. Then the menus are a lo...
2,--Ymqszjv54.004.mp4,--Ymqszjv54,5,There you go and red. See? [yelling 00:00:04]....
3,--Ymqszjv54.005.mp4,--Ymqszjv54,5,This doesn't have a name on it. This is from O...
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2,I'm thinking with how much time and energy I'm...


Next, let's save each dataframe as the `metadata.csv` file of its split folder:

In [33]:
# Write each split's metadata next to its videos, as <split>/metadata.csv under
# DATA_FOLDER. 'Unnamed: 0' appears to be a row index carried over from the
# annotation CSVs; drop it (if present) so it does not end up as an extra
# dataset feature.
for split_folder, split_metadata in (
    ('train', age_train),
    ('validation', age_validation),
    ('test', age_test),
):
    split_metadata.drop(columns=['Unnamed: 0'], errors='ignore').to_csv(
        os.path.join(DATA_FOLDER, split_folder, 'metadata.csv'), index=False
    )

## 5. Push to Hugging Face Hub

In [34]:
from datasets import load_dataset

# Build the dataset from the split folders under DATA_FOLDER, reusing the
# constant so this cell cannot drift from the folder the metadata was written to.
dataset = load_dataset("videofolder", data_dir=DATA_FOLDER)

Resolving data files:   0%|          | 0/6001 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2001 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2001 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/6001 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/2001 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/2001 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [35]:
# Show the splits, feature names and row counts of the loaded dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['video', 'video_id', 'age_group', 'transcription'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['video', 'video_id', 'age_group', 'transcription'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['video', 'video_id', 'age_group', 'transcription'],
        num_rows: 2000
    })
})

In [36]:
# Look at the 'video' field of the first training example to see how it is decoded.
dataset['train'][0]['video']

<torchvision.io.video_reader.VideoReader at 0x7db38324e3f0>

In [37]:
from huggingface_hub import HfApi

api = HfApi()

# Upload the whole dataset folder to the Hub dataset repository. The path comes
# from DATA_FOLDER so the uploaded folder is the one used by the cells above.
api.upload_large_folder(
    folder_path=DATA_FOLDER,
    repo_id="yeray142/first-impressions-v2",
    repo_type="dataset",
)

Recovering from metadata files:   0%|          | 0/10003 [00:00<?, ?it/s]




---------- 2025-03-25 11:16:43 (0:00:00) ----------
Files:   hashed 10002/10003 (29.4G/29.5G) | pre-uploaded: 10000/10000 (29.4G/29.5G) (+3 unsure) | committed: 10000/10003 (29.4G/29.5G) | ignored: 0
Workers: hashing: 1 | get upload mode: 2 | pre-uploading: 0 | committing: 0 | waiting: 11
---------------------------------------------------
