In [1]:
import json
import pandas as pd
import os
from random import seed, sample
import shutil

In [2]:
def load_metadata(metadata_path):
    """Load metadata from a JSON file and return a DataFrame."""
    with open(metadata_path) as f:
        metadata = json.load(f)
    df = pd.DataFrame.from_dict(metadata, orient='index')
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'video_filename'}, inplace=True)
    return df

In [14]:
# Part 0
part0_path = '../../data/dfdc_train_part_11/metadata.json'
part0_df = load_metadata(part0_path)
part0_df

Unnamed: 0,video_filename,label,split,original
0,qgqsgtekwl.mp4,FAKE,train,ztutsnlhtr.mp4
1,bahpguunin.mp4,FAKE,train,frczmdfzza.mp4
2,qekttcqtpm.mp4,FAKE,train,zesnphumru.mp4
3,zxuqoykuqj.mp4,FAKE,train,dovdrtvmbx.mp4
4,ztfjilznzu.mp4,REAL,train,
...,...,...,...,...
2113,nrsqnrcidv.mp4,FAKE,train,thfzywypts.mp4
2114,hhmflhhbih.mp4,FAKE,train,jbgklqgife.mp4
2115,lkgamwbygt.mp4,FAKE,train,eipwxqahgp.mp4
2116,rgmllhylbj.mp4,FAKE,train,usnyozppis.mp4


In [15]:
# Part 1
part1_path = '../../data/dfdc_train_part_12/metadata.json'
part1_df = load_metadata(part1_path)
part1_df

Unnamed: 0,video_filename,label,split,original
0,ufucquemsr.mp4,FAKE,train,ioxuwhhvge.mp4
1,hzpeccyres.mp4,FAKE,train,guupwbnbzc.mp4
2,qlrzbtcbbh.mp4,FAKE,train,erufbzxadr.mp4
3,voztvddhnf.mp4,FAKE,train,pejtahlmir.mp4
4,hollmfvplq.mp4,FAKE,train,mpbvaumtlo.mp4
...,...,...,...,...
2220,zgvjfpzubp.mp4,REAL,train,
2221,pnxmphjyyo.mp4,REAL,train,
2222,oathlhqdnz.mp4,FAKE,train,wcdmpwfooh.mp4
2223,oexoavedkb.mp4,FAKE,train,zgvjfpzubp.mp4


In [16]:
# Part 2
part2_path = '../../data/dfdc_train_part_13/metadata.json'
part2_df = load_metadata(part2_path)
part2_df

Unnamed: 0,video_filename,label,split,original
0,wxiboojdnq.mp4,FAKE,train,vncfwkhxmd.mp4
1,ojdrqflslr.mp4,FAKE,train,yaivbfubpr.mp4
2,dewhpxurks.mp4,FAKE,train,gakrxuhtfw.mp4
3,hcyyjbngyp.mp4,FAKE,train,zjcicdzech.mp4
4,jeejkkargi.mp4,FAKE,train,khybkobcmj.mp4
...,...,...,...,...
3689,vlrkfbqajs.mp4,REAL,train,
3690,mdampijxqe.mp4,REAL,train,
3691,plboestduy.mp4,REAL,train,
3692,radyyavwaa.mp4,FAKE,train,galckcdeka.mp4


In [5]:
# Part 3
part3_path = '../../data/dfdc_train_part_8/metadata.json'
part3_df = load_metadata(part3_path)
part3_df

Unnamed: 0,video_filename,label,split,original
0,laihwpwiee.mp4,FAKE,train,aeysrmfmht.mp4
1,kzkqjyioqf.mp4,FAKE,train,ighgptshjh.mp4
2,gbfhqkesql.mp4,FAKE,train,dplfintlwc.mp4
3,ahrigbsxpt.mp4,FAKE,train,nonwqkyict.mp4
4,mwifzhcxkx.mp4,FAKE,train,vqybdfadtl.mp4
...,...,...,...,...
2468,vbyjwspnie.mp4,REAL,train,
2469,suleggscox.mp4,REAL,train,
2470,qytjmqkfhh.mp4,FAKE,train,vbyjwspnie.mp4
2471,oavrralgxd.mp4,FAKE,train,mbxblocpyw.mp4


In [7]:
# Part 4
part4_path = '../../data/dfdc_train_part_4/metadata.json'
part4_df = load_metadata(part4_path)
part4_df

Unnamed: 0,video_filename,label,split,original
0,oyixebfpcl.mp4,FAKE,train,bgpoldvzrh.mp4
1,sayyjwtjol.mp4,REAL,train,
2,qxbnmwwgzp.mp4,FAKE,train,kmyrnchata.mp4
3,puudvugxrl.mp4,REAL,train,
4,shclgsfxtj.mp4,FAKE,train,bgpoldvzrh.mp4
...,...,...,...,...
1696,midpeunjaz.mp4,FAKE,train,xjabnijtoe.mp4
1697,cdawymhaak.mp4,FAKE,train,xxbunqmupn.mp4
1698,kyxmtfyceu.mp4,FAKE,train,hvmcslpaeu.mp4
1699,aahncigwte.mp4,REAL,train,


In [3]:
def append_full_path(df, directory_path):
    """
    Adds the full path to each video based on 'video_filename'.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'video_filename' column.
    directory_path (str): Directory path where the videos are located.

    Returns:
    pd.DataFrame: DataFrame with a new 'full_path' column.
    """
    # Normalize the directory path
    directory_path = os.path.normpath(directory_path)

    # Add the full path to each video based on 'video_filename'
    df['full_path'] = df['video_filename'].apply(lambda x: os.path.normpath(os.path.join(directory_path, x)))
    return df

In [17]:
part0_df = append_full_path(part0_df, "../../data/dfdc_train_part_11")
part0_df

Unnamed: 0,video_filename,label,split,original,full_path
0,qgqsgtekwl.mp4,FAKE,train,ztutsnlhtr.mp4,..\..\data\dfdc_train_part_11\qgqsgtekwl.mp4
1,bahpguunin.mp4,FAKE,train,frczmdfzza.mp4,..\..\data\dfdc_train_part_11\bahpguunin.mp4
2,qekttcqtpm.mp4,FAKE,train,zesnphumru.mp4,..\..\data\dfdc_train_part_11\qekttcqtpm.mp4
3,zxuqoykuqj.mp4,FAKE,train,dovdrtvmbx.mp4,..\..\data\dfdc_train_part_11\zxuqoykuqj.mp4
4,ztfjilznzu.mp4,REAL,train,,..\..\data\dfdc_train_part_11\ztfjilznzu.mp4
...,...,...,...,...,...
2113,nrsqnrcidv.mp4,FAKE,train,thfzywypts.mp4,..\..\data\dfdc_train_part_11\nrsqnrcidv.mp4
2114,hhmflhhbih.mp4,FAKE,train,jbgklqgife.mp4,..\..\data\dfdc_train_part_11\hhmflhhbih.mp4
2115,lkgamwbygt.mp4,FAKE,train,eipwxqahgp.mp4,..\..\data\dfdc_train_part_11\lkgamwbygt.mp4
2116,rgmllhylbj.mp4,FAKE,train,usnyozppis.mp4,..\..\data\dfdc_train_part_11\rgmllhylbj.mp4


In [18]:
part1_df = append_full_path(part1_df, "../../data/dfdc_train_part_12")
part1_df

Unnamed: 0,video_filename,label,split,original,full_path
0,ufucquemsr.mp4,FAKE,train,ioxuwhhvge.mp4,..\..\data\dfdc_train_part_12\ufucquemsr.mp4
1,hzpeccyres.mp4,FAKE,train,guupwbnbzc.mp4,..\..\data\dfdc_train_part_12\hzpeccyres.mp4
2,qlrzbtcbbh.mp4,FAKE,train,erufbzxadr.mp4,..\..\data\dfdc_train_part_12\qlrzbtcbbh.mp4
3,voztvddhnf.mp4,FAKE,train,pejtahlmir.mp4,..\..\data\dfdc_train_part_12\voztvddhnf.mp4
4,hollmfvplq.mp4,FAKE,train,mpbvaumtlo.mp4,..\..\data\dfdc_train_part_12\hollmfvplq.mp4
...,...,...,...,...,...
2220,zgvjfpzubp.mp4,REAL,train,,..\..\data\dfdc_train_part_12\zgvjfpzubp.mp4
2221,pnxmphjyyo.mp4,REAL,train,,..\..\data\dfdc_train_part_12\pnxmphjyyo.mp4
2222,oathlhqdnz.mp4,FAKE,train,wcdmpwfooh.mp4,..\..\data\dfdc_train_part_12\oathlhqdnz.mp4
2223,oexoavedkb.mp4,FAKE,train,zgvjfpzubp.mp4,..\..\data\dfdc_train_part_12\oexoavedkb.mp4


In [19]:
part2_df = append_full_path(part2_df, "../../data/dfdc_train_part_13")
part2_df

Unnamed: 0,video_filename,label,split,original,full_path
0,wxiboojdnq.mp4,FAKE,train,vncfwkhxmd.mp4,..\..\data\dfdc_train_part_13\wxiboojdnq.mp4
1,ojdrqflslr.mp4,FAKE,train,yaivbfubpr.mp4,..\..\data\dfdc_train_part_13\ojdrqflslr.mp4
2,dewhpxurks.mp4,FAKE,train,gakrxuhtfw.mp4,..\..\data\dfdc_train_part_13\dewhpxurks.mp4
3,hcyyjbngyp.mp4,FAKE,train,zjcicdzech.mp4,..\..\data\dfdc_train_part_13\hcyyjbngyp.mp4
4,jeejkkargi.mp4,FAKE,train,khybkobcmj.mp4,..\..\data\dfdc_train_part_13\jeejkkargi.mp4
...,...,...,...,...,...
3689,vlrkfbqajs.mp4,REAL,train,,..\..\data\dfdc_train_part_13\vlrkfbqajs.mp4
3690,mdampijxqe.mp4,REAL,train,,..\..\data\dfdc_train_part_13\mdampijxqe.mp4
3691,plboestduy.mp4,REAL,train,,..\..\data\dfdc_train_part_13\plboestduy.mp4
3692,radyyavwaa.mp4,FAKE,train,galckcdeka.mp4,..\..\data\dfdc_train_part_13\radyyavwaa.mp4


In [12]:
part3_df = append_full_path(part3_df, "../../data/dfdc_train_part_3")
part3_df

Unnamed: 0,video_filename,label,split,original,full_path
0,vngueqqcmz.mp4,FAKE,train,ghfnznqrvh.mp4,..\..\data\dfdc_train_part_3\vngueqqcmz.mp4
1,ieblyetiob.mp4,FAKE,train,tsvhwitvah.mp4,..\..\data\dfdc_train_part_3\ieblyetiob.mp4
2,dnrpknwija.mp4,FAKE,train,ezfrcspzxw.mp4,..\..\data\dfdc_train_part_3\dnrpknwija.mp4
3,zznanrtksj.mp4,FAKE,train,rozrjmrmeo.mp4,..\..\data\dfdc_train_part_3\zznanrtksj.mp4
4,vdwoazrodt.mp4,FAKE,train,kytwyrusew.mp4,..\..\data\dfdc_train_part_3\vdwoazrodt.mp4
...,...,...,...,...,...
1450,hexyjykhvk.mp4,REAL,train,,..\..\data\dfdc_train_part_3\hexyjykhvk.mp4
1451,twgobosowv.mp4,REAL,train,,..\..\data\dfdc_train_part_3\twgobosowv.mp4
1452,ysvogisftd.mp4,FAKE,train,oikhytmnol.mp4,..\..\data\dfdc_train_part_3\ysvogisftd.mp4
1453,hwaxcxpfxl.mp4,REAL,train,,..\..\data\dfdc_train_part_3\hwaxcxpfxl.mp4


In [13]:
part4_df = append_full_path(part4_df, "../../data/dfdc_train_part_4")
part4_df

Unnamed: 0,video_filename,label,split,original,full_path
0,oyixebfpcl.mp4,FAKE,train,bgpoldvzrh.mp4,..\..\data\dfdc_train_part_4\oyixebfpcl.mp4
1,sayyjwtjol.mp4,REAL,train,,..\..\data\dfdc_train_part_4\sayyjwtjol.mp4
2,qxbnmwwgzp.mp4,FAKE,train,kmyrnchata.mp4,..\..\data\dfdc_train_part_4\qxbnmwwgzp.mp4
3,puudvugxrl.mp4,REAL,train,,..\..\data\dfdc_train_part_4\puudvugxrl.mp4
4,shclgsfxtj.mp4,FAKE,train,bgpoldvzrh.mp4,..\..\data\dfdc_train_part_4\shclgsfxtj.mp4
...,...,...,...,...,...
1696,midpeunjaz.mp4,FAKE,train,xjabnijtoe.mp4,..\..\data\dfdc_train_part_4\midpeunjaz.mp4
1697,cdawymhaak.mp4,FAKE,train,xxbunqmupn.mp4,..\..\data\dfdc_train_part_4\cdawymhaak.mp4
1698,kyxmtfyceu.mp4,FAKE,train,hvmcslpaeu.mp4,..\..\data\dfdc_train_part_4\kyxmtfyceu.mp4
1699,aahncigwte.mp4,REAL,train,,..\..\data\dfdc_train_part_4\aahncigwte.mp4


In [10]:
def create_balanced_dataset(dfs, dirs, random_state=None, print_stats=True):
    """
    Creates a balanced dataset by keeping all real videos and selecting the same number 
    of fake videos randomly across multiple input DataFrames. Optionally prints dataset statistics.

    Parameters:
    dfs (list of pd.DataFrame): List of input DataFrames.
    dirs (list of str): List of directory paths corresponding to the DataFrames.
    random_state (int, optional): Seed for reproducibility.
    print_stats (bool, optional): If True, prints label counts and total size.

    Returns:
    pd.DataFrame: A new balanced DataFrame with equal number of real and fake videos.
    """
    if random_state is not None:
        seed(random_state)

    # Concatenate all DataFrames into one
    combined_df = pd.concat(dfs, ignore_index=True)

    # Separate REAL and FAKE videos
    real_videos = combined_df[combined_df['label'] == 'REAL']
    fake_videos = combined_df[combined_df['label'] == 'FAKE']

    # Number of real videos to use for balancing with fake videos
    num_real = len(real_videos)

    # Randomly select the same number of fake videos
    fake_videos_sampled = fake_videos.sample(n=num_real, random_state=random_state)

    # Combine the real and sampled fake videos into a balanced DataFrame
    balanced_df = pd.concat([real_videos, fake_videos_sampled], ignore_index=True)

    # Shuffle the balanced dataset for randomness
    balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Helper function to calculate video size based on the appropriate directory
    def get_video_size(row):
        video_filename = row['video_filename']
        # Determine the correct directory based on the original DataFrame index range
        for i, df in enumerate(dfs):
            if row.name < len(df):
                full_path = os.path.join(dirs[i], video_filename)
                break
        return os.path.getsize(full_path) if os.path.exists(full_path) else 0

    if print_stats:
        # Calculate total size of videos in the balanced DataFrame
        balanced_df['video_size_bytes'] = balanced_df.apply(get_video_size, axis=1)
        total_size_bytes = balanced_df['video_size_bytes'].sum()

        # Print statistics
        label_counts = balanced_df['label'].value_counts()
        print("Label Counts for Balanced DataFrame:")
        print(label_counts.to_string(), "\n")

        # Convert total size to GB
        total_size_gb = total_size_bytes / (1024 ** 3)  # Convert bytes to GB
        print(f"Total size of Balanced DataFrame: {total_size_gb:.2f} GB\n")

    return balanced_df

In [20]:
# dfs = [part0_df, part1_df, part2_df, part3_df, part4_df]
# dirs = ["../../data/dfdc_train_part_0", "../../data/dfdc_train_part_1", "../../data/dfdc_train_part_2", "../../data/dfdc_train_part_3", "../../data/dfdc_train_part_4"]
dfs = [part0_df, part1_df, part2_df]
dirs = ["../../data/dfdc_train_part_0", "../../data/dfdc_train_part_1", "../../data/dfdc_train_part_2"]
balanced_df = create_balanced_dataset(dfs, dirs, random_state=42)

Label Counts for Balanced DataFrame:
label
FAKE    1109
REAL    1109 

Total size of Balanced DataFrame: 0.00 GB



In [21]:
def move_to_balanced_folder_and_create_metadata(balanced_df, destination_dir, metadata_file='metadata.json'):
    """
    Moves video files from the balanced DataFrame to a specified destination directory and creates a metadata JSON file.

    Parameters:
    balanced_df (pd.DataFrame): DataFrame containing video file paths to be moved.
    destination_dir (str): Path to the destination directory where files should be moved.
    metadata_file (str): Name of the metadata JSON file to be created.
    """
    # Create destination directory if it doesn't exist
    os.makedirs(destination_dir, exist_ok=True)

    # Prepare the metadata DataFrame
    metadata_df = balanced_df[['video_filename', 'label', 'original']].copy()

    # Create the metadata JSON file
    metadata_path = os.path.join(destination_dir, metadata_file)
    metadata_df.to_json(metadata_path, orient='records', lines=True)

    print(f"Metadata JSON file created at: {metadata_path}")

    # Move files to the new location
    for _, row in balanced_df.iterrows():
        video_full_path = row['full_path']  # Assuming this already contains the full path

        if os.path.exists(video_full_path):  # Check if the file exists
            # Get the filename
            filename = os.path.basename(video_full_path)
            # Define the new destination path
            new_location = os.path.join(destination_dir, filename)

            # Move the file to the new location
            shutil.move(video_full_path, new_location)
            print(f"Moved: {video_full_path} -> {new_location}")
        else:
            print(f"File not found: {video_full_path}")

In [22]:
destination_folder = "../../data/balanced_dataset"
move_to_balanced_folder_and_create_metadata(balanced_df, destination_folder)

Metadata JSON file created at: ../../data/balanced_dataset\metadata.json
Moved: ..\..\data\dfdc_train_part_12\lbipvzwlut.mp4 -> ../../data/balanced_dataset\lbipvzwlut.mp4
Moved: ..\..\data\dfdc_train_part_12\zverzcapzv.mp4 -> ../../data/balanced_dataset\zverzcapzv.mp4
Moved: ..\..\data\dfdc_train_part_11\grbtvvjvhz.mp4 -> ../../data/balanced_dataset\grbtvvjvhz.mp4
Moved: ..\..\data\dfdc_train_part_12\iylqacirzy.mp4 -> ../../data/balanced_dataset\iylqacirzy.mp4
Moved: ..\..\data\dfdc_train_part_12\djylanzkqq.mp4 -> ../../data/balanced_dataset\djylanzkqq.mp4
Moved: ..\..\data\dfdc_train_part_13\dvseunekna.mp4 -> ../../data/balanced_dataset\dvseunekna.mp4
Moved: ..\..\data\dfdc_train_part_11\fmbmlagoiq.mp4 -> ../../data/balanced_dataset\fmbmlagoiq.mp4
Moved: ..\..\data\dfdc_train_part_12\ffsfxbzlwg.mp4 -> ../../data/balanced_dataset\ffsfxbzlwg.mp4
Moved: ..\..\data\dfdc_train_part_11\nwndlagvxh.mp4 -> ../../data/balanced_dataset\nwndlagvxh.mp4
Moved: ..\..\data\dfdc_train_part_12\eaqxklix