## Step 1: Set Up the Dataset

In [1]:
import zipfile
import os
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

# Path to the zip file
zip_file_path = 'train_videos.zip'

# Directory where you want to extract the files
extract_dir = 'train_videos'

# Extract when the target directory is missing OR empty. An empty directory is
# what an earlier failed run (e.g. missing archive) can leave behind; checking
# only for existence would then skip extraction on every later run.
if not os.path.isdir(extract_dir) or not os.listdir(extract_dir):
    # Open the archive first so a missing/invalid zip fails before the
    # directory is created
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        os.makedirs(extract_dir, exist_ok=True)

        # Extract all the contents into the specified directory
        zip_ref.extractall(extract_dir)

    print(f'Files extracted to {extract_dir}')

Files extracted to train_videos


In [2]:
# create the dataset directory tree to store data in yolo format
train_img_dir = 'dataset/images/train'
train_label_dir = 'dataset/labels/train'

val_img_dir = 'dataset/images/val'
val_label_dir = 'dataset/labels/val'

# os.makedirs(..., exist_ok=True) creates any missing parents and is a no-op
# for directories that already exist, so this cell is safe to re-run and also
# repairs a partially created tree.
for _dataset_dir in (train_img_dir, train_label_dir, val_img_dir, val_label_dir):
    os.makedirs(_dataset_dir, exist_ok=True)

In [3]:
def save_video_frames(video_name, output_dir, debug=False, skip_frame=5):
    """
    Extract every `skip_frame`-th frame of a video and save each one as a JPEG.

    Output files are named `<video file name without extension>-<frame index>.jpg`,
    where the frame index is the position of the frame in the source video
    (0, skip_frame, 2 * skip_frame, ...).

    Args:
    - video_name (str): Path to the video file.
    - output_dir (str): Directory where the extracted frames will be saved
      (created if it does not exist).
    - debug (bool): If True, print every saved filename and a completion message.
    - skip_frame (int): Keep one frame out of every `skip_frame` frames. Must be >= 1.

    Returns:
    - None. If the video cannot be opened an error is printed and nothing is saved.

    Raises:
    - ValueError: If `skip_frame` is smaller than 1.
    """
    if skip_frame < 1:
        raise ValueError(f"skip_frame must be >= 1, got {skip_frame}")

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Capture the video using OpenCV
    video_capture = cv2.VideoCapture(video_name)
    try:
        if not video_capture.isOpened():
            print(f"Error: Unable to open video file {video_name}")
            return

        # Loop-invariant part of every output filename
        video_stem = os.path.splitext(os.path.basename(video_name))[0]
        frame_number = 0

        while True:
            # Read the next frame; ret is False once the stream is exhausted
            ret, frame = video_capture.read()
            if not ret:
                break

            # Only every skip_frame-th frame is written to disk
            if frame_number % skip_frame == 0:
                output_filename = os.path.join(output_dir, f"{video_stem}-{frame_number}.jpg")

                # cv2.imwrite reports failure through its return value
                if not cv2.imwrite(output_filename, frame):
                    print(f"Warning: Failed to write {output_filename}")
                elif debug:
                    print(f"Saved: {output_filename}")

            frame_number += 1
    finally:
        # Release the video capture object even if reading/writing raised
        video_capture.release()

    if debug:
        print("Finished saving frames.")

In [4]:
def process_videos(source_dir, dest_dir):
    """
    Extract frames from every entry of a directory, alternating the output
    between a validation folder and a training folder.

    Entries are visited in sorted name order: entries at even positions go to
    `<dest_dir>/val`, entries at odd positions go to `<dest_dir>/train`.

    Args:
    - source_dir (str): Path to the directory containing the video files.
    - dest_dir (str): Directory that will hold the `train` and `val` frame folders.
    """
    # Sorted listing of the source directory
    entries = sorted(os.listdir(source_dir))

    # Make sure both split folders exist under the destination
    train_dir, val_dir = (os.path.join(dest_dir, split) for split in ('train', 'val'))
    for split_dir in (train_dir, val_dir):
        os.makedirs(split_dir, exist_ok=True)

    # Indexed by position parity: 0 -> val, 1 -> train
    targets = (val_dir, train_dir)

    # Walk the entries behind a tqdm progress bar
    progress = tqdm(enumerate(entries), total=len(entries), desc="Processing Videos")
    for position, entry in progress:
        # Extract and save the frames of this video into its split folder
        save_video_frames(os.path.join(source_dir, entry), targets[position % 2])

In [5]:
# Start processing the videos: extract frames from every entry under
# train_videos/train_videos into dataset/images/val and dataset/images/train
# (process_videos alternates entries between the two folders).
process_videos('train_videos/train_videos', 'dataset/images')

Processing Videos: 100%|█████████████████████████████████████████████| 77/77 [05:31<00:00,  4.30s/it]


In [6]:
def get_video_resolution(video_path):
    """
    Read the frame size of a video through OpenCV.

    Args:
    - video_path (str): Path to the video file.

    Returns:
    - tuple[int, int] | None: `(width, height)` truncated to integers, or None
      when the video cannot be opened.
    """
    capture = cv2.VideoCapture(video_path)
    if capture.isOpened():
        # Query width first, then height; OpenCV reports both as floats
        size = tuple(
            int(capture.get(prop))
            for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
        )
        capture.release()
        return size
    return None

def get_videos_resolutions(directory):
    """
    Map every video file in a directory to its frame resolution.

    Keys are the file names with only the final extension removed
    (`os.path.splitext`), which is the same rule `save_video_frames` uses to
    name extracted frames. A name containing extra dots, e.g. `clip.v2.mp4`,
    is therefore stored under `clip.v2` rather than being truncated to `clip`.

    Args:
    - directory (str): Directory to scan (not recursive).

    Returns:
    - dict: `{video name without extension: get_video_resolution(path)}` for
      every entry whose name ends with a known video extension
      (case-insensitive). The value is None for videos that cannot be opened.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.mpg', '.mpeg')
    video_resolutions = {}

    # sorted() makes the insertion order (and the winner when two files share
    # a stem) independent of the filesystem's listing order
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(video_extensions):
            continue
        stem = os.path.splitext(filename)[0]
        video_resolutions[stem] = get_video_resolution(os.path.join(directory, filename))

    return video_resolutions

# Get resolutions for each video. The result is kept in the module-level
# `resolutions` dict, which convert_label looks up by video name to normalize
# box coordinates.
directory = 'train_videos/train_videos'
resolutions = get_videos_resolutions(directory)

In [7]:
def _to_yolo_box(x1, y1, w, h, w_max, h_max):
    """Convert a top-left pixel box (x1, y1, w, h) to normalized (xc, yc, w, h)."""
    return (x1 + (w / 2)) / w_max, (y1 + (h / 2)) / h_max, w / w_max, h / h_max


def convert_label(img_dir, label_dir, source_label_dir, debug=False):
    """
    Converts bounding box labels from a custom format to the YOLO format.

    For every `<video name>-<frame index>.jpg` in `img_dir`, the line at
    position `frame index` of `<source_label_dir>/<video name>.txt` is read.
    The first two fields of that line are skipped (frame index and number of
    objects); the remaining fields are consumed in groups of five as
    `x1 y1 w h cls`. Each box is written to `<label_dir>/<image stem>.txt` as
    `0 xc yc w h`, normalized by the video resolution. The source class value
    is ignored because the dataset has a single class.

    Relies on the module-level `resolutions` dict
    (`{video name: (width, height)}`) being defined before the call.

    Args:
        img_dir (str): Directory containing image files. Entries without a
            `.jpg` extension are skipped.
        label_dir (str): Directory where converted YOLO labels will be saved.
        source_label_dir (str): Directory containing original label files.
        debug (bool): If True, displays debug information and bounding box visualization.

    Raises:
        IndexError: If a label file has no line for the requested frame index.
        KeyError: If the video name is missing from `resolutions`.
        ValueError: If an image name or a label line does not match the
            expected layout.
    """
    # Annotation lines per video, so each source file is read once instead of
    # once per extracted frame
    lines_by_video = {}

    for image_name in sorted(os.listdir(img_dir)):
        # Strip only the real extension; skip stray non-image entries
        stem, ext = os.path.splitext(image_name)
        if ext.lower() != '.jpg':
            continue

        # full path (for debugging)
        full_path = os.path.join(img_dir, image_name)

        # get video name and frame index (split on the LAST dash, the video
        # name itself may contain dashes)
        split = stem.rfind('-')
        video_name = stem[:split]
        frame_idx = int(stem[split + 1:])

        # read original label (cached per video)
        label_name = os.path.join(source_label_dir, video_name + '.txt')
        if video_name not in lines_by_video:
            with open(label_name, 'r') as file:
                lines_by_video[video_name] = file.readlines()
        lines = lines_by_video[video_name]

        if frame_idx >= len(lines):
            raise IndexError(
                f"{label_name} has {len(lines)} lines, cannot read frame {frame_idx}"
            )

        line = lines[frame_idx].split()
        labels = line[2:]  # 1st one is frame index, 2nd is number of objects
        labels = [labels[i:i + 5] for i in range(0, len(labels), 5)]

        # get the video resolution
        resolution = resolutions[video_name]
        w_max, h_max = resolution

        # debug log
        if debug:
            print(stem)
            print(video_name)
            print(label_name)
            print(line)
            print(labels)
            print(resolution)
            # OpenCV loads BGR while matplotlib displays RGB: convert so the
            # preview shows the true colors
            image = cv2.cvtColor(cv2.imread(full_path), cv2.COLOR_BGR2RGB)

        # process the label
        new_labels = []
        for label in labels:
            # get item and convert to int (source class is intentionally unused)
            x1, y1, w, h, _source_cls = label
            x1, y1, w, h = int(x1), int(y1), int(w), int(h)

            # convert to yolo format
            xc, yc, w, h = _to_yolo_box(x1, y1, w, h, w_max, h_max)
            cls = 0  # only one class

            # debug: map the normalized box back to pixels and draw it
            if debug:
                debug_x1 = int((xc - (w / 2)) * w_max)
                debug_y1 = int((yc - (h / 2)) * h_max)
                debug_x2 = int((xc + (w / 2)) * w_max)
                debug_y2 = int((yc + (h / 2)) * h_max)
                cv2.rectangle(image, (debug_x1, debug_y1), (debug_x2, debug_y2), (255, 0, 0), 2)

            new_labels.append(f'{cls} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}')

        if debug:
            print(new_labels)
            plt.imshow(image)
            plt.show()

        # Write one YOLO line per box (an empty file when the frame has no boxes)
        label_path = os.path.join(label_dir, stem + '.txt')
        with open(label_path, "w") as file:
            file.writelines(new_label + "\n" for new_label in new_labels)

In [8]:
# Write YOLO label files for the train and val image folders, reading the
# original labels from the 'annotations' directory.
convert_label(train_img_dir, train_label_dir, source_label_dir='annotations', debug=False)
convert_label(val_img_dir, val_label_dir, source_label_dir='annotations', debug=False)

In [9]:
# Build the YOLO dataset description and save it as dataset/data.yaml

data = """
path: ../dataset # dataset root dir
train: images/train # train images (relative to 'path')
val: images/val # val images (relative to 'path')

# Classes (only one class)
names:
    0: drone
"""

# Drop the surrounding blank lines, then emit every row followed by a newline
with open('dataset/data.yaml', 'w') as yaml_file:
    yaml_file.writelines(f'{row}\n' for row in data.strip().split('\n'))

print("Data has been written to dataset/data.yaml")


Data has been written to dataset/data.yaml
