# Fun demos :)

##### And possible ideas for a future project / thesis

#### Detect Faces

In [2]:
import cv2

# Load image and convert to grayscale
img = cv2.imread("/Users/merterol/Desktop/PCL Tutorial Demos/Tutorial 12/trump_happy.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Load Haar cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# Detect faces
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

# Draw rectangles around faces
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)

# Show result
cv2.imshow("Face Detection", img)
cv2.waitKey(0)
cv2.destroyAllWindows()


#### Detect Emotion in faces

In [None]:
import cv2
from fer import FER

# Load image
img = cv2.imread("/Users/merterol/Desktop/PCL Tutorial Demos/Tutorial 12/trump_sad.webp")

# Detect emotions
detector = FER(mtcnn=True)
results = detector.detect_emotions(img)

# Draw boxes and emotions
for face in results:
    (x, y, w, h) = face["box"]
    emotion, score = max(face["emotions"].items(), key=lambda item: item[1])
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(img, f"{emotion} ({score:.2f})", (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

# Show result
cv2.imshow("Emotion Detection", img)
cv2.waitKey(0)
cv2.destroyAllWindows()


2025-05-22 21:32:22.263448: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


: 

#### Draw with your voice

In [3]:
import speech_recognition as sr
from PIL import Image, ImageDraw
import random

# Create a blank canvas
img = Image.new("RGB", (400, 400), "white")
draw = ImageDraw.Draw(img)

# Define shape drawing based on voice input
def draw_shape(command):
    x, y = random.randint(50, 350), random.randint(50, 350)
    size = 50
    if "circle" in command:
        draw.ellipse([x-size, y-size, x+size, y+size], fill="blue")
    elif "square" in command:
        draw.rectangle([x-size, y-size, x+size, y+size], fill="red")
    elif "triangle" in command:
        draw.polygon([(x, y-size), (x-size, y+size), (x+size, y+size)], fill="green")

# Use speech recognition to get a command
recognizer = sr.Recognizer()
mic = sr.Microphone()

print("Say a shape (circle, square, triangle)...")
with mic as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

try:
    command = recognizer.recognize_google(audio)
    print("You said:", command)
    draw_shape(command.lower())
    img.show()
except Exception as e:
    print("Error:", e)

Say a shape (circle, square, triangle)...
You said: Circle


#### Mood detector

In [None]:
import cv2
from fer import FER

# Load webcam and emotion detector
cap = cv2.VideoCapture(0)
detector = FER(mtcnn=True)

print("Press ESC to exit.")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Detect emotions
    result = detector.detect_emotions(frame)
    for face in result:
        (x, y, w, h) = face["box"]
        emotion, score = max(face["emotions"].items(), key=lambda item: item[1])
        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
        cv2.putText(frame, f"{emotion} ({score:.2f})", (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

    cv2.imshow("Mood Detector", frame)
    if cv2.waitKey(1) & 0xFF == 27:
        break

cap.release()
#cv2.destroyAllWindows()


Press ESC to exit.


: 

#### Video to ASCII

In [1]:
import cv2
import os

ASCII_CHARS = "@%#*+=-:. "
WIDTH = 100

def frame_to_ascii(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape
    aspect_ratio = height / width
    new_height = int(WIDTH * aspect_ratio * 0.55)
    resized = cv2.resize(gray, (WIDTH, new_height))

    ascii_str = ""
    for row in resized:
        for pixel in row:
            ascii_str += ASCII_CHARS[pixel * len(ASCII_CHARS) // 256]
        ascii_str += "\n"
    return ascii_str

cap = cv2.VideoCapture("/Users/merterol/Desktop/PCL Tutorial Demos/Tutorial 12/Lecture/me-at-the-zoo.mp4")
os.system('cls' if os.name == 'nt' else 'clear')

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    ascii_frame = frame_to_ascii(frame)
    print("\033c", end="")  # clear screen
    print(ascii_frame)
    if cv2.waitKey(50) & 0xFF == ord("q"):
        break

cap.release()


[H[2Jc######*******++++***++=#+*+*#+#*+=::.*:..     .   ...  *  =++==+:--..  +:= .       :-:++-=-++=+.:.  
#*#*********++=+=*++++=*=   +-  ==#%#**=: .%*   -+++:. #+**#**#+**-:.:=*+=+.      .:--++++++++.:+*= 
*******++**+++====*=++=*=   +-  -= -+==***#*+-::=++==---:-*++##**=-=*****=-+  .  ..--=--::++-++=:-:-
******+++++##+====*+=+=*=.  +- :===-+-=++:-+-+*=+*+-::-=--=-+-+#*+--+****-=*==  .:+.=-*-. :.-..#=-  
####*#+++++++++---=#===*=+. +-+*+==++==*+++++**+===+=%%#+:.-+=-----=*+****+-::-==::++**-:-:--:-     
*******++%%*===*=-===-=*=+=++=+*+==*+-++++*-.:::..:==*%##%@@%++::-==##+==#+#=--=+==+++++=====---==..
************=*=+#=*==-=*=+  +=+++=+++=+=+==+****=++::==:.:::::**-%#=***#+#%#=--+-+==++++***++++=====
#*##******##*+=+=+==*=+*==..*==++==++=+*+=*%+*#%*##%%=-+=::..:..::-=******#+--=+*+++=++=+**********+
*****++++*+++**++++==+**==++====+++=+=+****##%%%%####*+*#*###++#%%%%@@@@@%%###+++#***+*+++++###*****
*****+++*+*+++*++=+===+*+++=====++***=+***###%%%##*#*#%#####+#*###%%%%@@@%%#%%**+*

KeyboardInterrupt: 

#### Pose estimation

In [None]:

import cv2
import mediapipe as mp

mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

cap = cv2.VideoCapture("/Users/merterol/Desktop/PCL Tutorial Demos/Tutorial 12/Lecture/me-at-the-zoo.mp4")

with mp_pose.Pose(min_detection_confidence=0.5,
            min_tracking_confidence=0.5) as pose:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert BGR to RGB and process
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = pose.process(image)

        # Draw pose annotation on the image.
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(
                image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        cv2.imshow("Pose Estimation", image)
        if cv2.waitKey(5) & 0xFF == 27:
            break

cap.release()
cv2.destroyAllWindows()


2025-05-22 21:12:31.787857: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1747941160.835920   55980 gl_context.cc:369] GL version: 2.1 (2.1 ATI-4.14.1), renderer: AMD Radeon Pro 580 OpenGL Engine
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1747941160.985461   56263 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1747941161.019012   56264 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1747941161.036778   56263 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supporte

: 

# Last years Exam Task

You want to train an automatic speech recognition system. For this, you are compiling a corpus
of short audio snippets and their corresponding transcribed text. You found and downloaded
some videos from YouTube that have subtitles available in SRT format. In this task, you will
write a CLI to extract the subtitle texts and the corresponding audio snippets and store them
in separate files.

#### Part 1: Extracting audio snippets

- In the SRT files, each subtitle has an associated start and end time. The extracted audio
snippets should be trimmed accordingly.
- The audio snippets should be saved as mono (!) WAV files. They should be named as
shown in Figure 3.
- The corresponding subtitles (i.e., just the transcribed texts) should be saved in text files
on a single line (!) with UTF-8 encoding. They should be named as shown in Figure 3.
- No additional output files should be created besides the WAV and text files.

#### Part 2: Command-line interface

The CLI call should look like this:
$ python extract.py examples/video1.mp4 examples/video1.srt --outdir output

- If --outdir is specified, the resulting WAV and text files should be saved in the specified
directory. If --outdir is not specified, they should be saved in the current working
directory.
- The names of the WAV and text files should be based on the input video file name

In [None]:
import os
import argparse
from moviepy.editor import VideoFileClip

def parse_srt(srt_file):  # Defines a function to parse the .srt subtitle file
    with open(srt_file, 'r', encoding='utf-8') as file:
        content = file.read().strip()  # Reads the entire .srt file and removes leading/trailing whitespace
            
    blocks = content.split('\n\n')  # Splits subtitles into blocks separated by blank lines
    subtitles = []  # List to store subtitle text lines
    time_periods = []  # List to store corresponding start and end times

    for block in blocks:  # Iterates through each subtitle block
        lines = block.split('\n')  # Splits a block into individual lines
        if len(lines) >= 3:  # Ensures the block has at least an index, timing, and text
            # Extract start and end time
            start_end = lines[1].split(' --> ')  # Splits the timing line into start and end timestamps
            start_time = start_end[0]  # Captures the start timestamp
            end_time = start_end[1]  # Captures the end timestamp
            
            # Extract subtitle text
            subtitle_text = ' '.join(lines[2:]).replace('\n', ' ').strip()  # Joins all subtitle lines into a single string
            subtitles.append(subtitle_text)  # Appends the text to the subtitles list
            time_periods.append((start_time, end_time))  # Appends the time tuple to the time_periods list
        
    return subtitles, time_periods  # Returns both lists for further processing


def save_subtitles(subtitles, base_filename, outdir):  # Defines a function to save each subtitle to a separate .txt file
    os.makedirs(outdir, exist_ok=True)  # Creates the output directory if it doesn't already exist
    for i, subtitle in enumerate(subtitles):  # Iterates through all subtitles with their index
        output_filename = os.path.join(outdir, f"{base_filename}_{i+1}.txt")  # Constructs an output filename for each subtitle
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(subtitle)  # Writes subtitle text into the file
        print(f"Saved: {output_filename}")  # Prints confirmation of saved file


def process_srt(input_file, outdir='.'):
    base_filename = os.path.splitext(os.path.basename(input_file))[0]  # Derives the base filename without extension
    subtitles, time_periods = parse_srt(input_file)  # Parses the .srt file into subtitles and time_periods lists
    # Save subtitles to individual text files
    save_subtitles(subtitles, base_filename, outdir)  # Calls save_subtitles to output .txt files
    return time_periods  # Returns the list of time tuples for audio extraction


def create_audio_clips(video_file, time_periods, output_dir, output_prefix):  # Defines a function to extract audio segments
    os.makedirs(output_dir, exist_ok=True)  # Ensures the audio output directory exists
    for idx, (start, end) in enumerate(time_periods):  # Iterates through each time period with its index
        clip = VideoFileClip(video_file).subclip(start, end)  # Subclips the video between specified start and end times
        audio_file = os.path.join(output_dir, f"{output_prefix}_{idx + 1}.wav")  # Constructs the output .wav filename
        clip.audio.write_audiofile(audio_file, codec='pcm_s16le')  # Exports the audio track as WAV with PCM codec
        print(f"Exported: {output_prefix}_{idx + 1}.wav")  # Prints confirmation of exported audio file


if __name__ == "__main__":  # Ensures this block runs only when script is executed directly
    parser = argparse.ArgumentParser(description='Extract audio clips and subtitles from video based on SRT file')  # Sets up argument parser
    parser.add_argument('video_file', type=str, help='Path to the video file')  # Adds required video file argument
    parser.add_argument('srt_file', type=str, help='Path to the SRT file')  # Adds required subtitle file argument
    parser.add_argument('--outdir', type=str, default='output', help='Output directory')  # Adds optional output directory argument
    parser.add_argument('--prefix', type=str, default='output', help='Output prefix for audio clips')  # Adds optional prefix for audio filenames
    args = parser.parse_args()  # Parses the provided command-line arguments

    time_periods = process_srt(args.srt_file, args.outdir)  # Processes the .srt file and saves subtitles
    create_audio_clips(args.video_file, time_periods, args.outdir, args.prefix)  # Extracts and saves audio clips based on time_periods
