In [1]:
# imports
import os, glob, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import cv2
import numpy as np
import librosa
import moviepy.editor as mp
from skimage import color
from skimage.feature import hog, local_binary_pattern, graycomatrix, graycoprops
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# config
# fixed seed: seeds numpy's global RNG here and is reused as random_state for the train/test split
SEED = 12332287
np.random.seed(SEED)

# internal class names used as labels; the order matters because get_label indexes into this list
CLASSES = ["MissPiggy", "OtherPigs", "SwedishChef", "Rowlf"]

# helpers
def list_files(directory):
    """return the paths of all entries directly inside `directory`, sorted ascending"""
    pattern = directory + "/*"
    entries = glob.glob(pattern)
    entries.sort()
    return entries

# column headers every ground-truth sheet must contain;
# read_ground_truth raises ValueError when any of them is missing
GT_REQUIRED = [
    "Video", "Frame_number", "Timestamp",
    "Kermit", "Pigs", "Miss Piggy", "Cook",
    "StatlerWaldorf", "Rowlf the Dog", "Fozzie Bear"
]

# ground-truth column header -> internal class name (the values are the entries of CLASSES)
GT_TO_INTERNAL = {
    "Miss Piggy": "MissPiggy",
    "Pigs": "OtherPigs",
    "Cook": "SwedishChef",
    "Rowlf the Dog": "Rowlf",
}

def read_ground_truth(gt_dir):
    """
    load every *.xlsx sheet in gt_dir into one table

    returns a df indexed by (Video, Frame_number), sorted by that index, with one
    integer 0/1 column per internal class name
    raises FileNotFoundError if gt_dir holds no xlsx file and ValueError if a
    sheet lacks one of the GT_REQUIRED columns
    """
    xlsx_paths = sorted(glob.glob(os.path.join(gt_dir, "*.xlsx")))
    if len(xlsx_paths) == 0:
        raise FileNotFoundError("no ground truth xlsx found in gt_dir")

    key_cols = ["Video", "Frame_number"]
    class_cols = list(GT_TO_INTERNAL.values())

    sheets = []
    for xlsx_path in xlsx_paths:
        sheet = pd.read_excel(xlsx_path)

        # every sheet must carry the full expected header set
        absent = [col for col in GT_REQUIRED if col not in sheet.columns]
        if absent:
            raise ValueError(f"{os.path.basename(xlsx_path)} missing columns: {absent}")

        # keys become nullable integers so unparsable cells turn into <NA>
        for key in key_cols:
            sheet[key] = pd.to_numeric(sheet[key], errors="coerce").astype("Int64")

        # copy each ground-truth column under its internal name, blanks count as 0
        for gt_col, internal in GT_TO_INTERNAL.items():
            sheet[internal] = pd.to_numeric(sheet[gt_col], errors="coerce").fillna(0).astype(int)

        sheets.append(sheet[key_cols + class_cols])

    # stack all sheets, drop rows without a usable key, then index by the key pair
    gt = (
        pd.concat(sheets, ignore_index=True)
        .dropna(subset=key_cols)
        .astype({"Video": int, "Frame_number": int})
        .set_index(key_cols)
        .sort_index()
    )

    # make sure every class column exists and is a plain int
    for cls in CLASSES:
        if cls not in gt.columns:
            gt[cls] = 0
        gt[cls] = gt[cls].astype(int)

    return gt

def get_label(video_id, frame_idx, gt_df, classes=None):
    """
    return the single class label annotated for one frame, or None

    video_id, frame_idx: the (Video, Frame_number) key to look up in gt_df
    gt_df: ground truth table indexed by (Video, Frame_number) with one 0/1
           column per class
    classes: column names to consider, in label order; defaults to CLASSES

    None is returned when the frame is not in gt_df or when the number of
    classes marked 1 is not exactly one.
    """
    if classes is None:
        classes = CLASSES

    try:
        row = gt_df.loc[(video_id, frame_idx)]
    except KeyError:
        return None

    flags = np.asarray(row[classes], dtype=int)

    # the table is concatenated from several sheets, so a key can occur more
    # than once; .loc then yields a DataFrame (2-D values). collapse the
    # duplicate rows per class instead of summing/argmax-ing the flattened
    # array, which could drop the frame or index past the end of `classes`
    if flags.ndim > 1:
        flags = flags.max(axis=0)

    # only accept frames where exactly one class is present
    if flags.sum() != 1:
        return None

    return classes[int(flags.argmax())]

def video_iter_frames(path):
    """
    generator over the frames of a video file

    yields (frame_index, frame, fps, total_frames) for every frame that can be
    read, where fps and total_frames are the values reported by the capture
    raises RuntimeError if the video cannot be opened
    """
    capture = cv2.VideoCapture(path)
    if not capture.isOpened():
        raise RuntimeError(f"cannot open video: {path}")

    # reported once up front and repeated with every yielded frame
    fps = capture.get(cv2.CAP_PROP_FPS)
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

    try:
        frame_index = 0
        ok, frame = capture.read()
        while ok:
            yield frame_index, frame, fps, total_frames
            frame_index += 1
            ok, frame = capture.read()
    finally:
        # runs on exhaustion, on error and when the consumer stops early
        capture.release()

def video_to_audio(video_path, out_dir="data/audio"):
    """
    extract the audio track of a video and save it as a 16-bit PCM wav file

    video_path: path of the video file
    out_dir: folder for the wav file, created if it does not exist

    returns the path of the written wav file (<out_dir>/<video stem>.wav)
    raises ValueError if the video has no audio track
    """
    # swap the extension via splitext so any container (not only .avi) gets a
    # .wav name, and a ".avi" elsewhere in the file name is left untouched
    stem = os.path.splitext(os.path.basename(video_path))[0]
    os.makedirs(out_dir, exist_ok=True)
    audio_out = os.path.join(out_dir, stem + ".wav")

    # load video file and extract audio
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"video has no audio track: {video_path}")
        video.audio.write_audiofile(audio_out, codec="pcm_s16le")
    finally:
        # release the reader even when writing fails
        video.close()

    return audio_out

In [2]:
# load the ground truth sheets into one table indexed by (Video, Frame_number)
gt_df = read_ground_truth("./data/ground_truth")

In [3]:
# separating audio from video
# every file found in ./data/episodes is handed to video_to_audio, which writes one wav per video
video_paths = list_files("./data/episodes")
for v in video_paths:
    video_to_audio(v)

MoviePy - Writing audio in data/audio/211.wav


                                                                        

MoviePy - Done.
MoviePy - Writing audio in data/audio/244.wav


                                                                        

MoviePy - Done.
MoviePy - Writing audio in data/audio/343.wav


                                                                        

MoviePy - Done.




### Visual Feature Extraction

In [4]:
def extract_hog_frame(gray_frame):
    """
    HOG descriptor of one grayscale frame

    uses 9 orientation bins, 8x8 pixel cells and 2x2 cell blocks and returns
    the flattened feature vector produced by skimage's hog
    """
    return hog(
        gray_frame,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
    )


def extract_lbp_frame(gray_frame, P=8, R=1):
    """
    LBP histogram of one grayscale frame

    P: number of neighbours, R: radius, both passed to local_binary_pattern
    returns a density-normalised histogram with P + 2 bins
    """
    # per-pixel codes from the "uniform" LBP variant
    codes = local_binary_pattern(gray_frame, P, R, method="uniform")

    # one bin per possible code value 0 .. P+1
    n_bins = P + 2
    histogram = np.histogram(
        codes.ravel(), bins=n_bins, range=(0, n_bins), density=True
    )[0]
    return histogram


def extract_glcm_frame(gray_frame):
    """
    GLCM texture features of one grayscale frame

    returns a 2-element array: [contrast, homogeneity]
    """
    # symmetric, normalised co-occurrence matrix for distance 1 and angle 0
    glcm = graycomatrix(gray_frame, [1], [0], symmetric=True, normed=True)

    # one distance and one angle were requested, so each property sits at [0, 0]
    return np.array(
        [graycoprops(glcm, prop)[0, 0] for prop in ("contrast", "homogeneity")]
    )


def extract_sift_frame(gray_frame):
    """
    SIFT feature of one grayscale frame

    all keypoint descriptors are averaged into one fixed-size vector of
    length 128; a zero vector is returned when no descriptor is available
    """
    # build the detector once and keep it on the function object, this is
    # called for every labelled frame
    detector = getattr(extract_sift_frame, "_detector", None)
    if detector is None:
        detector = cv2.SIFT_create()
        extract_sift_frame._detector = detector

    # detect keypoints and compute descriptors
    _, descriptors = detector.detectAndCompute(gray_frame, None)

    # no descriptors: None, or an empty array whose mean would be NaN
    if descriptors is None or len(descriptors) == 0:
        return np.zeros(128)   # sift descriptor size is always 128

    # average all descriptors to get a fixed-size feature vector
    return descriptors.mean(axis=0)


### Audio Feature Extraction

In [5]:
def extract_audio_features_vectorized(audio_path, fps, total_frames, n_mfcc=20):
    """
    compute audio features for a whole track and align them to video frames

    audio_path: audio file to load (kept at its native sampling rate)
    fps: frame rate of the matching video, must be > 0
    total_frames: number of video frames, i.e. number of rows to return
    n_mfcc: number of MFCC coefficients

    returns a dict of arrays, each with total_frames rows:
        "mfcc"   : MFCCs (n_mfcc columns)
        "spec"   : spectral centroid, bandwidth and contrast stacked column-wise
        "chroma" : chroma features
        "flux"   : onset strength (1 column)
    video frames that lie beyond the end of the audio get all-zero rows
    raises ValueError if fps is not a positive number
    """
    if not (fps > 0):
        raise ValueError(f"fps must be positive, got {fps!r}")
    total_frames = max(int(total_frames), 0)

    # 1. load the full audio track
    y, sr = librosa.load(audio_path, sr=None)

    # 2. hop length ~ one video frame worth of samples
    # sr / fps is not always an integer; the remainder is handled in step 4
    samples_per_frame = sr / fps
    hop_length = max(1, int(round(samples_per_frame)))

    # 3. compute all features over the whole track at once
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)

    cent = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
    bw = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=hop_length)
    con = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop_length)

    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)

    # spectral flux (onset strength)
    flux = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

    # 4. analysis hop whose start is closest to each video frame's start
    # with an integer samples_per_frame this is simply 0, 1, 2, ...; otherwise
    # it keeps audio and video from drifting apart over a long video
    frame_to_hop = np.round(
        np.arange(total_frames) * samples_per_frame / hop_length
    ).astype(int)

    def align(features):
        """turn (n_features, n_hops) or (n_hops,) into (total_frames, n_features)"""
        per_hop = np.atleast_2d(features).T
        aligned = np.zeros((total_frames, per_hop.shape[1]), dtype=per_hop.dtype)
        # frames past the end of the audio keep their zero rows
        covered = frame_to_hop < per_hop.shape[0]
        aligned[covered] = per_hop[frame_to_hop[covered]]
        return aligned

    # 5. centroid, bandwidth and contrast share one matrix for easier handling
    spec_stacked = np.hstack([align(cent), align(bw), align(con)])

    return {
        "mfcc":   align(mfcc),
        "spec":   spec_stacked,
        "chroma": align(chroma),
        "flux":   align(flux)
    }

### Building Datasets

In [6]:
# visual features
hog_features   = []
lbp_features   = []
glcm_features  = []
sift_features  = []

# audio features
mfcc_frames    = []
spec_frames    = []
chroma_frames  = []
flux_frames    = []

frame_meta       = []  # list of (video_id, frame_idx)
labels_per_frame = []

video_paths = list_files("./data/episodes")

for video_path in video_paths:
    # the file name (without extension) is the numeric video id used in the ground truth
    video_id = int(os.path.splitext(os.path.basename(video_path))[0])
    print("processing video:", video_id)

    # get fps and total frames
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    # load per-frame audio features (one row per reported video frame)
    audio_path = f"./data/audio/{video_id}.wav"
    audio_feats = extract_audio_features_vectorized(
        audio_path,
        fps,
        total_frames,
        n_mfcc=20
    )

    # loop over frames and grab both visual+audio features
    # the fps / frame count repeated by the iterator are ignored so the values
    # read above are not rebound inside the loop
    for frame_idx, frame_bgr, _fps, _n_frames in video_iter_frames(video_path):
        # get label; frames without exactly one class are skipped
        label = get_label(video_id, frame_idx, gt_df)
        if label is None:
            continue

        frame_bgr = cv2.resize(frame_bgr, (128, 128), interpolation=cv2.INTER_LINEAR)

        # convert frame to grayscale (uint8, 0..255)
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        grayscale = color.rgb2gray(frame_rgb)
        grayscale = (grayscale * 255).astype(np.uint8)

        # meta + label
        frame_meta.append((video_id, frame_idx))
        labels_per_frame.append(label)

        # visual features
        hog_features.append(extract_hog_frame(grayscale))
        lbp_features.append(extract_lbp_frame(grayscale))
        glcm_features.append(extract_glcm_frame(grayscale))
        sift_features.append(extract_sift_frame(grayscale))

        # audio features: index by frame_idx
        # the reported frame count is only an estimate, so the decoder may
        # deliver a few frames more; those reuse the last audio row instead of
        # raising IndexError
        audio_idx = min(frame_idx, total_frames - 1)
        mfcc_frames.append(audio_feats["mfcc"][audio_idx])
        spec_frames.append(audio_feats["spec"][audio_idx])
        chroma_frames.append(audio_feats["chroma"][audio_idx])
        flux_frames.append(audio_feats["flux"][audio_idx])

# fail with a readable message instead of an obscure vstack error
if not labels_per_frame:
    raise RuntimeError("no labelled frames found; check ground truth and video ids")

# convert to arrays
X_hog_frames = np.vstack(hog_features)
X_lbp_frames = np.vstack(lbp_features)
X_glcm_frames = np.vstack(glcm_features)
X_sift_frames = np.vstack(sift_features)

X_mfcc_frames = np.vstack(mfcc_frames)
X_spec_frames = np.vstack(spec_frames)
X_chroma_frames = np.vstack(chroma_frames)
X_flux_frames = np.vstack(flux_frames)

y_labels = np.array(labels_per_frame)

processing video: 211
processing video: 244
processing video: 343


In [7]:
# shape overview: one banner per feature family, then one line per matrix
shape_report = [
    ("\n================ visual feature matrices ================", [
        ("X_hog_frames   :", X_hog_frames),
        ("X_lbp_frames   :", X_lbp_frames),
        ("X_glcm_frames  :", X_glcm_frames),
        ("X_sift_frames  :", X_sift_frames),
    ]),
    ("\n================ audio feature matrices =================", [
        ("X_mfcc_frames  :", X_mfcc_frames),
        ("X_spec_frames  :", X_spec_frames),
        ("X_chroma_frames:", X_chroma_frames),
        ("X_flux_frames  :", X_flux_frames),
    ]),
]

for banner, rows in shape_report:
    print(banner)
    for tag, matrix in rows:
        print(tag, matrix.shape)

print("\nnumber of labels:", len(y_labels))


X_hog_frames   : (19877, 8100)
X_lbp_frames   : (19877, 10)
X_glcm_frames  : (19877, 2)
X_sift_frames  : (19877, 128)

X_mfcc_frames  : (19877, 20)
X_spec_frames  : (19877, 9)
X_chroma_frames: (19877, 12)
X_flux_frames  : (19877, 1)

number of labels: 19877


### Comparing Features

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score

def evaluate_feature(X, y, name):
    """
    train and score an RBF SVM on a single feature matrix

    X: feature matrix, one row per sample
    y: labels, one per row of X
    name: label printed next to the score

    does a stratified 80/20 split seeded with SEED, standardises the features
    with statistics from the training part only, fits the SVM and prints and
    returns the macro-averaged F1 on the test part

    NOTE(review): the split is random over individual samples; if the rows are
    consecutive video frames, near-identical neighbours can end up on both
    sides of the split and inflate the score. confirm whether that is intended
    """
    # seeded split, so repeated calls with the same y use the same seed
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    # scaling statistics come from the training rows only
    scaler = StandardScaler().fit(X_tr)

    model = SVC(kernel="rbf", C=10, gamma="scale")
    model.fit(scaler.transform(X_tr), y_tr)

    predictions = model.predict(scaler.transform(X_te))
    score = f1_score(y_te, predictions, average="macro")

    print(f"{name:12s}  f1 = {score:.4f}")
    return score

In [9]:
# (feature matrix, display name) pairs, grouped by modality
visual_features = [
    (X_hog_frames,   "HOG"),
    (X_lbp_frames,   "LBP"),
    (X_glcm_frames,  "GLCM"),
    (X_sift_frames,  "SIFT"),
]

audio_features = [
    (X_mfcc_frames,   "MFCC"),
    (X_spec_frames,   "Spectral"),
    (X_chroma_frames, "Chroma"),
    (X_flux_frames,   "Flux"),
]

# score every feature on its own; evaluate_feature prints one line per feature
print("\n---- visual feature comparison ----")
visual_scores = [
    (name, evaluate_feature(X, y_labels, name)) for X, name in visual_features
]

print("\n---- audio feature comparison ----")
audio_scores = [
    (name, evaluate_feature(X, y_labels, name)) for X, name in audio_features
]

# best f1 first
visual_ranked = sorted(visual_scores, key=lambda pair: pair[1], reverse=True)
audio_ranked  = sorted(audio_scores,  key=lambda pair: pair[1], reverse=True)

for heading, ranked in (
    ("\n===== ranking: visual features =====", visual_ranked),
    ("\n===== ranking: audio features =====", audio_ranked),
):
    print(heading)
    for name, f1 in ranked:
        print(f"{name:12s}  f1 = {f1:.4f}")


---- visual feature comparison ----
HOG           f1 = 0.9997
LBP           f1 = 0.9836
GLCM          f1 = 0.8156
SIFT          f1 = 0.9891

---- audio feature comparison ----
MFCC          f1 = 0.8296
Spectral      f1 = 0.5399
Chroma        f1 = 0.4268
Flux          f1 = 0.2497

===== ranking: visual features =====
HOG           f1 = 0.9997
SIFT          f1 = 0.9891
LBP           f1 = 0.9836
GLCM          f1 = 0.8156

===== ranking: audio features =====
MFCC          f1 = 0.8296
Spectral      f1 = 0.5399
Chroma        f1 = 0.4268
Flux          f1 = 0.2497


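HOG and SIFT achieve near-perfect visual classification performance (LBP is close behind), while MFCC is the only audio feature that contributes meaningfully. Note that frames were split into train and test at random, so near-identical neighbouring frames of the same scene can appear on both sides; the visual scores are therefore likely optimistic.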