In [18]:
# imports
import numpy as np
import pandas as pd
import os, glob, warnings
import cv2
warnings.filterwarnings("ignore")

from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# config
SEED = 12332287
np.random.seed(SEED)

CLASSES = ["MissPiggy", "OtherPigs", "SwedishChef", "Rowlf"]

# helpers
def list_files(directory):
    """list files in a folder"""
    return sorted(glob.glob(directory + "/*"))

GT_REQUIRED = [
    "Video", "Frame_number", "Timestamp",
    "Kermit", "Pigs", "Miss Piggy", "Cook",
    "StatlerWaldorf", "Rowlf the Dog", "Fozzie Bear"
]

GT_TO_INTERNAL = {
    "Miss Piggy": "MissPiggy",
    "Pigs": "OtherPigs",
    "Cook": "SwedishChef",
    "Rowlf the Dog": "Rowlf",
}

def read_ground_truth(gt_dir):
    """reads all ground truth xlsx files and returns df indexed by (Video, Frame_number)"""
    files = sorted(glob.glob(os.path.join(gt_dir, "*.xlsx")))
    if not files:
        raise FileNotFoundError("no ground truth xlsx found in gt_dir")

    dfs = []
    for path in files:
        df = pd.read_excel(path)

        # verify headers
        missing = [c for c in GT_REQUIRED if c not in df.columns]
        if missing:
            raise ValueError(f"{os.path.basename(path)} missing columns: {missing}")

        # enforce numeric dtypes
        df["Video"] = pd.to_numeric(df["Video"], errors="coerce").astype("Int64")
        df["Frame_number"] = pd.to_numeric(df["Frame_number"], errors="coerce").astype("Int64")

        # map exact GT columns to internal class names
        for gt_col, internal in GT_TO_INTERNAL.items():
            df[internal] = pd.to_numeric(df[gt_col], errors="coerce").fillna(0).astype(int)

        dfs.append(df[["Video", "Frame_number"] + list(GT_TO_INTERNAL.values())])

    gt = pd.concat(dfs, ignore_index=True)
    gt = gt.dropna(subset=["Video", "Frame_number"])
    gt["Video"] = gt["Video"].astype(int)
    gt["Frame_number"] = gt["Frame_number"].astype(int)

    gt = gt.set_index(["Video", "Frame_number"]).sort_index()

    for c in CLASSES:
        if c not in gt.columns:
            gt[c] = 0
        gt[c] = gt[c].astype(int)

    return gt

def video_iter_frames(path):
    """go through video frame by frame and yield (idx, frame_bgr, fps, nframes)"""
    capture = cv2.VideoCapture(path) # open video
    if not capture.isOpened():
        raise RuntimeError(f"cannot open video: {path}")

    fps = capture.get(cv2.CAP_PROP_FPS) # frames per second
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

    frame_index = 0
    try:
        while True:
            ok, frame = capture.read() # read next frame
            if not ok:
                break
            yield frame_index, frame, fps, total_frames
            frame_index += 1
    finally:
        capture.release()

In [19]:
list_files("./data/episodes")

['./data/episodes/211.avi',
 './data/episodes/244.avi',
 './data/episodes/343.avi']

In [20]:
read_ground_truth("./data/ground_truth")

Unnamed: 0_level_0,Unnamed: 1_level_0,MissPiggy,OtherPigs,SwedishChef,Rowlf
Video,Frame_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
211,0,0,0,0,0
211,1,0,0,0,0
211,2,0,0,0,0
211,3,0,0,0,0
211,4,0,0,0,0
...,...,...,...,...,...
343,38493,0,0,0,0
343,38494,0,0,0,0
343,38495,0,0,0,0
343,38496,0,0,0,0


### Visual Feature Extractors
Extracting HOG to find the silhouette of pig snouts or ears, the chef's hat, and rowlf’s ears.

In [21]:
def extract_hog(video_path, gt, frame_step=5, max_frames=None):
    """
    build hog features + labels for a single video.
    returns:
        X  : np.array [n_samples, n_features]
        y  : np.array [n_samples] of class names
        meta : list of (video_id, frame_idx)
    """
    X = []
    y = []
    meta = []

    # filename is "244.avi" and gt Video column is 244
    video_id = int(os.path.splitext(os.path.basename(video_path))[0])

    used_frames = 0

    for index, frame_bgr, fps, total_frames in video_iter_frames(video_path):

        # sample every kth frame
        if index % frame_step != 0:
            continue

        key = (video_id, index)
        if key not in gt.index:
            continue

        row = gt.loc[key]
        label_vec = row[CLASSES].values.astype(int)

        # skip frames without target characters
        if label_vec.sum() == 0:
            continue

        # choose the first active class as label
        label = CLASSES[label_vec.argmax()]

        # hog features
        gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
        gray = cv2.resize(gray, (128, 128))
        feats = hog(
            gray,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm="L2-Hys",
            feature_vector=True,
        )

        X.append(feats)
        y.append(label)
        meta.append((video_id, index))

        used_frames += 1
        if max_frames is not None and used_frames >= max_frames:
            break

    if not X:
        raise RuntimeError(f"no usable frames for {video_path}")

    return np.vstack(X), np.array(y), meta

In [22]:
# paths
episodes_dir = "data/episodes"
gt_dir = "data/ground_truth"

# load gt and videos
gt = read_ground_truth(gt_dir)
audio_paths = list_files(episodes_dir)

all_x = []
all_y = []
all_meta = []

# loop over all videos and extract hog
for audio_file in audio_paths:
    print("processing", audio_file)
    x_v, y_v, meta_v = extract_hog(
        audio_file,
        gt,
        frame_step=5,
        max_frames=1000,
    )
    all_x.append(x_v)
    all_y.append(y_v)
    all_meta.extend(meta_v)

# stack features and labels from all videos
x_hog = np.vstack(all_x)
y_hog = np.hstack(all_y)

print("X_hog shape:", x_hog.shape)
print("y_hog shape:", y_hog.shape)
print("class counts:", pd.Series(y_hog).value_counts())

processing data/episodes/211.avi
processing data/episodes/244.avi
processing data/episodes/343.avi
X_hog shape: (3000, 8100)
y_hog shape: (3000,)
class counts: OtherPigs      1244
Rowlf           819
MissPiggy       743
SwedishChef     194
Name: count, dtype: int64


In [23]:
print(x_hog)

[[0.26689535 0.1466283  0.09171098 ... 0.         0.33876278 0.        ]
 [0.26873102 0.15703764 0.06709208 ... 0.         0.30095953 0.        ]
 [0.27175199 0.11865204 0.07944683 ... 0.         0.28252492 0.        ]
 ...
 [0.3168523  0.0769396  0.05145164 ... 0.23355003 0.20724286 0.23251784]
 [0.31956112 0.07839751 0.05242659 ... 0.21586046 0.25690001 0.11086122]
 [0.32404294 0.07925723 0.05300151 ... 0.2382452  0.2382452  0.19421682]]


In [24]:
print(y_hog)

['Rowlf' 'Rowlf' 'Rowlf' ... 'OtherPigs' 'OtherPigs' 'OtherPigs']


In [25]:
print(all_meta)

[(211, 1120), (211, 1125), (211, 1130), (211, 1135), (211, 1140), (211, 1145), (211, 1150), (211, 1155), (211, 1160), (211, 1165), (211, 1170), (211, 1175), (211, 1180), (211, 1185), (211, 1190), (211, 1195), (211, 1200), (211, 1205), (211, 1210), (211, 1215), (211, 1220), (211, 1225), (211, 1230), (211, 1235), (211, 1240), (211, 1245), (211, 1250), (211, 1255), (211, 1260), (211, 1265), (211, 1270), (211, 1275), (211, 1280), (211, 1285), (211, 1290), (211, 1295), (211, 1300), (211, 1305), (211, 1310), (211, 1315), (211, 1320), (211, 1325), (211, 1330), (211, 1335), (211, 1340), (211, 1650), (211, 1655), (211, 1660), (211, 1665), (211, 1670), (211, 1675), (211, 1680), (211, 1685), (211, 1690), (211, 1695), (211, 1700), (211, 1705), (211, 1710), (211, 1715), (211, 1720), (211, 1725), (211, 1750), (211, 1755), (211, 1760), (211, 1765), (211, 1770), (211, 1775), (211, 1780), (211, 1785), (211, 1790), (211, 1795), (211, 1800), (211, 1805), (211, 1810), (211, 1940), (211, 1945), (211, 1950)

In [26]:
episodes_dir = "data/episodes"
gt_dir = "data/ground_truth"

gt = read_ground_truth(gt_dir)
audio_paths = list_files(episodes_dir)

all_x = []
all_y = []
all_meta = []

for audio_file in audio_paths:
    print("processing", audio_file)
    x_v, y_v, meta_v = extract_hog(
        audio_file,
        gt,
        frame_step=5,
        max_frames=1000,
    )
    all_x.append(x_v)
    all_y.append(y_v)
    all_meta.extend(meta_v)

x_hog = np.vstack(all_x)
y_hog = np.hstack(all_y)

df = pd.DataFrame(x_hog)
df["label"] = y_hog
df.head()

processing data/episodes/211.avi
processing data/episodes/244.avi
processing data/episodes/343.avi


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8091,8092,8093,8094,8095,8096,8097,8098,8099,label
0,0.266895,0.146628,0.091711,0.043054,0.044173,0.0,0.0,0.018298,0.042063,0.273886,...,0.465317,0.093776,0.118619,0.0,0.167752,0.093776,0.0,0.338763,0.0,Rowlf
1,0.268731,0.157038,0.067092,0.02485,0.039128,0.0,0.018445,0.0,0.0,0.268731,...,0.352919,0.0,0.352919,0.0,0.352919,0.115202,0.0,0.30096,0.0,Rowlf
2,0.271752,0.118652,0.079447,0.039963,0.029741,0.0,0.0,0.021159,0.0,0.26106,...,0.315238,0.0,0.315238,0.0,0.315238,0.108146,0.0,0.282525,0.0,Rowlf
3,0.266466,0.113254,0.066911,0.052251,0.028388,0.0,0.0,0.020197,0.0,0.25311,...,0.328797,0.0,0.069159,0.10935,0.328797,0.10935,0.0,0.285672,0.0,Rowlf
4,0.271609,0.089053,0.071054,0.049336,0.029081,0.0,0.0,0.02069,0.0,0.223742,...,0.326625,0.0,0.069475,0.10985,0.326625,0.10985,0.0,0.286977,0.0,Rowlf


### Training Classifiers
With the visual data, we will train the classifiers.

In [27]:
# split data
x_train, x_test, y_train, y_test = train_test_split(
    x_hog, y_hog,
    test_size=0.2,
    random_state=SEED,
    stratify=y_hog
)

# classifier 1: SVM
svm_clf = Pipeline([
    # scale features so svm works well
    ("scaler", StandardScaler()),
    ("clf", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        random_state=SEED
    )),
])

In [28]:
# train: SVM
svm_clf.fit(x_train, y_train)
svm_pred = svm_clf.predict(x_test)

print("------------------------- SVM -------------------------")
print(classification_report(y_test, svm_pred))
print(confusion_matrix(y_test, svm_pred))


------------------------- SVM -------------------------
              precision    recall  f1-score   support

   MissPiggy       0.99      0.97      0.98       148
   OtherPigs       0.97      1.00      0.98       249
       Rowlf       1.00      0.99      0.99       164
 SwedishChef       1.00      0.95      0.97        39

    accuracy                           0.98       600
   macro avg       0.99      0.98      0.98       600
weighted avg       0.99      0.98      0.98       600

[[143   5   0   0]
 [  0 249   0   0]
 [  1   1 162   0]
 [  0   2   0  37]]


#### Interpretation

- Performance is very strong: 98% frame-level accuracy using classical HOG features with an SVM (rbf kernel).
- High precision (~1.00). The model almost never predicts a character that isn't actually there (very few false positives).
- Recall is near perfect for pigs and Rowlf, but slightly lower (0.95) for Swedish Chef because the class only has 39 frames. This small sample size causes a few missed detections.
- Confusion matrix shows:
  - 5 Miss Piggy frames predicted as OtherPigs (pig-heavy background + similar gradients),
  - 2 Swedish Chef frames confused as OtherPigs,
  - 2 Rowlf frames confused with pig classes (pose/background similarity)

### Audio Feature Extraction

In [29]:
import librosa
import numpy as np
import moviepy.editor as mp

def video_to_audio(video_path):
    fname = os.path.basename(video_path).replace(".avi", ".wav")
    audio_out = os.path.join("data/audio", fname)  # save in data/audio

    # load video file and extract audio
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(audio_out, codec="pcm_s16le")
    video.close()

def extract_mfcc(audio_path, n_mfcc=20):
    y, sr = librosa.load(audio_path, sr=None) # load audio file
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc) # compute mfccs
    # summarize over time
    mfcc_mean = mfccs.mean(axis=1)
    mfcc_std  = mfccs.std(axis=1)
    return np.concatenate([mfcc_mean, mfcc_std])  # single vector

In [30]:
# separating audio from video
video_paths = list_files("./data/episodes")
for v in video_paths:
    video_to_audio(v)

MoviePy - Writing audio in data/audio/211.wav


                                                                         

MoviePy - Done.
MoviePy - Writing audio in data/audio/244.wav


                                                                         

MoviePy - Done.
MoviePy - Writing audio in data/audio/343.wav


                                                                         

MoviePy - Done.




In [31]:
audio_paths = list_files("./data/audio")

# loop over all audios and extract mfcc
video_to_mfcc = {}   # maps video_id -> mfcc vector

for audio_file in sorted(audio_paths):
    print("processing", audio_file)
    vec = extract_mfcc(audio_file)

    # video id from filename: data/audio/211.wav -> 211
    fname = os.path.basename(audio_file)      # '211.wav'
    vid = int(os.path.splitext(fname)[0])    # 211

    video_to_mfcc[vid] = vec

X_mfcc = np.vstack(list(video_to_mfcc.values()))
print(X_mfcc)

processing ./data/audio/211.wav
processing ./data/audio/244.wav
processing ./data/audio/343.wav
[[-317.0725      163.33305     -52.551086     19.28571      -5.6979747
     3.4002426    -4.929971    -12.201356     -3.6104012    -9.246938
    -3.3847516    -2.126348     -3.1406577    -3.6806903    -3.1279316
    -1.9043978    -2.1631641    -1.0664868    -2.9470253    -2.4984314
    86.03214      35.17598      38.416107     25.774948     23.999083
    16.861155     12.86876      14.479614     11.357522     10.7253
    10.283087      8.93081       9.4097805     8.429411      8.150798
     8.019977      7.7327414     7.3084307     7.2715178     7.344952  ]
 [-327.12415     146.12303     -53.093285     23.13055     -11.519503
     7.5332294    -7.7935543    -8.777845     -2.2951272    -7.544058
    -1.8032686    -1.693064     -3.0165005    -1.6739421    -1.6428167
    -1.0939364    -0.66888976    0.43510795   -1.0586193    -2.052682
    89.97931      38.400227     36.370346     25.818521    

In [36]:
# build audio feature row for each frame
x_audio_per_frame = np.vstack([
    video_to_mfcc[vid]   # pick mfcc vector for that episode
    for (vid, frame_index) in all_meta
])

#combine audio + visual features
x_audiovisual = np.hstack([x_hog, x_audio_per_frame])
print("x_av shape:", x_audiovisual.shape)   # (3000, 8140) for example


x_av shape: (3000, 8140)
