In [1]:
import torch
import torchaudio
import whisper
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

ModuleNotFoundError: No module named 'whisper'

In [None]:
# Input media files, given relative to the notebook's working directory.
AUDIO_FILE = "audio.wav"
VIDEO_FILE = "video.mp4"

# Directory that receives the extracted frames and the dataset CSV.
OUTPUT_DIR = Path("outputs")
# parents=True keeps this cell working if OUTPUT_DIR is later changed to a
# nested path; exist_ok=True makes the cell safe to re-run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the audio file into a tensor together with its sample rate.
waveform, sample_rate = torchaudio.load(AUDIO_FILE)
print(f"Audio: {waveform.shape}, Sample: {sample_rate}")

# Mel spectrogram of the waveform, then converted to decibels.
mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate)
db_transform = torchaudio.transforms.AmplitudeToDB()
spectro = mel_transform(waveform)
spec_db = db_transform(spectro)

# Show only index 0 along the first dimension of the dB spectrogram
# (presumably the first audio channel -- confirm against the input file).
spec_db_image = spec_db[0].numpy()
plt.figure(figsize=(8, 4))
plt.imshow(spec_db_image, aspect="auto", origin="lower")
plt.title("Spectrogram")
plt.colorbar()
plt.show()

# 13 MFCC coefficients, averaged over dim 2 (presumably the time axis) and
# squeezed into a NumPy array for the dataset assembled later on.
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=13)
mfcc = mfcc_transform(waveform)
mfcc_mean = mfcc.mean(dim=2)
mfcc_features = mfcc_mean.squeeze().numpy()
print("MFCC:", mfcc_features)


In [None]:
# Speech-to-text: transcribe the audio file with a Whisper model.
# "small" is the checkpoint used here; "tiny" is an alternative when GPU
# resources are limited.
model = whisper.load_model(name="small")
result = model.transcribe(audio=AUDIO_FILE)

# Keep only the transcribed text from the result.
transcript = result["text"]
print("Transcript:", transcript)


In [None]:
# Read the video with OpenCV and save one frame every FRAME_INTERVAL_SEC
# seconds as a JPEG in OUTPUT_DIR.
FRAME_INTERVAL_SEC = 2  # spacing between saved frames, in seconds

# Reset up front so a failed run never leaves a stale list from an earlier
# execution of this cell in the kernel.
frame_paths = []

cap = cv2.VideoCapture(VIDEO_FILE)
try:
    # VideoCapture does not raise for a missing or unreadable file, so the
    # open state has to be checked explicitly.
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video file: {VIDEO_FILE}")

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    print(f"Video frames: {frame_count}, FPS: {fps}")

    # `not fps > 0` also rejects NaN; without a frame rate the sampling
    # interval in frames cannot be derived.
    if not fps > 0:
        raise RuntimeError(f"Video reports no valid FPS ({fps}): {VIDEO_FILE}")

    # Clamp to 1 so a very low frame rate cannot yield a zero range() step.
    frame_step = max(1, int(fps * FRAME_INTERVAL_SEC))

    for i in range(0, frame_count, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            # Frame could not be read; skip it and try the next position.
            continue
        frame_path = OUTPUT_DIR / f"frame_{i}.jpg"
        # Record the path only if the image was actually written.
        if cv2.imwrite(str(frame_path), frame):
            frame_paths.append(frame_path)
finally:
    # Release the capture handle even when an error interrupts extraction.
    cap.release()

print("Extracted frames:", frame_paths[:5])


In [None]:
# Combine the transcript, MFCC features and frame paths into a one-row table.
# Each value is wrapped in a single-element list so it occupies one cell.
data = dict(
    transcript=[transcript],
    mfcc_features=[mfcc_features.tolist()],
    frame_paths=[[str(path) for path in frame_paths]],
)
df = pd.DataFrame(data)

# Persist the table next to the extracted frames, without the index column.
dataset_csv = OUTPUT_DIR / "multimodal_dataset.csv"
df.to_csv(dataset_csv, index=False)

df  # last expression of the cell, so the notebook renders the table