In [None]:
# Template to solve the problem of editing a sequence of images to match
# the spectrogram of a song.
# This is only a very basic idea; it needs plenty of improvement.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import Audio
from scipy.fft import fft, ifft, fft2, ifft2, fftshift, ifftshift
from skimage.io import imread
from cv2 import VideoWriter, VideoWriter_fourcc
from skimage.transform import resize
from skimage import util

In [2]:
# AUX function: convert (real, imaginary) pairs into (magnitude, phase) pairs
def real_imag2magn_phas(real, imag):
    """Convert rectangular components into polar form.

    Parameters
    ----------
    real, imag : scalar or numpy.ndarray
        Real and imaginary parts, combined element-wise.

    Returns
    -------
    tuple
        ``(magn, phas)`` where ``magn = sqrt(real*real + imag*imag)`` and
        ``phas = arctan2(imag, real)`` (radians, quadrant-aware).
    """
    squared_norm = real * real + imag * imag
    angle = np.arctan2(imag, real)
    return np.sqrt(squared_norm), angle

In [3]:
# AUX function: convert (magnitude, phase) pairs into (real, imaginary) pairs
def magn_phas2real_imag(magnitude, phase):
    """Convert polar components back into rectangular form.

    Parameters
    ----------
    magnitude, phase : scalar or numpy.ndarray
        Magnitude and phase (radians), combined element-wise.

    Returns
    -------
    tuple
        ``(real, imag)`` with ``real = magnitude * cos(phase)`` and
        ``imag = magnitude * sin(phase)``.
    """
    return magnitude * np.cos(phase), magnitude * np.sin(phase)

# READ AUDIO FILE

In [4]:
# Read audio file: `rate` is the sampling frequency in Hz, `audio_data` holds the samples
rate, audio_data = wavfile.read("GoT_intro.wav")
N = len(audio_data)  # number of samples per channel
L = N / rate         # duration in seconds
print(f"Rate: {rate} Hz")
print(f"Length (n): {N}")
print(f"Length (s): {L:.2f}")

# Split channels
if audio_data.ndim == 1:
    # Mono file: wavfile.read returns a 1-D array, so there is no second column to index.
    # Reuse the single channel for both so the rest of the notebook still runs.
    channel1 = channel2 = audio_data
else:
    channel1 = audio_data[:, 0]
    channel2 = audio_data[:, 1]

Rate: 44100 Hz
Length (n): 4485120
Length (s): 101.70


In [5]:
# Sliding-window representation: split a signal into overlapping chunks of 512 samples,
# with consecutive chunks starting `stride_size` (256) samples apart.
stride_size = 256
def slice_signal(in_signal, window_size=512, step=None):
    """Return a sliding-window view of a 1-D signal.

    Parameters
    ----------
    in_signal : numpy.ndarray
        1-D array of samples.
    window_size : int, optional
        Number of samples per window (default 512, the value previously hard-coded).
    step : int, optional
        Distance in samples between the starts of consecutive windows.
        Defaults to the notebook-level ``stride_size``.

    Returns
    -------
    numpy.ndarray
        2-D array with one window per row, as produced by
        ``skimage.util.view_as_windows``.
    """
    if step is None:
        # Looked up at call time so that changing `stride_size` in the notebook takes effect.
        step = stride_size
    return util.view_as_windows(in_signal, window_shape=(window_size,), step=step)

In [6]:
# Sliding-window representation of both channels (one window per row, see slice_signal)
intervals_chan1, intervals_chan2 = (slice_signal(chan) for chan in (channel1, channel2))

# Number of windows; one video frame is generated per window further down
n_intervals = intervals_chan1.shape[0]

print(f"Sliced channel 1: {intervals_chan1.shape}")
print(f"Sliced channel 2: {intervals_chan2.shape}")

Sliced channel 1: (17519, 512)
Sliced channel 2: (17519, 512)


In [7]:
# Audio spectra: FFT of every window (row) of each channel, stored as magnitude and phase
X_chan1, X_chan2 = (fft(windows, axis=1) for windows in (intervals_chan1, intervals_chan2))

magn_chan1, phas_chan1 = real_imag2magn_phas(np.real(X_chan1), np.imag(X_chan1))
magn_chan2, phas_chan2 = real_imag2magn_phas(np.real(X_chan2), np.imag(X_chan2))

# READ IMAGE

In [8]:
# Load the image and resample it to 512 x 512 pixels
img = resize(imread('GoT_image.jpg'), (512, 512))
n_rows, n_cols, n_channs = img.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print(f"Number of channels: {n_channs}")
# Value range of the resized image (minimum first, then maximum)
print(img.min(), img.max(), sep="\n")

Number of rows: 512
Number of columns: 512
Number of channels: 3
1.8175910501454636e-06
0.9999745462455002


In [9]:
# Image spectrum, split into magnitude and phase.
# NOTE(review): fft2 is called with its default axes, i.e. the last two axes of `img`.
# `img.shape` is unpacked as (rows, cols, channels) when the image is loaded, so this
# transforms the column and channel axes rather than the two spatial axes -- confirm
# this is intended. Any change here must be mirrored in the ifft2 call that builds
# the frames.
IMG = fft2(img)
magn_IMG, phas_IMG = real_imag2magn_phas(np.real(IMG), np.imag(IMG))

# Now combine them

In [10]:
# AUX function to modulate an image spectrum with a sound spectrum
# This is the function in which you should model your own design
def modulate(IMG, modulator):
    """Multiply every channel of ``IMG`` element-wise by ``modulator``.

    Parameters
    ----------
    IMG : numpy.ndarray
        Array of shape (rows, cols, channels), or (rows, cols) for a single channel.
    modulator : scalar or numpy.ndarray
        Must broadcast against one (rows, cols) channel. A 1-D array of length
        ``cols`` scales along the columns; a (rows, 1) array scales along the rows.

    Returns
    -------
    numpy.ndarray
        New array with the shape of ``IMG``; ``IMG`` itself is not modified.
        The dtype follows NumPy's usual promotion of the two operands.
    """
    IMG = np.asarray(IMG)
    modulator = np.asarray(modulator)
    if IMG.ndim == 2:
        return IMG * modulator
    # A trailing axis of length 1 applies the same modulator to every channel. The
    # channel count comes from IMG itself rather than from a notebook-level global.
    return IMG * modulator[..., np.newaxis]

In [11]:
# Generate every frame of the new video (one frame per audio window).
# NOTE: all frames are accumulated in `out_images` before anything is written to disk,
# so memory use grows with the number of windows.
out_images = []
for ind_interval in range(n_intervals):
    # Channel 1: a 1-D spectrum broadcasts along the columns of each image channel.
    modulated = modulate(magn_IMG, magn_chan1[ind_interval, :])
    # Channel 2: reshaped to a column vector so that it scales along the rows instead.
    # (Transposing a 1-D array with `.T` is a no-op, so it would act on the columns again.)
    modulated = modulate(modulated, magn_chan2[ind_interval, :, np.newaxis])
    img_mod = np.real(ifft2(modulated))

    # Min-max normalisation to [0, 1]. Without it the values are unbounded and the
    # conversion to uint8 below would wrap around or be invalid. Normalising per frame
    # is a design choice; a global scale is an alternative.
    img_mod -= img_mod.min()
    peak = img_mod.max()
    if peak != 0:
        img_mod /= peak

    img_mod = np.round(255 * img_mod).astype('uint8')  # format used for saving to disk
    out_images.append(img_mod)

In [16]:
# Save sequence of frames to disk.
# One frame is generated per audio window and windows start every `stride_size` samples,
# so the frame rate that keeps the video in step with the audio is rate / stride_size.
# It is rounded to two decimals, like the literal it replaces; presumably this keeps the
# encoder's rational time base small -- confirm if the writer fails to open.
fps = round(rate / stride_size, 2)
# VideoWriter takes the frame size as (width, height).
out = VideoWriter('frame_seq.mp4', VideoWriter_fourcc(*'MP4V'), fps, (n_cols, n_rows))
try:
    if not out.isOpened():
        raise RuntimeError("Could not open 'frame_seq.mp4' for writing")
    for im in out_images:
        # OpenCV expects BGR channel order while the image was loaded as RGB, so reverse
        # the channel axis. The copy gives the writer a contiguous buffer.
        out.write(np.ascontiguousarray(im[:, :, ::-1]))
finally:
    # Always finalise the file, even if writing a frame fails.
    out.release()

Once the frame sequence is on disk, combine it with the audio track to produce the final video. From the command line, run (replacing `original_audio.mp3` with the path to your audio file):
ffmpeg -i frame_seq.mp4 -i original_audio.mp3 -codec copy -shortest final_video.mp4