# Custom DFT for hi-res spectrograms

## install, import, init, upload

In [0]:
%%capture
!apt -y install libcusparse8.0 libnvrtc8.0 libnvtoolsext1
!ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so
!pip install cupy-cuda80

In [0]:
%%capture
!apt-get install ffmpeg # sometimes need runtime restart

In [2]:
from google.colab import files
from glob import glob
from PIL import Image
import os
import numpy as np
import cupy as cp
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.animation as manimation
import matplotlib.pyplot as plt
import zipfile
# import sys
# plt.rcParams['figure.figsize'] = [20, 20]
# plt.rcParams['figure.figsize'] = [30, 30]
plt.rcParams['figure.figsize'] = [10, 10]
print(manimation.writers.list()) # should contains 'ffmpeg'

['ffmpeg', 'ffmpeg_file', 'html']


In [0]:
def spectrogram_column(chunk, T, F):
    E = cp.array(np.exp(-2j * np.pi * F * T), dtype=np.complex64) # complex64: Complex number, represented by two 32-bit floats (real and imaginary components)

    # maybe multiply by window here
    
    corr = cp.dot(E, chunk)
    magnitude = cp.abs(corr)
#     print(magnitude.shape)
#     column = cp.asnumpy(magnitude.T)
    column = magnitude
    return column

In [4]:
spectrogram_column(cp.random.random(1000),  rate=44100).max()

array(199.67696038)

# Make Video

In [0]:
uploaded = files.upload()
for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

In [1]:
!ls

datalab  sample_data


In [4]:
from scipy.io import wavfile
rate, track = wavfile.read('mkns_44k_8s_a4561045.wav') # now supports only mono .wav files
print(f'rate {rate}')

wav_dtype = track.dtype

wav_max = np.iinfo(wav_dtype).max
wav_min = np.iinfo(wav_dtype).min

track = track.astype(np.float32)
track = (track - wav_min) * 2 / (wav_max - wav_min) -1 # normalise -1..1

print(track.dtype)
# track = x[:20000]
# left_channel  = x[:, 0]
# right_channel = x[:, 1]
# track = left_channel
# average left and right channels to mono signal (reaaly bad, don't do this)
# track = (left_channel + right_channel) / 2 
n = len(track)
seconds = n/rate 
print(f'{seconds} seconds')
t = np.linspace(0, seconds, n, dtype=track.dtype)
print(t.dtype)
# plt.grid()
# plt.plot(t, track, 'k-', linewidth=0.1)
# track_chunks = track_to_chunks(track)
# print(track_chunks.shape)

rate 44100
float32
8.0 seconds
float32


In [5]:
fps = 60
frames = fps * seconds
print(f'{frames} frames')

480.0 frames


In [6]:
# use supersolver (lite-solver) to find nperseg and step
nperseg = 10_000
step    = 715 # abap
noverlap = nperseg - step

# if 1 column per frame then number_of_columns = number_of_frames = fps * seconds
print((n - noverlap) // step, fps * seconds)

480 480.0


In [7]:
x = track
shape   = ((x.shape[0] - noverlap) // step, nperseg)
strides = (step * x.strides[-1], x.strides[-1])
audio_rolled = cp.array(np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides))
audio_rolled.shape

(480, 10000)

In [0]:
FFMpegWriter = manimation.writers['ffmpeg']
metadata = dict(
    title='Movie Test', 
    artist='Matplotlib',
)
writer = FFMpegWriter(
    fps=60,
    # codec='ffv1', default codec is 'h264'
    # bitrate=
    metadata=metadata
)

frame_width  = 1920//4
frame_height = 1080//4

frame = cp.zeros((frame_width, frame_height))

chunk = audio_rolled[0]
t = np.linspace(0, nperseg / rate, nperseg, dtype=chunk.dtype)
f = np.geomspace(30, nperseg // 2, frame_height) # second arg: nperseg//2 (Nyquist limit)    
T, F = np.meshgrid(t, f)

t_frame = np.linspace(0, seconds, frame_width)
f_frame = np.geomspace(30, nperseg // 2, frame_height) # second arg: nperseg//2 (Nyquist limit)    

T_frame, F_frame = np.meshgrid(t_frame, f_frame)

# frame = cp.random.random((1920, 1080))
fig = plt.figure(figsize=(frame_width / 100, frame_height / 100))

print(frame.dtype)

# fig.subplots_adjust(left=0.1, bottom=0, right=0.9, top=1, wspace=None, hspace=None)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=None, hspace=None)

im = plt.pcolormesh(
    T_frame,
    F_frame, 
    cp.asnumpy(frame.T),
    cmap='viridis',
    vmin=0,
    vmax=300,
#     origin='lower',
)


sa = 261.63 # C4 Hz
semitones_range = np.arange(13)
freqlines = sa * 2**(semitones_range/12)
plt.hlines(freqlines, xmin=0, xmax=seconds, linewidth=0.3, color='w')
# notes = ['Sa', 're', 'Re', 'ga', 'Ga', 'ma', 'Ma', 'Pa', 'dha', 'Dha', 'ni', 'Ni', 'Sa\'']
plt.axvline(x=seconds/2, linewidth=0.5, color='w')

plt.semilogy()
plt.grid(False)


# initial half-width rendering
for i, chunk in enumerate(audio_rolled[:frame_width//2]):
    print(f'{i+1}/{len(audio_rolled)}')
    frame = cp.roll(frame, -1, axis=0)
    frame[-1] = spectrogram_column(chunk, T, F)

with writer.saving(fig, 'writer_test.mp4', dpi=100):
    for i, chunk in enumerate(audio_rolled[frame_width//2:]):
#     for i, chunk in enumerate(audio_rolled):
        print(f'{i+1}/{len(audio_rolled)}')
        frame = cp.roll(frame, -1, axis=0)
        frame[-1] = spectrogram_column(chunk, T, F)
        # im.set_array(cp.asnumpy(frame.T))
        im.set_array(cp.asnumpy(frame.T)[:-1,:-1].ravel())
        writer.grab_frame()
        
    # post half-frame frames
    for i in range(frame_width//2):
        print(f'{i+1}/{len(audio_rolled)}')
        frame = cp.roll(frame, -1, axis=0)
        frame[-1] = 0
        im.set_array(cp.asnumpy(frame.T)[:-1,:-1].ravel())
        writer.grab_frame()

In [21]:
(np.ones(frame.T.shape) * np.arange(frame_width)).shape
F.shape

(1080, 10000)

In [0]:
files.download('writer_test.mp4')

In [91]:
!ls -alh

total 744K
drwxr-xr-x 1 root root 4.0K Aug 21 11:22 .
drwxr-xr-x 1 root root 4.0K Aug 21 09:46 ..
drwx------ 4 root root 4.0K Aug 21 09:47 .cache
drwxr-xr-x 1 root root 4.0K Aug 21 09:47 .config
drwxr-xr-x 3 root root 4.0K Aug 21 10:34 .cupy
lrwxrwxrwx 1 root root    8 Aug 15 20:56 datalab -> /content
drwxr-xr-x 4 root root 4.0K Aug 21 09:47 .forever
drwxr-xr-x 5 root root 4.0K Aug 21 09:47 .ipython
drwx------ 3 root root 4.0K Aug 21 09:47 .local
-rw-r--r-- 1 root root 690K Aug 21 10:09 mkns_44k_8s_a4561045.wav
drwx------ 3 root root 4.0K Aug 21 10:31 .nv
drwxr-xr-x 2 root root 4.0K Aug 15 20:56 sample_data
-rw-r--r-- 1 root root 6.8K Aug 21 12:46 writer_test.mp4


In [43]:
# https://stackoverflow.com/questions/684015
!ffprobe -v error -of flat=s=_ -select_streams v:0 -show_entries stream=height,width writer_test.mp4

streams_stream_0_width=1920
streams_stream_0_height=1080


## download
короче Colab не дает скачать large files.
- Нужно либо заливать их на google drive
- либо качать маленькими файлами. 
- Либо заливать куда-то еще / на свой сайт / а потом уже оттуда выкачивать.

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
file_to_download = 'spectrogram.zip'
# files.download(file_to_download)
# Create & upload a file.
uploaded = drive.CreateFile({'title': file_to_download})
uploaded.SetContentFile(file_to_download)
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))