In [None]:
import os
import re

import numpy as np
from matplotlib import pyplot as plt
from scipy import signal
from scipy.io import wavfile
from pydub import AudioSegment

import simfile
from simfile.notes import NoteData, NoteType
from simfile.timing import Beat, TimingData
from simfile.timing.engine import TimingEngine

In [None]:
# Song folder to analyze.
# NOTE: this is a machine-specific absolute path; point it at a song folder on your install.
test_simfile_dir = r"C:\Games\ITGmania\Songs\In The Groove Rebirth\Cherry Blossom Whirlwind"

# Pick the simfile from the song folder. An .ssc file wins over an .sm file when
# both are present; extensions are matched case-insensitively.
test_audio_path = None
test_simfile_path = None
for f in os.listdir(test_simfile_dir):
    if os.path.splitext(f)[1].lower() in ['.ssc', '.sm']:
        if (test_simfile_path is None) or (os.path.splitext(test_simfile_path)[1].lower() == '.sm'):
            test_simfile_path = os.path.join(test_simfile_dir, f)
if test_simfile_path is None:
    # Fail here with a clear message instead of passing None to simfile.open().
    raise FileNotFoundError(f'No .ssc or .sm simfile found in {test_simfile_dir}')
print(test_simfile_path)

test_simfile = simfile.open(test_simfile_path)
chart = test_simfile.charts[0]

# Use the chart's own music file when it declares one; otherwise (attribute
# missing, None, or empty) fall back to the simfile-level music file.
music_file = getattr(chart, 'music', None) or test_simfile.music
if not music_file:
    raise ValueError(f'No music file is declared in {test_simfile_path}')
test_audio_path = os.path.join(test_simfile_dir, music_file)
print(test_audio_path)

# Timing engine for the first chart; later cells use it to map beats to seconds.
engine = TimingEngine(TimingData(test_simfile, chart))

In [None]:
# Decode the song audio into a float array of shape (frames, channels).
audio_ext = os.path.splitext(test_audio_path)[1]
# Lowercase the extension so e.g. "SONG.MP3" maps to the "mp3" format name;
# with no extension at all, pass None rather than an empty format string.
audio = AudioSegment.from_file(test_audio_path, format=(audio_ext[1:].lower() or None))
audio_data = np.array(audio.get_array_of_samples())

# https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3
# Samples are interleaved by channel. Always reshape to (frames, channels) so that
# mono audio becomes (frames, 1) and column indexing works for any channel count.
audio_data = audio_data.reshape((-1, audio.channels))
# Scale signed integer PCM to roughly [-1, 1) using the real sample width
# (sample_width is in bytes: 2 -> divide by 2**15, matching the 16-bit case).
audio_data = audio_data / 2 ** (8 * audio.sample_width - 1)

In [None]:
print(audio_data.shape)
print(audio.frame_rate)

# https://stackoverflow.com/questions/44787437/how-to-convert-a-wav-file-to-a-spectrogram-in-python3
# https://stackoverflow.com/questions/47954034/plotting-spectrogram-in-audio-analysis
window_ms = 5   # length of each analysis window, in milliseconds
step_ms = 1     # hop between consecutive windows, in milliseconds
eps = 1e-9      # added before the log so that silent bins do not produce -inf

# Convert the window/hop durations to sample counts at the audio's frame rate.
nperseg = int(audio.frame_rate * window_ms * 1e-3)
noverlap = nperseg - int(audio.frame_rate * step_ms * 1e-3)

# Only the first channel is analyzed. Accept a 1-D (mono) array as well as a
# 2-D (frames, channels) array so this cell does not fail on mono input.
first_channel = audio_data if audio_data.ndim == 1 else audio_data[:, 0]
frequencies, times, spectrogram = signal.spectrogram(
    first_channel,
    fs=audio.frame_rate,
    window='hann',
    nperseg=nperseg,
    noverlap=noverlap,
    detrend=False
)

# Log-scale the spectrogram; rows are frequency bins, columns are time steps.
splog = np.log2(spectrogram + eps)
fig = plt.figure(figsize=(30, 6))
plt.pcolormesh(times, frequencies, splog)
plt.title('Spectrogram (log2 scale, first channel)')
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.show()

In [None]:
# Build the "sync fingerprint": for every whole beat, cut a window of the log
# spectrogram centered on the beat time, then
#   * acc    accumulates those windows (frequency x time-around-beat), and
#   * digest keeps one row per beat with the window summed over frequency.
fingerprint_sec = 0.10
actual_step = (nperseg - noverlap) / audio.frame_rate   # seconds per spectrogram column
# Window width in spectrogram columns, forced to an even number.
fingerprint_size = 2 * int(0.5 * round(fingerprint_sec / actual_step))
fingerprint_times = np.arange(-fingerprint_size // 2, fingerprint_size // 2) * actual_step
frequency_emphasis_factor = 3000 # None

# Per-frequency weight f * exp(-f / factor), or 1 when emphasis is disabled.
# It does not depend on the beat, so compute it once; the trailing axis lets it
# broadcast across the time columns of each window.
frequency_weights = 1
if frequency_emphasis_factor is not None:
    frequency_weights = (frequencies * np.exp(-frequencies / frequency_emphasis_factor))[:, np.newaxis]

# Number of time columns actually available in the spectrogram. Window indices
# below are spectrogram columns, so this (not the audio sample count) is the
# correct upper clamp.
n_columns = splog.shape[1]

# Beats whose raw (unweighted) window should be plotted for debugging.
debug_beats = []

b = -1
acc = np.zeros((frequencies.size, fingerprint_size))
digest_rows = []
while True:
    b += 1
    t = engine.time_at(b)
    if (t < 0):
        # Beat falls before the start of the audio.
        continue
    if (t > audio.duration_seconds):
        break

    # Column range of the window around this beat.
    # NOTE(review): this maps time t to column t / actual_step; if the spectrogram's
    # `times` are window centers, column 0 sits about window_ms / 2 after t = 0 --
    # confirm whether that offset should be accounted for here.
    t_s = max(0,         int(t / actual_step - fingerprint_size * 0.5))
    t_f = min(n_columns, int(t / actual_step + fingerprint_size * 0.5))
    if (t_f - t_s != fingerprint_size):
        # Window is clipped by the start or end of the spectrogram; skip this beat.
        continue

    spfilt = splog[:, t_s:t_f] * frequency_weights
    acc += spfilt
    digest_rows.append(np.sum(spfilt, axis=0))

    if b in debug_beats:
        acc_log = splog[:, t_s:t_f]
        fig = plt.figure(figsize=(6, 6))
        plt.pcolormesh(fingerprint_times, frequencies, acc_log)
        plt.ylabel('Frequency [Hz]')
        plt.xlabel('Time [sec]')
        plt.show()

# Stack once at the end instead of re-allocating the array on every beat.
digest = np.vstack(digest_rows) if digest_rows else np.zeros((0, fingerprint_size))
    


In [None]:
# Show the two fingerprint views on the same time axis:
#   1. the accumulated window (frequency vs. time around the beat), and
#   2. the per-beat digest (beat index vs. time around the beat).
for y_values, mesh_values, y_label in (
    (frequencies, acc, 'Frequency [Hz]'),
    (np.arange(digest.shape[0]), digest, 'Beat Index'),
):
    fig = plt.figure(figsize=(6, 6))
    plt.pcolormesh(fingerprint_times, y_values, mesh_values)
    plt.xlabel('Time [sec]')
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Estimate the sync bias: filter the accumulated fingerprint along the time axis,
# sum the response over frequency, and take the time of the strongest response.

# Which feature of the note attack to lock onto.
use_leading_edge = True
if use_leading_edge:
    # Leading edge of attack. convolve2d flips the kernel, so this row scores
    # "later minus earlier" energy, i.e. it peaks where the energy is rising.
    time_edge_row = [1, 3, 10, 0, -10, -3, -1]
    time_edge_offset = 0.002
else:
    # Loudest point of attack
    time_edge_row = [1, 3, 10, 3, 1]
    time_edge_offset = 0.000
# The same time profile is repeated over 5 frequency rows.
time_edge_kernel = np.tile(time_edge_row, (5, 1))

# Alternative input for experimentation: convolve `digest` instead of `acc`.
acc_edge = signal.convolve2d(acc, time_edge_kernel, mode='same', boundary='wrap')
acc_edge_sum = np.sum(acc_edge, axis=0)

# Time (relative to the beat) of the strongest response, plus the kernel's offset.
sync_bias = fingerprint_times[np.argmax(acc_edge_sum)] + time_edge_offset
# Classify by whichever reference value the estimate is closer to: 9 ms or 0.
if abs(sync_bias - 0.009) < abs(sync_bias):
    probable_bias = '+9ms'
else:
    probable_bias = 'null'

print(f'Sync bias: {sync_bias:0.3f}')

# Accumulated fingerprint with the derived bias marked in red.
fig = plt.figure(figsize=(6, 6))
plt.pcolormesh(fingerprint_times, frequencies, acc)
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.plot(np.ones(np.shape(frequencies)) * sync_bias, frequencies, 'r-')
plt.show()

# Per-beat digest with the derived bias marked in red.
fig = plt.figure(figsize=(6, 6))
plt.pcolormesh(fingerprint_times, np.arange(digest.shape[0]), digest)
plt.ylabel('Beat Index')
plt.xlabel('Time [sec]')
plt.plot(np.ones(np.shape(digest)[0]) * sync_bias, np.arange(digest.shape[0]), 'r-')
plt.title(f'Sync fingerprint for {test_simfile.artist} - "{test_simfile.title}"\nDerived sync bias: {sync_bias:0.3f} (probably {probable_bias})')
plt.show()

# Filter response summed over frequency; its maximum is what selects the bias.
plt.plot(fingerprint_times, acc_edge_sum)
plt.ylabel('Edge response (summed over frequency)')
plt.xlabel('Time [sec]')
plt.show()