In [None]:
import subprocess
import os
import re
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil
from pathlib import Path

import librosa
import librosa.display

In [None]:
# Load the song metadata table. The 'utf-8-sig' codec drops a leading BOM if the CSV has one.
# Later cells read the 'artist', 'title', 'match_status' and 'fileName' columns of this frame
# and write the extracted audio features back into it.
df = pd.read_csv("***.csv", encoding='utf-8-sig')
df.head()

Sort WAV files into folders "0" (mismatch) and "1" (match) by match_status

In [None]:
# Move every "<artist> - <title>.wav" found directly in temp_wavs/ into
# temp_wavs/1 (match_status == "match") or temp_wavs/0 (match_status == "mismatch").
src_dir = 'temp_wavs'
label_by_status = {"match": "1", "mismatch": "0"}

for label in ["0", "1"]:
    os.makedirs(os.path.join(src_dir, label), exist_ok=True)

for _, row in df.iterrows():
    filename = f"{row['artist']} - {row['title']}.wav"
    src_path = os.path.join(src_dir, filename)
    if not os.path.exists(src_path):
        # A file that already sits in a label folder was sorted by an earlier run of
        # this cell; only warn when it cannot be found anywhere.
        already_sorted = any(
            os.path.exists(os.path.join(src_dir, lbl, filename)) for lbl in ["0", "1"]
        )
        if not already_sorted:
            print(f"[경고] {filename} 파일 없음")
        continue

    match_status = row["match_status"]
    if match_status not in label_by_status:
        # Missing or unexpected statuses are reported instead of being silently
        # filed under "0"; the later re-sorting cell applies the same rule.
        print(f"[경고] {match_status} 해당 없음")
        continue

    label = label_by_status[match_status]
    dst_path = os.path.join(src_dir, label, filename)
    shutil.move(src_path, dst_path)

In [None]:
# Collect the sorted WAV files per label folder; the analysis cells below iterate these lists.
# Sorting gives a stable processing order across runs.
mismatch = Path('temp_wavs/0')  # folder holding the files labelled "0"
mis_wav_paths = sorted(mismatch.glob('*.wav'))
match = Path('temp_wavs/1')  # folder holding the files labelled "1"
mat_wav_paths = sorted(match.glob('*.wav'))

Re-sort WAV files from folders "0", "1", "2", "3" (emotion_pair split) into "0"/"1" by match_status

In [None]:
src_dir = "temp_wavs"

# NOTE(review): the notebook text around this cell mentions emotion_pair, but the code
# only reads match_status and moves files into "0"/"1" - confirm which is intended.

# Visit the .wav files in temp_wavs/ itself and in its sub-folders 0-3.
search_dirs = [src_dir] + [os.path.join(src_dir, str(i)) for i in range(4)]

# The destination folders must exist before shutil.move can place files in them.
for dst_label in ["0", "1"]:
    os.makedirs(os.path.join(src_dir, dst_label), exist_ok=True)

# Build the filename -> match_status lookup once instead of scanning the whole
# frame for every file. setdefault keeps the first row when names are duplicated.
status_by_fname = {}
for artist, title, status in zip(df["artist"], df["title"], df["match_status"]):
    status_by_fname.setdefault(f"{artist} - {title}.wav", status)

for folder in search_dirs:
    if not os.path.isdir(folder):
        # Folders such as temp_wavs/2 or temp_wavs/3 may never have been created.
        continue

    for fname in os.listdir(folder):
        if not fname.endswith(".wav"):
            continue

        if fname not in status_by_fname:
            print(f"[경고] {fname} 파일 없음")
            continue

        match_status = status_by_fname[fname]
        if match_status not in ["match", "mismatch"]:
            print(f"[경고] {match_status} 해당 없음")
            continue

        filepath = os.path.join(folder, fname)
        dst_label = "1" if match_status == "match" else "0"
        dst_path = os.path.join(src_dir, dst_label, fname)
        if os.path.abspath(filepath) == os.path.abspath(dst_path):
            # Already in the right folder; nothing to move.
            continue
        shutil.move(filepath, dst_path)

Analysis

- Tempo

In [None]:
def get_dominant_tempo_variance(audio_path):
    """Return the variance of the log2 dominant tempo across tempogram frames.

    For every frame the strongest tempogram bin (bin 0 excluded) is mapped to
    its BPM value; non-finite values are dropped, the rest are log2-transformed
    and their variance is returned. Returns 0.0 when no usable frame remains.
    """
    hop_length = 512
    y, sr = librosa.load(audio_path)

    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    # tempogram is indexed as (tempo bin, frame)
    tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)

    # BPM value associated with each tempogram bin index
    bpm_per_bin = librosa.tempo_frequencies(tempogram.shape[0], sr=sr, hop_length=hop_length)

    # Bin 0 is left out of the argmax (presumably the zero-lag bin - confirm against
    # librosa.tempo_frequencies); the +1 restores the original bin numbering.
    strongest_bin = 1 + np.argmax(tempogram[1:], axis=0)
    frame_bpm = bpm_per_bin[strongest_bin]

    # Keep only finite tempi (neither NaN nor +/-inf).
    frame_bpm = frame_bpm[np.isfinite(frame_bpm)]
    if len(frame_bpm) == 0:
        return 0.0

    # The small offset keeps log2 defined for a zero tempo.
    return np.var(np.log2(frame_bpm + 1e-6))

In [None]:
# Compute the dominant-tempo variance of every sorted WAV ("match" files first, then
# "mismatch") and store it on the df row(s) whose fileName equals the file's base name.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        dominant_tempo_variance = get_dominant_tempo_variance(str(path))
        row_mask = df['fileName'] == filename
        df.loc[row_mask, 'dominant_tempo_variance'] = dominant_tempo_variance

In [None]:
def get_onset_tempogram_tempo(audio_path):
    """Load an audio file and return the tempogram of its onset-strength envelope.

    The returned array is indexed as (tempo bin, time frame); a hop length of
    512 samples is used for both the onset envelope and the tempogram.
    """
    hop_length = 512
    y, sr = librosa.load(audio_path)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    return librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)

def get_active_bpm_variance(tempogram, threshold=0.5):
    """Summarise how the set of active tempo bins changes over time.

    A bin is "active" in a frame when its tempogram value is >= ``threshold``.

    Args:
        tempogram: 2-D array indexed as (tempo bin, time frame).
        threshold: Minimum tempogram value for a bin to count as active.

    Returns:
        (tempo_transition_complexity, active_bpm_variance):
        - tempo_transition_complexity: number of bins that switch between active
          and inactive from one frame to the next, summed over all consecutive
          frame pairs and divided by the number of frames.
        - active_bpm_variance: variance of the per-frame count of active bins.
        Both are 0.0 when the tempogram has no frames.
    """
    active = np.asarray(tempogram) >= threshold
    n_frames = active.shape[1]
    if n_frames == 0:
        # Nothing to measure; avoids dividing by zero below.
        return 0.0, 0.0

    active_bpm_variance = np.var(active.sum(axis=0))

    # A bin contributes one change whenever its active flag differs between two
    # consecutive frames - the size of the symmetric difference of the two
    # active-bin sets, computed for all frame pairs at once.
    change_counts = int(np.count_nonzero(active[:, 1:] != active[:, :-1]))
    tempo_transition_complexity = change_counts / n_frames

    return tempo_transition_complexity, active_bpm_variance

In [None]:
from tqdm.auto import tqdm
# For every sorted WAV, derive the tempo-transition complexity and the variance of the
# active-bin count from its tempogram and store both on the matching df row(s).
# The previous code called get_dominant_tempo, which is not defined in this notebook;
# get_active_bpm_variance is the function that returns this pair of values.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        tempogram = get_onset_tempogram_tempo(str(path))
        tempo_transition_complexity, active_bpm_variance = get_active_bpm_variance(tempogram)
        row_mask = df['fileName'] == filename
        df.loc[row_mask, 'tempo_transition_complexity'] = tempo_transition_complexity
        df.loc[row_mask, 'active_bpm_variance'] = active_bpm_variance

- RMS

In [None]:
def get_rms_variance(audio_path):
    """Return the variance of the frame-wise RMS energy of an audio file.

    NaN and infinite RMS values are discarded first; 0.0 is returned when
    nothing is left.
    """
    y, _ = librosa.load(audio_path)

    # librosa returns the RMS values as a 2-D array; flatten to one value per frame.
    frame_rms = librosa.feature.rms(y=y).flatten()
    frame_rms = frame_rms[np.isfinite(frame_rms)]

    return np.var(frame_rms) if len(frame_rms) > 0 else 0.0

In [None]:
# Compute the RMS variance of every sorted WAV ("match" files first, then "mismatch")
# and store it on the df row(s) whose fileName equals the file's base name.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        rms_variance = get_rms_variance(str(path))
        row_mask = df['fileName'] == filename
        df.loc[row_mask, 'rms_variance'] = rms_variance

- Pitch

In [None]:
def get_pitch_range(audio_path):
    """Estimate the highest pitch, lowest pitch and pitch range of an audio file.

    The fundamental frequency is tracked with pYIN between C2 and C7. NaN
    entries of the track are dropped and values outside the 1.5 * IQR fences
    are removed as outliers before the extremes are taken.

    Returns:
        (pitch_high, pitch_low, pitch_range), where
        pitch_range = pitch_high - pitch_low. All three are NaN when the f0
        track has no non-NaN frame, so one such file no longer aborts a batch
        run with an exception.
    """
    y, sr = librosa.load(audio_path)

    f0, _, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr
    )

    # Drop NaN frames.
    f0 = f0[~np.isnan(f0)]
    if f0.size == 0:
        # Percentiles and max/min are undefined on an empty track.
        return np.nan, np.nan, np.nan

    # IQR-based outlier removal.
    q1, q3 = np.percentile(f0, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    f0_filtered = f0[(f0 >= lower_bound) & (f0 <= upper_bound)]

    pitch_high = np.max(f0_filtered)
    pitch_low = np.min(f0_filtered)

    return pitch_high, pitch_low, pitch_high - pitch_low

In [None]:
# Compute the pitch statistics of every sorted WAV ("match" files first, then
# "mismatch") and store them on the df row(s) whose fileName equals the file's base name.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        pitch_high, pitch_low, pitch_range = get_pitch_range(str(path))
        row_mask = df['fileName'] == filename
        df.loc[row_mask, 'pitch_high'] = pitch_high
        df.loc[row_mask, 'pitch_low'] = pitch_low
        df.loc[row_mask, 'pitch_range'] = pitch_range

- Repetition

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_repetition_score(audio_path, threshold=0.8, hop_length=512):
    """Measure how often chroma frames of a track resemble each other.

    A self-similarity matrix is built from the cosine similarity between all
    pairs of chroma frames. Only pairs above the diagonal (each unordered pair
    once, no self-pairs) are considered.

    Returns:
        (repeated_pairs, repetition_ratio): the number of frame pairs whose
        similarity is >= ``threshold`` and that number divided by the total
        number of pairs (0 when there are no pairs).
    """
    y, sr = librosa.load(audio_path)

    # Chroma features, indexed as (pitch class, frame).
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)

    # Frame-by-frame cosine similarity, indexed as (frame, frame).
    ssm = cosine_similarity(chroma.T)
    n_frames = ssm.shape[0]

    # Similarities strictly above the main diagonal.
    upper_pairs = ssm[np.triu_indices(n_frames, k=1)]
    repeated_pairs = np.sum(upper_pairs >= threshold)

    if upper_pairs.size > 0:
        repetition_ratio = repeated_pairs / upper_pairs.size
    else:
        repetition_ratio = 0

    return repeated_pairs, repetition_ratio

In [None]:
# Compute the repetition score of every WAV in the label folders and store it on the
# df row(s) whose fileName (with any ".wav" removed) equals the file's base name.

# regex=False makes ".wav" a literal: when the pattern is treated as a regular
# expression (the default before pandas 2.0) the "." matches any character and
# mangles names such as "Soundwave". The normalised names do not change inside
# the loop, so they are computed once.
normalized_names = df["fileName"].str.replace(".wav", "", regex=False).str.strip()

for label in ["0", "1", "2", "3"]:
    dir_path = Path("temp_wavs") / label
    for path in tqdm(list(dir_path.glob("*.wav")), desc=f"Label {label}"):
        # Best effort: a failure on one file is reported and the batch continues.
        try:
            count, ratio = get_repetition_score(str(path))
            filename = path.name.replace(".wav", "").strip()
            row_idx = df[normalized_names == filename].index

            if len(row_idx) > 0:
                df.loc[row_idx, "repeat_area_size_new"] = count
                df.loc[row_idx, "repetition_ratio_new"] = ratio
            else:
                print(f"[경고] 실패: {filename}")
        except Exception as e:
            print(f"[오류] {path.name}: {e}")

- MFCC

In [None]:
def get_mfcc_summary(audio_path):
    """Return the per-coefficient mean and standard deviation of 13 MFCCs.

    Both returned arrays hold one value per MFCC coefficient, aggregated over
    all frames of the file.
    """
    y, sr = librosa.load(audio_path)
    coeffs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # axis=1 aggregates over frames, leaving one statistic per coefficient.
    return np.mean(coeffs, axis=1), np.std(coeffs, axis=1)

In [None]:
# Compute the MFCC summary of every sorted WAV ("match" files first, then "mismatch").
# Values are written only when exactly one df row has the file's base name as fileName.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        mfcc_mean, mfcc_std = get_mfcc_summary(str(path))
        row_idx = df.index[df['fileName'] == filename]
        if len(row_idx) != 1:
            continue
        target_row = row_idx[0]
        for i, (coef_mean, coef_std) in enumerate(zip(mfcc_mean, mfcc_std)):
            df.at[target_row, f'mfcc_{i}_mean'] = coef_mean
            df.at[target_row, f'mfcc_{i}_std'] = coef_std

- Centroid

In [None]:
def get_spectral_centroid(audio_path):
    """Return the mean and standard deviation of the frame-wise spectral centroid."""
    y, sr = librosa.load(audio_path)
    # Row 0 of the returned array holds the centroid of each frame.
    per_frame = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    return np.mean(per_frame), np.std(per_frame)

In [None]:
# Compute the spectral-centroid statistics of every sorted WAV ("match" files first,
# then "mismatch") and store them on the df row(s) whose fileName equals the base name.
# One loop serves both labels; the unused row_idx lookup of the former "match" loop is gone.
for desc, wav_paths in (("match", mat_wav_paths), ("mismatch", mis_wav_paths)):
    for path in tqdm(wav_paths, desc=desc):
        filename = path.name.replace(".wav", "").strip()
        centroid_mean, centroid_std = get_spectral_centroid(str(path))
        row_mask = df['fileName'] == filename
        df.loc[row_mask, 'centroid_mean'] = centroid_mean
        df.loc[row_mask, 'centroid_std'] = centroid_std