In [2]:
%load_ext cuml.accel
%load_ext cudf.pandas

cuML: Accelerator installed.


In [3]:
import rmm
# Configure the RAPIDS memory manager once: a pooled allocator layered on CUDA managed
# (unified) memory. Configuring the pool and the managed-memory backing in a single call
# matters: installing a separate ManagedMemoryResource afterwards would replace the pool
# resource and leave the 20 GiB pool unused.
rmm.reinitialize(
    pool_allocator=True,
    managed_memory=True,          # back the pool with managed memory
    initial_pool_size=20 << 30,   # 20 GiB pool (tweak as you like)
)

In [4]:
import logging
import os
import pickle
import unicodedata
import warnings

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
# Silence every FutureWarning for the rest of the session so that deprecation notices do not
# clutter cell output. NOTE(review): the filter is process-wide, so it also hides advance
# notice of API changes in the imported libraries — revisit when upgrading dependencies.
warnings.simplefilter(action='ignore', category=FutureWarning)

Now that we have our imports, we can load the data. We un-pickle the saved songs file (`local_Songs.pkl`) into a list of song objects, convert that list into a DataFrame with one row per song, and display the result.

In [5]:
# Read the pickled song objects back from disk; `Songs` starts out as an empty list.
Songs = []
songs_pickle_path = "/home/tyler/Downloads/kpop_project/create_model/local_Songs.pkl"
with open(songs_pickle_path, 'rb') as pkl_file:
    Songs = pickle.load(pkl_file)

# Each song object's attribute dict (vars) becomes one DataFrame record.
data = list(map(vars, Songs))
df = pd.DataFrame(data)

# Last expression of the cell: show the frame.
df

Unnamed: 0,path,company,generation,artist,name
0,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,JYP,1,g.o.d,0%.mp3
1,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,JYP,1,g.o.d,134-14.mp3
2,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,JYP,1,g.o.d,20.mp3
3,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,JYP,1,g.o.d,21C 우리의 희망.mp3
4,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,JYP,1,g.o.d,5+4+1+5=15.mp3
...,...,...,...,...,...
5522,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,YG,5,BabyMonster,SHEESH.mp3
5523,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,YG,5,BabyMonster,Stuck In The Middle (7 ver.).mp3
5524,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,YG,5,BabyMonster,Stuck In The Middle (Remix).mp3
5525,/home/tyler/gdrive/K-pop Project 2024-5/K-pop ...,YG,5,BabyMonster,Stuck In The Middle.mp3


In [6]:
# Logging is what lets a rerun skip work that is already done and lets us find failures.
# Every record goes to two places: a log file on disk and a StreamHandler on its default stream.
log_file_handler = logging.FileHandler("/home/tyler/Downloads/kpop_project/logs/new_process_librosa.log")
log_stream_handler = logging.StreamHandler()

logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO,
    handlers=[log_file_handler, log_stream_handler],
)

In [7]:
def process_librosa(audio, sr):
    """Summarise an audio signal as one flat feature vector.

    Nine librosa feature matrices are computed from the signal, each one is averaged
    over its second axis (after the transpose, axis 0), and the averages are joined
    end to end.

    Args:
        audio: Audio samples, passed to every librosa extractor as ``y``.
        sr: Sample rate, passed to the extractors that accept it.

    Returns:
        A 1-D numpy array laid out as: MFCC (13 coefficients), chroma, spectral
        contrast, zero-crossing rate, RMS energy, spectral centroid, spectral
        bandwidth, spectral flatness, tonnetz.
        NOTE(review): with librosa's default settings this is presumably 43 values
        in total — confirm against the installed librosa version.
    """
    feat = librosa.feature

    def _time_average(frames):
        # Transpose, then average down axis 0: one mean per feature row.
        return np.mean(frames.T, axis=0)

    # The order of this list fixes the layout of the returned vector — do not reorder.
    # zero_crossing_rate, rms and spectral_flatness are called without a sample rate.
    summaries = [
        _time_average(feat.mfcc(y=audio, sr=sr, n_mfcc=13)),
        _time_average(feat.chroma_stft(y=audio, sr=sr)),
        _time_average(feat.spectral_contrast(y=audio, sr=sr)),
        _time_average(feat.zero_crossing_rate(y=audio)),
        _time_average(feat.rms(y=audio)),
        _time_average(feat.spectral_centroid(y=audio, sr=sr)),
        _time_average(feat.spectral_bandwidth(y=audio, sr=sr)),
        _time_average(feat.spectral_flatness(y=audio)),
        _time_average(feat.tonnetz(y=audio, sr=sr)),
    ]
    return np.concatenate(summaries)

In [8]:
# # split songs into clips
# clip_duration = 30
# overlap = 10
# x_out = "/home/tyler/gdrive/K-pop Project 2024-5/K-pop Project/ML_inputs/features.pkl" 
# X = pickle.load(open(x_out, 'rb'))
# # X = []
# labels_out = "/home/tyler/gdrive/K-pop Project 2024-5/K-pop Project/ML_inputs/labels.pkl"
# labels = pickle.load(open(labels_out, 'rb'))
# # labels = []
# seen_out = "/home/tyler/gdrive/K-pop Project 2024-5/K-pop Project/ML_inputs/seen.pkl"
# seen = pickle.load(open(seen_out, 'rb'))
# # seen = set()
# completed = 0
# for row in df.itertuples():
#     if row.path in seen:
#         logging.info(f"Skipping {row.name} - {row.artist}, already processed.")
#         completed += 1
#     else:
#         # creates 30-second clips that overlap by 10 seconds (a new clip starts every 20 seconds); a trailing partial clip shorter than 30 seconds is dropped, so just under 20 seconds of audio at the end can go unused
#         y, sr = librosa.load(unicodedata.normalize('NFC', row.path))
#         clip_length = clip_duration * sr
#         hop_length = int(clip_length *(1 - (float(overlap) / clip_duration)))
#         end_spot = int(len(y) - clip_length + 1)
#         for start_sample in range(0, end_spot, hop_length):
#             clip = y[start_sample:start_sample + clip_length]
#             features = process_librosa(clip, sr)
#             # Append features and corresponding label
#             X.append(features)  # Append to list
#             labels.append(int(row.generation))
#         seen.add(row.path)
#         # keyboard interrupts will only occur before serializing processed data or after, never in the middle 
#         try:
#             pickle.dump(X, open(x_out, 'wb'))
#             pickle.dump(labels, open(labels_out, 'wb'))
#             pickle.dump(seen, open(seen_out, 'wb'))
#             completed += 1
#             logging.info(f"Processed and saved features for {row.name} - {row.artist}. {df.shape[0] - completed} songs remaining.")
#         except KeyboardInterrupt:
#             # Ignore Ctrl+C during vital section
#             print("KeyboardInterrupt ignored in vital section")   


In [12]:
def company_to_int(company: str) -> int:
    """Map a company name to its integer class label.

    Args:
        company: One of 'JYP', 'SM' or 'YG' (exact, case-sensitive match).

    Returns:
        7 for 'JYP', 9 for 'SM', 11 for 'YG'.

    Raises:
        ValueError: If ``company`` is not one of the three known names. Unknown,
            misspelled or missing values are rejected rather than being silently
            labelled as YG, which would corrupt the training labels.
    """
    # NOTE(review): the codes 7 / 9 / 11 are kept unchanged; why these particular
    # values were chosen is not visible here.
    codes = {'JYP': 7, 'SM': 9, 'YG': 11}
    try:
        return codes[company]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs such as a list.
        raise ValueError(
            f"Unknown company {company!r}; expected one of {sorted(codes)}"
        ) from None

In [10]:
# Re-point every song path from the old gdrive location to the music_files directory under
# ~/Downloads. regex=False makes the literal-substring match explicit: both arguments are
# plain paths, and the default for `regex` is not the same across pandas versions/backends.
df['path'] = df['path'].str.replace(
    '/home/tyler/gdrive/K-pop Project 2024-5/K-pop Project/music_files',
    '/home/tyler/Downloads/kpop_project/music_files',
    regex=False,
)

In [13]:
# Create the company labels for the features: one label per 30-second clip of every song,
# appended in DataFrame row order.

# Clip geometry, in seconds.
clip_duration = 30
overlap = 10

company_labels_out = "/home/tyler/Downloads/kpop_project/ML_inputs/company_labels.pkl"
seen_out = "/home/tyler/Downloads/kpop_project/ML_inputs/company_label_creation_seen.pkl"

# Every run starts from scratch. To resume an interrupted run instead, un-pickle
# company_labels_out into company_labels and seen_out into seen before the loop.
company_labels = []
seen = set()
completed = 0


def _dump_atomic(obj, path):
    """Pickle obj to path so that path always holds a complete file (old or new)."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'wb') as f:  # the handle is closed even if the dump is interrupted
        pickle.dump(obj, f)
    # Swap the finished file into place in one step; a half-written pickle never
    # appears under the real name.
    os.replace(tmp_path, path)


def _save_checkpoint():
    """Persist the labels collected so far and the set of finished song paths."""
    _dump_atomic(company_labels, company_labels_out)
    _dump_atomic(seen, seen_out)


for row in df.itertuples():
    if row.path in seen:
        logging.info(f"Skipping {row.name} - {row.artist}, already processed company label.")
        completed += 1
        continue

    # Songs are cut into 30-second clips that overlap by 10 seconds (a new clip starts every
    # 20 seconds). A trailing partial clip shorter than 30 seconds is dropped, so just under
    # 20 seconds of audio at the end can go unused, and a song shorter than 30 seconds
    # yields no clips at all.
    y, sr = librosa.load(unicodedata.normalize('NFC', row.path))
    clip_length = clip_duration * sr
    # Same hop formula as the (commented-out) feature-extraction cell, kept unchanged so the
    # number of labels per song matches the number of clips produced there.
    hop_length = int(clip_length *(1 - (float(overlap) / clip_duration)))
    end_spot = int(len(y) - clip_length + 1)
    n_clips = len(range(0, end_spot, hop_length))
    company_labels.extend([company_to_int(row.company)] * n_clips)
    seen.add(row.path)

    # Ctrl+C is deliberately not allowed to abort the save step.
    try:
        logging.info(f"Saving company labels for {row.name} - {row.artist}.")
        _save_checkpoint()
    except KeyboardInterrupt:
        # The interrupted attempt cannot have damaged the files on disk (atomic replace),
        # but the two files may be out of step with each other — write both again.
        print("KeyboardInterrupt ignored in vital section")
        _save_checkpoint()
    completed += 1
    logging.info(f"Processed and saved company labels for {row.name} - {row.artist}. {df.shape[0] - completed} songs remaining.")


2025-06-27 15:35:24,933 — INFO — Saving company labels for 0%.mp3 - g.o.d.
2025-06-27 15:35:24,946 — INFO — Processed and saved company labels for 0%.mp3 - g.o.d. 5526 songs remaining.
2025-06-27 15:35:25,249 — INFO — Saving company labels for 134-14.mp3 - g.o.d.
2025-06-27 15:35:25,250 — INFO — Processed and saved company labels for 134-14.mp3 - g.o.d. 5525 songs remaining.
2025-06-27 15:35:25,420 — INFO — Saving company labels for 20.mp3 - g.o.d.
2025-06-27 15:35:25,422 — INFO — Processed and saved company labels for 20.mp3 - g.o.d. 5524 songs remaining.
2025-06-27 15:35:25,781 — INFO — Saving company labels for 21C 우리의 희망.mp3 - g.o.d.
2025-06-27 15:35:25,783 — INFO — Processed and saved company labels for 21C 우리의 희망.mp3 - g.o.d. 5523 songs remaining.
2025-06-27 15:35:26,218 — INFO — Saving company labels for 5+4+1+5=15.mp3 - g.o.d.
2025-06-27 15:35:26,220 — INFO — Processed and saved company labels for 5+4+1+5=15.mp3 - g.o.d. 5522 songs remaining.
2025-06-27 15:35:26,402 — INFO — Sa