# Preprocessing the mp3

In [5]:
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations
from pydub import AudioSegment
import os
import IPython.display as ipd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Convert mp3 to numpy arrays

In [13]:
mp3_paths = list()
for f_year in os.listdir(".mp3"):
    files = os.listdir(f".mp3/{f_year}")
    for f in files:
        mp3_paths.append( f".mp3/{f_year}/{f}")
len(mp3_paths)

7000

In [14]:
mp3_paths[:5]

['.mp3/2013/5RCD9gtzRdLjxcm0EJfic2.mp3',
 '.mp3/2013/6gUcmw16g0gACKICSTIowi.mp3',
 '.mp3/2013/6RMpR8v0WmlIhChhua2hso.mp3',
 '.mp3/2013/3U4isOIWM3VvDubwSI3y7a.mp3',
 '.mp3/2013/2hYarNjMU7pZVY9QsIJIJ3.mp3']

In [16]:
def load_sound(file, start):
    sound = AudioSegment.from_file(file,format="mp3",start_second=start, duration=5)
    sound = sound.set_frame_rate(16000)
    sound = sound.set_channels(1)
    samples = sound.get_array_of_samples()
    fp_arr = np.array(samples).T.astype(np.float32)
    return fp_arr

In [25]:
# transform mp3 to np.array and save it into the folder .npy/year/file
for i in trange(len(mp3_paths)):
    path = mp3_paths[i]
    [_, year, id] = path.split("/")

    folder = f".npy/{year}"
    os.makedirs(name=folder, exist_ok=True)

    for segment in range(0, 30, 5):
        filename = f"{id[:-4]}_{segment}"
        arr = load_sound(path, segment)
        with open(os.path.join(folder, f"{filename}.npy"), 'wb') as f:
            np.save(f, arr)

  0%|          | 0/7000 [00:00<?, ?it/s]

In [24]:
arr = np.load(".npy/2013/2hYarNjMU7pZVY9QsIJIJ3_0.npy")

ipd.Audio(arr, rate=16000)

## Extract audio features with autoencoder vgg

In [10]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the model.
model = hub.load('https://tfhub.dev/google/vggish/1')

In [13]:
def vgg_encode(arr):
    embeddings = model(arr)
    embeddings.shape.assert_is_compatible_with([None, 128])
    return embeddings

In [14]:
npy_paths = list()
for f_year in os.listdir(".npy"):
    files = os.listdir(f".npy/{f_year}")
    for f in files:
        npy_paths.append( f".npy/{f_year}/{f}")
len(npy_paths)

42000

In [19]:
# encode np.array to 128 tensor and save it into the folder .encoded/year/file
for i in trange(len(npy_paths)):
    path = npy_paths[i]
    arr = np.load(path)
    [_, year, filename] = path.split("/")

    folder = f".encoded/{year}"
    os.makedirs(name=folder, exist_ok=True)
    
    arr = vgg_encode(arr)
    with open(os.path.join(folder, filename[:-4]), 'wb') as f:
        np.save(f, arr)

  0%|          | 0/42000 [00:00<?, ?it/s]

## Create dataset with audio features for classification

In [20]:
encoded_paths = list()
for f_year in os.listdir(".encoded"):
    files = os.listdir(f".encoded/{f_year}")
    for f in files:
        encoded_paths.append( f".encoded/{f_year}/{f}")
len(encoded_paths)

42000

In [26]:
# load each encoded tensor into the final csv file, adding the label of the year
for i in trange(len(encoded_paths)):
    path = encoded_paths[i]
    arr = np.load(path)
    [_, year,_] = path.split("/")
    df = pd.DataFrame(arr)
    df["label"] = year
    df.to_csv('.csv/03_datapoints.csv', mode='a', header=False,index=False)


  0%|          | 0/42000 [00:00<?, ?it/s]

In [40]:
df = pd.read_csv(".csv/03_datapoints.csv", header=None)
df.shape

(209999, 129)

In [41]:
df.columns = [f"dp_{i}" for i in range(128)] + ["label"]
df.head()

Unnamed: 0,dp_0,dp_1,dp_2,dp_3,dp_4,dp_5,dp_6,dp_7,dp_8,dp_9,...,dp_119,dp_120,dp_121,dp_122,dp_123,dp_124,dp_125,dp_126,dp_127,label
0,-0.851101,-0.133359,0.411688,-0.506884,-0.62475,-1.183555,-0.007701,-0.193852,-2.310815,-0.681502,...,-0.171966,-0.198043,-0.392601,-0.921793,-0.181319,0.049851,-0.743607,0.221477,0.06114,2013
1,-0.942043,-0.028752,0.842339,-0.91713,-1.013945,-1.175337,-0.031011,-0.134986,-2.647296,-0.773516,...,-0.491768,-0.181433,-0.631909,-1.186748,-0.269359,-0.129906,-0.844575,0.131751,-0.067637,2013
2,-0.965285,-0.201948,0.425197,-0.839285,-0.732815,-1.359382,0.186973,-0.21492,-2.731281,-0.73837,...,-0.064863,-0.139382,-0.451683,-1.054647,-0.422945,-0.04687,-0.828963,0.277579,-0.099526,2013
3,-1.257392,-0.270322,0.366864,-0.904969,-0.922917,-1.498431,0.406486,-0.042352,-2.874939,-0.704583,...,-0.132417,-0.038785,-0.382238,-1.255367,-0.557412,-0.068375,-0.844525,0.311174,0.154526,2013
4,-1.411341,-0.296022,0.336932,-0.924695,-0.962841,-1.601036,0.384875,-0.075125,-2.976923,-0.78572,...,-0.108609,-0.054045,-0.364247,-1.421688,-0.612453,-0.09307,-0.908785,0.356414,0.239415,2013


In [43]:
# override with headers
df.to_csv(".csv/03_datapoints.csv", index_label=False, index=False)