<a href="https://colab.research.google.com/github/tuomaseerola/emr/blob/main/Ch08_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example feature extraction
Grab audio files and extract interesting features from them.

EMR book draft.

In [None]:
import numpy as np
import librosa
import librosa.display
from tqdm import tqdm
import pandas as pd
import os
import csv
import sys
import warnings
# Silence every Python warning for the rest of the session to keep the cell
# outputs uncluttered. This also hides deprecation and decoding warnings, so
# comment the line out when debugging unexpected results.
warnings.filterwarnings('ignore')


## Get files straight from OSF
Get all 110 audio files from the OSF project https://osf.io/yn7vg, which are stored in the file `Set2.zip`.

In [None]:
# Fetch Set 2 mp3 files (110)
# Install the osfclient package, which provides the `osf` command used below.
!pip install osfclient
# Download Set2.zip from OSF project `p6vkg` into the current working directory.
# NOTE(review): the text above links to osf.io/yn7vg while this command targets
# project p6vkg -- confirm which project ID is the intended one.
!osf -p p6vkg fetch Set2.zip

In [None]:
# Unzip the archive: -o overwrites existing files without prompting (so the
# cell can be re-run), -q suppresses the per-file listing.
# NOTE(review): the extraction cell reads from a folder named "set2" --
# presumably what this archive unpacks to; confirm the folder name and its case.
!unzip -oq Set2.zip

## Prepare output file

In [None]:
# Prepare the extraction by (re)creating the output CSV with only its header row.

SR = 22050   # target sampling rate handed to librosa.load in the extraction cell
N_MFCC = 20  # number of MFCC columns; must equal the row count of the array
             # returned by librosa.feature.mfcc (its default n_mfcc is 20)

# Column layout: file name, five frame-averaged descriptors, then MFCC1..MFCC20.
header = ["File", "RMS", "sp_centr", "spec_bw", "spec_rolloff", "spec_zcr"] + [
    f"MFCC{i}" for i in range(1, N_MFCC + 1)
]

# Mode "w" truncates any earlier results, so running this cell resets the file.
# newline="" is what the csv module expects; without it Windows writes \r\r\n.
# The context manager closes the file even if writing the header fails.
with open("dataset_feature.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()

## Extract features

In [None]:
import natsort  # imported here so this cell keeps working when run on its own

sourcefolder = "set2"

# Natural sort keeps numbered files in numeric order (e.g. "2.mp3" before "10.mp3").
dirlist = os.listdir(sourcefolder)
dirlists = natsort.natsorted(dirlist, reverse=False)

# Append one row per mp3 file to the CSV. The header row is written by the
# "Prepare output file" cell, which also truncates the file: re-run that cell
# before re-running this one, otherwise the rows are appended a second time.
with open('dataset_feature.csv', 'a', newline='') as file:
    writer2 = csv.writer(file)
    for fn in tqdm(dirlists):
        # Skip anything that is not an mp3 (case-insensitive, dot required).
        if not fn.lower().endswith(".mp3"):
            continue

        # Decode to mono at the shared sampling rate SR.
        y, sr = librosa.load(os.path.join(sourcefolder, fn), sr=SR, mono=True)

        # Frame-wise descriptors; each is averaged over all frames below.
        rmse = librosa.feature.rms(y=y)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)

        # Build the row as a list of fields, in the same order as the header.
        # The previous join-then-split approach broke file names containing
        # whitespace into several columns and misaligned the whole row.
        row = [
            fn,
            np.mean(rmse),
            np.mean(spec_cent),
            np.mean(spec_bw),
            np.mean(rolloff),
            np.mean(zcr),
        ]
        # One mean per MFCC coefficient (one row of `mfcc` per coefficient).
        row.extend(np.mean(coef) for coef in mfcc)

        writer2.writerow(row)



In [None]:
# Load the feature table written by the extraction loop back from disk
# and preview its first five rows.
d = pd.read_csv(filepath_or_buffer='dataset_feature.csv')
d.head(n=5)

In [None]:
# Box plot of the per-file mean RMS values (the `RMS` column of the feature
# table). DataFrame.boxplot returns the matplotlib Axes by default, which is
# used to label the figure so it can be read on its own; the trailing ";"
# suppresses the textual repr of the return value in the cell output.
ax = d.boxplot(column='RMS')
ax.set(title="Mean RMS per audio file", ylabel="Mean RMS");