In [1]:
import torchaudio
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report

In [2]:
# Open the Speech Commands dataset rooted at the current directory, asking
# torchaudio to download it (the recorded run fetched a 2.26 GB archive).
dataset = torchaudio.datasets.SPEECHCOMMANDS(".", download=True)

100%|██████████| 2.26G/2.26G [01:16<00:00, 31.7MB/s]


In [3]:
# Inspect which class the loaded dataset object is.
type(dataset)

In [4]:
# Number of items in the full dataset (105829 in the recorded run).
len(dataset)

105829

In [5]:
from torch.utils.data import Subset

In [6]:
# Keep only the first 5000 dataset indices so feature extraction stays fast.
# NOTE(review): the recorded predictions and classification report contain only
# 'backward', 'bed' and 'bird', so this contiguous slice presumably covers just
# those three labels -- confirm that is intended, or sample indices randomly.
subset_indices = list(range(5000))

In [7]:
# Sanity check: 5000 indices were selected.
len(subset_indices)

5000

In [8]:
# View of the dataset restricted to the selected indices.
subset = Subset(dataset, subset_indices)

In [9]:
# The subset should expose exactly as many items as indices were selected.
len(subset)

5000

In [10]:
# Peek at the first item: a tuple whose first three fields are unpacked later
# in this notebook as (waveform, sample_rate, label).
# NOTE(review): the two trailing fields look like a speaker id and an utterance
# index -- confirm against the torchaudio SPEECHCOMMANDS documentation.
subset[0]

(tensor([[-0.0658, -0.0709, -0.0753,  ..., -0.0700, -0.0731, -0.0704]]),
 16000,
 'backward',
 '0165e0e8',
 0)

In [12]:
from IPython.display import Audio

In [13]:
# Play one sample clip. The path is relative to the directory the dataset was
# downloaded into ("."), so it does not depend on Colab's /content directory.
Audio('SpeechCommands/speech_commands_v0.02/bird/00970ce1_nohash_0.wav')

In [14]:
# MFCC transforms keyed by (sample_rate, n_mfcc). Building the transform is
# loop-invariant work, so each configuration is constructed only once instead
# of once per clip.
_MFCC_TRANSFORMS = {}


def extract_features(waveform, sample_rate, n_mfcc=13):
    """Summarise a waveform as its time-averaged MFCC vector.

    Args:
        waveform: Audio tensor whose last dimension is time, e.g. the
            ``(1, num_samples)`` mono clips yielded by the dataset.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A NumPy array holding the mean of every coefficient over time. For
        single-channel input the channel axis is dropped, giving shape
        ``(n_mfcc,)`` (also when ``n_mfcc == 1``); multi-channel input keeps
        its leading axis, giving ``(channels, n_mfcc)``.
    """
    cache_key = (sample_rate, n_mfcc)
    mfcc_transform = _MFCC_TRANSFORMS.get(cache_key)
    if mfcc_transform is None:
        mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
        _MFCC_TRANSFORMS[cache_key] = mfcc_transform

    mfcc = mfcc_transform(waveform)
    # Average over the trailing (time) axis rather than a hard-coded dim=2 so
    # the reduction does not depend on the number of leading dimensions.
    features = mfcc.mean(dim=-1)
    # Drop only a singleton channel axis; a bare squeeze() would also collapse
    # the coefficient axis when n_mfcc == 1 and return a 0-d array.
    if features.dim() == 2 and features.shape[0] == 1:
        features = features.squeeze(0)
    return features.numpy()

In [15]:
# Build the feature rows and the label list one clip at a time; only the first
# three fields of each item (waveform, sample rate, label) are needed.
X, y = [], []
for idx in range(len(subset)):
    waveform, sample_rate, label = subset[idx][:3]
    X.append(extract_features(waveform, sample_rate))
    y.append(label)



In [16]:
# First feature vector: the 13 time-averaged MFCC coefficients of clip 0.
X[0]

array([-2.7327362e+02,  9.0470833e+01, -1.1053803e+01, -3.2581985e+01,
       -8.7922955e+00,  1.9914433e+01, -6.8636770e+00, -2.0679297e+00,
        1.3728560e+00,  3.0032806e+00, -2.4311161e-02,  5.7079357e-01,
        1.7813971e+00], dtype=float32)

In [17]:
# Label paired with the first feature vector.
y[0]

'backward'

In [18]:
# Convert the accumulated Python lists into NumPy arrays for scikit-learn.
X, y = np.array(X), np.array(y)

In [19]:
# Hold out 20% of the clips for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Seed the tree so the fitted model (and every metric computed from it) is
# reproducible across kernel restarts; 42 matches the seed used for the split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [21]:
# Predict a label for every held-out feature vector.
y_pred = clf.predict(X_test)

In [22]:
# Display the predicted labels.
# NOTE(review): this dumps the whole array; consider a slice such as
# y_pred[:10] to keep the saved output short.
y_pred

array(['bird', 'bed', 'bed', 'bed', 'backward', 'backward', 'backward',
       'bed', 'bed', 'backward', 'bed', 'backward', 'backward', 'bed',
       'bed', 'bed', 'backward', 'bed', 'bed', 'backward', 'backward',
       'bed', 'bed', 'bird', 'bird', 'backward', 'bed', 'backward',
       'backward', 'backward', 'bed', 'backward', 'bed', 'bird', 'bed',
       'bed', 'backward', 'bird', 'bird', 'bed', 'bed', 'bird',
       'backward', 'backward', 'backward', 'bird', 'bird', 'backward',
       'bird', 'bed', 'backward', 'backward', 'backward', 'bed',
       'backward', 'bird', 'backward', 'bed', 'bird', 'bed', 'backward',
       'backward', 'bed', 'bird', 'backward', 'bird', 'bed', 'backward',
       'bird', 'bird', 'bed', 'bed', 'backward', 'backward', 'backward',
       'bed', 'bed', 'bird', 'backward', 'backward', 'bed', 'bed',
       'backward', 'bed', 'bird', 'backward', 'bird', 'backward',
       'backward', 'bird', 'backward', 'backward', 'bed', 'backward',
       'backward', 'bird

In [24]:
# Per-class precision, recall and F1 on the held-out split. The report is a
# plain-text table, so it is printed rather than shown as a repr.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    backward       0.69      0.68      0.69       358
         bed       0.69      0.71      0.70       382
        bird       0.61      0.59      0.60       260

    accuracy                           0.67      1000
   macro avg       0.66      0.66      0.66      1000
weighted avg       0.67      0.67      0.67      1000

