## Подключение всех необходимых библиотек

In [27]:
import os

import librosa
import numpy as np
import pandas as pd
import tqdm
from torch import nn
from torch.functional import F
from sklearn.model_selection import train_test_split

# Hyperparameters

In [28]:
# Number of MFCC coefficients extracted per audio file. It also fixes the
# number of `mfcc{i}` feature columns and the network input width
# (6 aggregate spectral features + N_MFCCS).
N_MFCCS = 25

# Обработка аудиофайлов и создание CSV-файла (датасета) с признаками

In [29]:
# Space-separated column names of the feature table: six aggregate spectral
# descriptors followed by one column per MFCC coefficient.
base_feature_names = [
    'chroma_stft',
    'rms',
    'spectral_centroid',
    'spectral_bandwidth',
    'rolloff',
    'zero_crossing_rate',
]
mfcc_feature_names = [f'mfcc{i}' for i in range(N_MFCCS)]
header = ' '.join(base_feature_names + mfcc_feature_names)

# Ground-truth table; later indexed by its "Filename" and "Label" columns.
labeled_dataset = pd.read_csv("train_gt (1).csv")

## 1. Заполнение CSV-файла данными

In [49]:
# Build one feature table per folder and cache it as "<folder>_dataset.csv".
#
# A cached CSV is reused only when its columns match the columns expected for
# the current N_MFCCS; otherwise the folder is processed again. (The previous
# check compared a DataFrame's `.shape` tuple with an int, which is never
# equal, so the features were recomputed on every run.)
feature_names = header.split()

# Filename -> label lookup built once; keeps the first row per filename.
label_by_filename = (
    labeled_dataset.drop_duplicates("Filename").set_index("Filename")["Label"]
)

for folder in ["train", "test"]:
    # Only the training table carries a trailing "label" column.
    columns = feature_names + ["label"] if folder == "train" else feature_names
    cache_path = f"{folder}_dataset.csv"

    if os.path.exists(cache_path):
        # nrows=0 reads just the header row, which is enough to validate it.
        cached_columns = pd.read_csv(cache_path, index_col=0, nrows=0).columns
        if list(cached_columns) == columns:
            continue

    # Collect rows in a dict and build the frame once, instead of growing a
    # DataFrame row by row. Sorting makes the row order deterministic.
    rows = {}
    for filename in tqdm.tqdm(sorted(os.listdir(folder)), desc=folder):
        songname = os.path.join(folder, filename)
        audio, sr = librosa.load(songname, mono=True)

        rms = librosa.feature.rms(y=audio)
        chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(audio)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCCS)

        # Every descriptor is reduced to its mean; the order must match
        # `header`.
        feature_row_data = [
            float(np.mean(descriptor))
            for descriptor in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
        ]
        # One mean per MFCC coefficient (one row of `mfcc` each).
        feature_row_data.extend(float(np.mean(coefficient)) for coefficient in mfcc)

        if folder == "train":
            # Raises KeyError when a training file has no ground-truth row.
            feature_row_data.append(bool(label_by_filename[filename]))
        rows[filename] = feature_row_data

    features = pd.DataFrame.from_dict(rows, orient="index", columns=columns)
    features.to_csv(cache_path)

train: 100%|██████████| 8803/8803 [07:57<00:00, 18.45it/s]
test: 100%|██████████| 2870/2870 [02:20<00:00, 20.46it/s]


## Обработка датасета и разделение на тренировочную и тестовую выборки

In [50]:
# Load the cached training features (first CSV column is the row index) and
# separate the "label" target column from the feature columns.
df = pd.read_csv("train_dataset.csv", index_col=0)
y = df["label"]
X = df.drop(columns=["label"])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [51]:
import torch.utils
import torch.utils.data


class MyDataset(torch.utils.data.Dataset):
    """In-memory dataset of (feature vector, label) pairs.

    Both inputs are read through their ``.values`` attribute, converted to
    float32 tensors and moved to ``device`` once, at construction time.
    """

    def __init__(self, features, labels, device=torch.device("cpu")):
        feature_tensor = torch.tensor(features.values, dtype=torch.float32)
        label_tensor = torch.tensor(labels.values, dtype=torch.float32)
        self.features = feature_tensor.to(device)
        self.labels = label_tensor.to(device)

    def __len__(self):
        # The number of samples is the number of labels.
        return len(self.labels)

    def __getitem__(self, ind):
        sample = self.features[ind]
        target = self.labels[ind]
        return sample, target

In [52]:
import torch.utils
import torch.utils.data

# Pick the best available accelerator rather than assuming Apple MPS, so the
# notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Both datasets are moved to `device` in full when they are constructed.
train_dataset = MyDataset(x_train, y_train, device)
test_dataset = MyDataset(x_test, y_test, device)

# Training batches are reshuffled each epoch; evaluation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False)

## Архитектура нашей нейронной сети и ее обучение

In [53]:
class RecognizeNet(nn.Module):
    """Fully connected classifier over the per-file audio features.

    Input width is ``6 + N_MFCCS``; four ReLU-activated hidden layers
    (100, 200, 100, 50 units) feed a final linear layer with 2 outputs.
    ``forward`` returns the raw outputs of that last layer; no softmax is
    applied inside the module.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(6 + N_MFCCS, 100)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(100, 200)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(200, 100)
        self.act3 = nn.ReLU()
        self.layer4 = nn.Linear(100, 50)
        self.act4 = nn.ReLU()
        self.layer5 = nn.Linear(50, 2)

    def forward(self, x):
        # Hidden stages, each a linear map followed by its activation.
        hidden_stages = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
            (self.layer4, self.act4),
        )
        for linear, activation in hidden_stages:
            x = activation(linear(x))
        # The output layer has no activation.
        return self.layer5(x)

In [54]:
# Instantiate the network on the selected device. `nn.Module.to` returns the
# module itself, so the construction and the move can be chained.
model = RecognizeNet().to(device)

# Running count of epochs trained so far; the training cell increments it.
total_epochs = 0

In [55]:
# Train the classifier with Adam and cross-entropy loss.
#
# `loss_list` and `acc_list` collect one value per optimisation step.
# `total_epochs` keeps counting across repeated runs of this cell, and the
# progress label shows it against the end of the current block of `epochs`.
total_step = len(train_loader)
epochs = 1000
lr = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr)
loss_fn = nn.CrossEntropyLoss()

loss_list = []
acc_list = []

# Make sure the model is in training mode even if an evaluation cell
# (which calls `model.eval()`) was executed before this one.
model.train()

pbar = tqdm.tqdm(range(epochs))
for epoch in pbar:
    for batch in train_loader:
        x, y = batch
        # CrossEntropyLoss expects integer class indices for a 1-D target;
        # the dataset stores the labels as float32.
        targets = y.long()

        optimizer.zero_grad()
        preds = model(x)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()

        # Softmax does not change which output is largest, so the predicted
        # class is taken directly from the model outputs.
        predicted = preds.detach().argmax(dim=1)
        total = targets.size(0)
        correct = (predicted == targets).sum().item()

        loss_list.append(loss.item())
        acc_list.append(correct / total)

    total_epochs += 1
    if (epoch + 1) % 20 == 0:
        # Round the target up to the next multiple of `epochs`; the previous
        # formula reported "1000/2000" on the final epoch of the first run.
        target_epochs = ((total_epochs - 1) // epochs + 1) * epochs
        # Loss and accuracy shown here are those of the epoch's last batch.
        pbar.set_description('Epoch [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                .format(total_epochs, target_epochs, loss.item(),
                        (correct / total) * 100))

  2%|▏         | 20/1000 [00:06<04:50,  3.37it/s]

Epoch [20/1000], Loss: 0.6585, Accuracy: 62.69%


  4%|▍         | 40/1000 [00:11<04:10,  3.84it/s]

Epoch [40/1000], Loss: 0.6674, Accuracy: 59.58%


  6%|▌         | 60/1000 [00:17<04:04,  3.84it/s]

Epoch [60/1000], Loss: 0.6491, Accuracy: 63.92%


  8%|▊         | 80/1000 [00:22<04:17,  3.58it/s]

Epoch [80/1000], Loss: 0.6524, Accuracy: 63.36%


 10%|█         | 100/1000 [00:27<03:55,  3.83it/s]

Epoch [100/1000], Loss: 0.6444, Accuracy: 65.59%


 12%|█▏        | 120/1000 [00:33<04:04,  3.60it/s]

Epoch [120/1000], Loss: 0.6271, Accuracy: 64.03%


 14%|█▍        | 140/1000 [00:38<03:59,  3.59it/s]

Epoch [140/1000], Loss: 0.6327, Accuracy: 65.26%


 16%|█▌        | 160/1000 [00:44<03:47,  3.69it/s]

Epoch [160/1000], Loss: 0.6398, Accuracy: 63.81%


 18%|█▊        | 180/1000 [00:49<03:46,  3.61it/s]

Epoch [180/1000], Loss: 0.6288, Accuracy: 65.70%


 20%|██        | 200/1000 [00:55<03:31,  3.78it/s]

Epoch [200/1000], Loss: 0.6484, Accuracy: 62.69%


 22%|██▏       | 220/1000 [01:00<03:31,  3.68it/s]

Epoch [220/1000], Loss: 0.6569, Accuracy: 61.69%


 24%|██▍       | 240/1000 [01:06<03:32,  3.57it/s]

Epoch [240/1000], Loss: 0.6388, Accuracy: 64.70%


 26%|██▌       | 260/1000 [01:11<03:16,  3.77it/s]

Epoch [260/1000], Loss: 0.6396, Accuracy: 65.26%


 28%|██▊       | 280/1000 [01:17<03:32,  3.40it/s]

Epoch [280/1000], Loss: 0.6299, Accuracy: 66.59%


 30%|███       | 300/1000 [01:22<03:08,  3.71it/s]

Epoch [300/1000], Loss: 0.6338, Accuracy: 66.04%


 32%|███▏      | 320/1000 [01:28<03:01,  3.75it/s]

Epoch [320/1000], Loss: 0.6419, Accuracy: 64.37%


 34%|███▍      | 340/1000 [01:33<02:59,  3.68it/s]

Epoch [340/1000], Loss: 0.6395, Accuracy: 63.47%


 36%|███▌      | 360/1000 [01:39<02:53,  3.69it/s]

Epoch [360/1000], Loss: 0.6269, Accuracy: 65.26%


 38%|███▊      | 380/1000 [01:44<02:49,  3.66it/s]

Epoch [380/1000], Loss: 0.6297, Accuracy: 65.59%


 40%|████      | 400/1000 [01:49<02:39,  3.76it/s]

Epoch [400/1000], Loss: 0.6438, Accuracy: 61.69%


 42%|████▏     | 420/1000 [01:55<02:57,  3.26it/s]

Epoch [420/1000], Loss: 0.6337, Accuracy: 64.48%


 44%|████▍     | 440/1000 [02:01<02:40,  3.49it/s]

Epoch [440/1000], Loss: 0.6157, Accuracy: 66.15%


 46%|████▌     | 460/1000 [02:07<02:30,  3.59it/s]

Epoch [460/1000], Loss: 0.6217, Accuracy: 65.81%


 48%|████▊     | 480/1000 [02:12<02:26,  3.56it/s]

Epoch [480/1000], Loss: 0.6045, Accuracy: 66.37%


 50%|█████     | 500/1000 [02:18<02:19,  3.57it/s]

Epoch [500/1000], Loss: 0.6231, Accuracy: 66.15%


 52%|█████▏    | 520/1000 [02:24<02:14,  3.58it/s]

Epoch [520/1000], Loss: 0.6145, Accuracy: 67.37%


 54%|█████▍    | 540/1000 [02:29<02:11,  3.49it/s]

Epoch [540/1000], Loss: 0.6106, Accuracy: 65.92%


 56%|█████▌    | 560/1000 [02:35<02:04,  3.53it/s]

Epoch [560/1000], Loss: 0.6170, Accuracy: 66.15%


 58%|█████▊    | 580/1000 [02:41<01:57,  3.58it/s]

Epoch [580/1000], Loss: 0.5890, Accuracy: 68.49%


 60%|██████    | 600/1000 [02:46<02:00,  3.32it/s]

Epoch [600/1000], Loss: 0.6060, Accuracy: 67.59%


 62%|██████▏   | 620/1000 [02:52<01:46,  3.57it/s]

Epoch [620/1000], Loss: 0.5969, Accuracy: 68.60%


 64%|██████▍   | 640/1000 [02:58<01:47,  3.36it/s]

Epoch [640/1000], Loss: 0.6139, Accuracy: 66.59%


 66%|██████▌   | 660/1000 [03:03<01:36,  3.52it/s]

Epoch [660/1000], Loss: 0.6582, Accuracy: 62.03%


 68%|██████▊   | 680/1000 [03:09<01:29,  3.58it/s]

Epoch [680/1000], Loss: 0.5957, Accuracy: 67.15%


 70%|███████   | 700/1000 [03:15<01:23,  3.59it/s]

Epoch [700/1000], Loss: 0.6216, Accuracy: 66.04%


 72%|███████▏  | 720/1000 [03:20<01:19,  3.52it/s]

Epoch [720/1000], Loss: 0.5985, Accuracy: 67.59%


 74%|███████▍  | 740/1000 [03:26<01:12,  3.61it/s]

Epoch [740/1000], Loss: 0.6057, Accuracy: 68.82%


 76%|███████▌  | 760/1000 [03:32<01:10,  3.40it/s]

Epoch [760/1000], Loss: 0.5935, Accuracy: 67.59%


 78%|███████▊  | 780/1000 [03:37<01:02,  3.52it/s]

Epoch [780/1000], Loss: 0.5704, Accuracy: 70.82%


 80%|████████  | 800/1000 [03:43<00:57,  3.49it/s]

Epoch [800/1000], Loss: 0.5903, Accuracy: 68.15%


 82%|████████▏ | 820/1000 [03:49<00:54,  3.30it/s]

Epoch [820/1000], Loss: 0.6136, Accuracy: 65.14%


 84%|████████▍ | 840/1000 [03:55<00:46,  3.41it/s]

Epoch [840/1000], Loss: 0.6118, Accuracy: 65.70%


 86%|████████▌ | 860/1000 [04:01<00:40,  3.42it/s]

Epoch [860/1000], Loss: 0.5952, Accuracy: 66.48%


 88%|████████▊ | 880/1000 [04:07<00:34,  3.45it/s]

Epoch [880/1000], Loss: 0.5863, Accuracy: 67.93%


 90%|█████████ | 900/1000 [04:12<00:28,  3.49it/s]

Epoch [900/1000], Loss: 0.5620, Accuracy: 71.38%


 92%|█████████▏| 920/1000 [04:18<00:25,  3.18it/s]

Epoch [920/1000], Loss: 0.5820, Accuracy: 68.37%


 94%|█████████▍| 940/1000 [04:24<00:17,  3.52it/s]

Epoch [940/1000], Loss: 0.5788, Accuracy: 68.71%


 96%|█████████▌| 960/1000 [04:30<00:11,  3.41it/s]

Epoch [960/1000], Loss: 0.5386, Accuracy: 72.38%


 98%|█████████▊| 980/1000 [04:36<00:05,  3.37it/s]

Epoch [980/1000], Loss: 0.5753, Accuracy: 67.26%


100%|██████████| 1000/1000 [04:42<00:00,  3.54it/s]

Epoch [1000/2000], Loss: 0.5727, Accuracy: 69.04%





## Тестирование нашей нейронки

In [58]:
# Measure accuracy on the held-out split, with the model in evaluation mode
# and gradient tracking disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x, y in test_loader:
        y_pred = model(x)
        # Index of the largest output per row is the predicted class.
        predicted = y_pred.argmax(dim=1)
        matches = (predicted == y).sum().item()
        correct += matches
        total += y.size(0)

accuracy_percent = (correct / total) * 100
print('Test Accuracy of the model on the test data: {} %'.format(accuracy_percent))

Test Accuracy of the model on the test data: 63.25951164111301 %


# Submission

In [59]:
# Predict a label for every file in the cached test features and write the
# predictions in the row order of the sample submission file.
data_scoring = pd.read_csv("test_dataset.csv", index_col=0)
sample_submission = pd.read_csv("test (1).csv")

model.eval()
with torch.no_grad():
    inputs = torch.tensor(data_scoring.values, dtype=torch.float32).to(device)
    # Index of the largest output per row is the predicted class.
    results = model(inputs).argmax(dim=1).cpu().numpy()

submission = pd.DataFrame({"Filename": data_scoring.index, "Label": results})

# Join predictions to the sample rows by filename. Assigning one DataFrame
# column to another aligns on index labels, not on row position, so sorting
# both frames beforehand did not pair predictions with the right files.
label_by_filename = submission.set_index("Filename")["Label"]
sample_submission["Label"] = sample_submission["Filename"].map(label_by_filename)

# Fail loudly instead of writing a submission with empty labels.
missing = sample_submission["Label"].isna()
if missing.any():
    raise ValueError(
        f"No prediction for {int(missing.sum())} file(s) listed in 'test (1).csv', "
        f"e.g. {sample_submission.loc[missing, 'Filename'].iloc[0]!r}"
    )

sample_submission["Label"] = sample_submission["Label"].astype(int)
sample_submission.to_csv("test.csv", index=False, header=False)