## Подключение всех необходимых библиотек

In [1]:
from IPython.display import Audio, display
import os

import librosa
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn
from sklearn.model_selection import train_test_split
import tqdm
from torch import nn
from torch.functional import F
import torch.utils
import torch.utils.data
import torchaudio
import warnings

# Feature extraction

## Extracting and saving features

In [9]:
for file in os.listdir("normal_audio"):
    y, sr = librosa.load(os.path.join("normal_audio", file))
    if len(y) == 0:
        print(file)

In [19]:
from extract_R import extract_R
#extract_R()
for folder in ["normal_audio", "burr_audio"]:
    for filename in tqdm.tqdm(os.listdir(folder), desc=folder): 
        audio, sr = librosa.load(os.path.join(folder, filename), sr=16000)
        mel = librosa.feature.melspectrogram(y=audio,sr=sr)
        mel_df = pd.DataFrame(mel)
        mel_df.to_csv(os.path.join(folder + "_mel", filename + "_mel.csv"))


normal_audio:   0%|          | 0/15166 [00:00<?, ?it/s]

normal_audio: 100%|██████████| 15166/15166 [00:43<00:00, 347.27it/s]
burr_audio: 100%|██████████| 7818/7818 [00:25<00:00, 307.25it/s]


In [103]:
for filename in tqdm.tqdm(os.listdir("test_audio_raw")):
    audio, sr = librosa.load(os.path.join("test_audio_raw", filename), sr=22050)
    audio = librosa.resample(audio, orig_sr=22050, target_sr=16000)
    if audio.size % 4410 != 0:
        audio = audio[:-int(audio.size % 4410)]
    if not os.path.exists(os.path.join("test_audio_mel", filename)):
        os.mkdir(os.path.join("test_audio_mel", filename))
    splits = np.split(audio, audio.size // 4410)
    for i, split in enumerate(splits):
        mel = librosa.feature.melspectrogram(y=split,sr=16000)
        df = pd.DataFrame(mel)
        df.to_csv(os.path.join("test_audio_mel", filename, f"mel_{i}.csv"))

  0%|          | 0/2870 [00:00<?, ?it/s]

100%|██████████| 2870/2870 [03:29<00:00, 13.68it/s]


## Loading extracted features

In [154]:
warnings.filterwarnings("ignore")
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
X = []
y = []
for folder in ["normal_audio_mel", "burr_audio_mel"]:
    for mel in tqdm.tqdm(os.listdir(folder), desc=folder):    
        df = pd.read_csv(os.path.join(folder, mel), index_col=0)
        X.append(df.values)
        y.append(folder == "burr_audio_mel")
X = torch.tensor(X).unsqueeze(1)
y = torch.tensor(y).unsqueeze(1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

normal_audio_mel: 100%|██████████| 15166/15166 [00:10<00:00, 1459.63it/s]
burr_audio_mel: 100%|██████████| 7818/7818 [00:05<00:00, 1435.37it/s]


In [157]:
class MyDataset(torch.utils.data.Dataset):

    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, ind):
        return self.features[ind], self.labels[ind]

In [158]:
import torch.utils
import torch.utils.data


train_dataset = MyDataset(x_train, y_train)
test_dataset = MyDataset(x_test, y_test)
batch_size = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=False)

# Preparing our model

## Architecture

In [211]:
def find_conv2d_out_shape(input_shape,conv: nn.Conv2d,pool=2):
    # get conv arguments
    win, hin = input_shape
    kernel_size=conv.kernel_size
    stride=conv.stride
    padding=conv.padding
    dilation=conv.dilation
    if not isinstance(padding, tuple):
        padding = (padding,padding)
    if not isinstance(stride, tuple):
        stride = (stride,stride)
    if not isinstance(dilation, tuple):
        dilation = (dilation,dilation)
    if not isinstance(kernel_size, tuple):
        kernel_size = (kernel_size,kernel_size)
    hout=np.floor((hin 
                   + 2*padding[0] 
                   - dilation[0] * (kernel_size[0]-1) - 1)
                   / stride[0] + 1)
    wout=np.floor((win+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1]+1)

    if pool:
        hout/=pool[0]
        wout/=pool[1]
    return int(hout),int(wout)


class MelConvNet(nn.Module):
    def __init__(self):
        super(MelConvNet, self).__init__()
        input_shape = (129, 9)
        out_ch = 3
        
        self.conv1 = nn.Conv2d(1, out_ch, kernel_size=(3,3), stride=(1,1), padding=(0,0))
        self.pool1 = nn.MaxPool2d(kernel_size=(2,2))
        width, height = find_conv2d_out_shape(input_shape, self.conv1, pool=self.pool1.kernel_size)
         
        # self.conv2 = nn.Conv2d(16, out_ch, (3,3), (1,1), (0,0))
        # self.pool2 = nn.MaxPool2d(kernel_size=(2,2))
        # width, height = find_conv2d_out_shape(width, height, self.conv2)
         
        # self.conv3 = nn.Conv2d(16, out_ch, (3,3), (1,1), (0,0))
        # self.pool3 = nn.MaxPool2d(kernel_size=(2,2))
        # width, height = find_conv2d_out_shape(width, height, self.conv3)

        self.flatten = torch.nn.Flatten()
        self.fc = nn.Linear(width * height * out_ch, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
         
        x = self.conv1(x)
        x = self.pool1(x)
         
        # x = self.conv2(x)
        # x = self.pool2(x)
         
        # x = self.conv3(x)
        # x = self.pool3(x)
         
        x = self.flatten(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x


## Fitting the model

In [212]:
model = MelConvNet()
model.to(device)
total_epochs = 0

In [213]:
epochs = 1000
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr)
loss_fn = nn.CrossEntropyLoss()
loaders = {"train": train_loader, "val": test_loader}
accuracy = {"train": [], "val": []}
val_progress = 0 # track accuracy improvement or degradation
VAL_DECREASE_BREAK = -epochs # when decreased x times, model overfitted, stop training
model.train()
pbar = tqdm.tqdm(range(epochs))
for epoch in pbar:
    for key, loader in loaders.items():
        epoch_correct = 0
        epoch_all = 0
        for x_batch, y_batch in loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            if key == "train":
                model.train()
                optimizer.zero_grad()
                outp = model(x_batch)
                loss = loss_fn(outp, y_batch.squeeze(1))
                loss.backward()
                optimizer.step()
            else:
                model.eval()
                with torch.no_grad():
                    outp = model(x_batch)
            total = y_batch.size(0)
            preds = outp > 0.5
            correct = (preds == y_batch).sum().item()
            all_ = len(outp)
            epoch_correct += correct
            epoch_all += all_
        pbar.set_description(f"Loader: {key}. Accuracy: {epoch_correct/epoch_all:.2%}")
        accuracy[key].append(epoch_correct/epoch_all)
        if key == "val" and len(accuracy[key]) >= 2:
            if val_progress > 0:
                val_progress = 0
            if accuracy[key][-1] > accuracy[key][-2]:
                val_progress += 1
            else:
                val_progress -= 1
            if val_progress <= VAL_DECREASE_BREAK:
                print(f"OVERFITTED AT EPOCH {epoch}")
                break
    else: # if not broken with val decrease excision
        total_epochs += 1
        continue
    break
[plt.plot(accuracies, label=name) for name, accuracies in accuracy.items()]
plt.legend()

Loader: val. Accuracy: 61.69%:   9%|▉         | 93/1000 [03:12<31:20,  2.07s/it]  


KeyboardInterrupt: 

# Submission

In [None]:
data_scoring = []
paths = []
for mels_path in tqdm.tqdm(os.listdir("test_audio_mel")): 
    paths.append(mels_path) 
    mels_pack = []
    for mel_path in os.listdir(os.path.join("test_audio_mel", mels_path)): 
        df = pd.read_csv(os.path.join("test_audio_mel", mels_path, mel_path), header=None, index_col=0)
        mels_pack.append(df.values)
    data_scoring.append(mels_pack)

100%|██████████| 2870/2870 [00:31<00:00, 90.25it/s] 


In [240]:
sample_submission = pd.read_csv("test (1).csv")
print("predicting..")
CONFIDENCE = 0.7
model.eval()
predictions = []
with torch.no_grad():
    for mel_pack in tqdm.tqdm(data_scoring):
        mel_pack = torch.tensor(mel_pack, dtype=torch.float32).to(device).unsqueeze(1)
        results = model(mel_pack)
        confident = ~((results < CONFIDENCE) & (results > 1 - CONFIDENCE))
        results = results[confident]
        predicted = (results > 0.5)
        label = predicted.sum() > predicted.size()[0] / 2
        predictions.append(label.item())


submission = pd.DataFrame({"Filename": paths, "Label": predictions})
submission = submission.sort_values(by="Filename")
sample_submission = sample_submission.sort_values(by="Filename")
sample_submission["Label"] = submission["Label"]
sample_submission = sample_submission.sort_index()
sample_submission.to_csv("submission.csv", index=False, header=False) 

predicting..


100%|██████████| 2870/2870 [00:23<00:00, 122.83it/s]
