In [121]:
import pickle 

import pandas as pd
from pathlib import Path
import pandas as pd
import numpy as np

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, AutoConfig, AudioClassificationPipeline

from tqdm.auto import tqdm
from sklearn.neighbors import KNeighborsClassifier

In [3]:
"""
TalTechNLP/voxlingua107-epaca-tdnn - 0.66
speechbrain/google_speech_command_xvector - 0.90
anton-l/distilhubert-ft-common-language - 0
anton-l/sew-d-mid-400k-ft-keyword-spotting - >300 MB

pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-15960

w11wo/distil-wav2vec2-adult-child-cls-v3 - 199MB 
anantoj/distil-wav2vec2-adult-child-cls - 144MB
ntu-spml/distilhubert - 89MB
"""

'\nTalTechNLP/voxlingua107-epaca-tdnn - 0.66\nspeechbrain/google_speech_command_xvector - 0.90\nanton-l/distilhubert-ft-common-language - 0\nanton-l/sew-d-mid-400k-ft-keyword-spotting - >300 MB\n\n\nw11wo/distil-wav2vec2-adult-child-cls-v3 - 199MB \nanantoj/distil-wav2vec2-adult-child-cls - 144MB\nntu-spml/distilhubert - 89MB\n'

In [146]:
DATA_PATH = Path("../data")
WEIGHTS_PATH = Path("pretrained_models/xvector_finetuned")
EXP_NAME = WEIGHTS_PATH.name
DEVICE = "cuda"

CLASSES = [
    "down",
    "go",
    "left",
    "no",
    "off",
    "on",
    "right",
    "stop",
    "up",
    "yes",
]
label2id = dict([[v, k] for k, v in enumerate(CLASSES)])
id2label = dict([[k, v] for k, v in enumerate(CLASSES)])

cos_sim = torch.nn.CosineSimilarity(dim=1)
N_NEIGHBORS = 20

## Speechbrain

In [123]:
enc_classifier = EncoderClassifier.from_hparams(
    source=WEIGHTS_PATH,
    savedir=Path("pretrained_models") / EXP_NAME,
    run_opts={"device": DEVICE},
)
audio_normalizer = enc_classifier.audio_normalizer
label_encoder = enc_classifier.hparams.label_encoder

In [124]:
def load_audio(path):
    signal, sr = torchaudio.load(str(path), channels_first=False)
    return audio_normalizer(signal, sr)

In [138]:
classifier = KNeighborsClassifier(
    n_neighbors=N_NEIGHBORS,
    metric="cosine",
    n_jobs=-1,
)
classifier

KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=20)

In [143]:
audio_filepaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))
rel_length = torch.tensor([1.0])
lengths = []
classes = []
embeddings = []
for audiofile in tqdm(audio_filepaths):
    class_name = audiofile.parts[-2]

    wav = load_audio(audiofile).unsqueeze(0)
    embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0).squeeze(0)

    classes.append(class_name)
    embeddings.append(embedding.cpu().numpy())
    lengths.append(wav.shape[-1] / 16000)

  0%|          | 0/88790 [00:00<?, ?it/s]

In [129]:
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
pred = []
probas = []
for audiofile in tqdm(test_audio_filepaths):
    wav = load_audio(audiofile).unsqueeze(0)
    output = enc_classifier.classify_batch(wav, rel_length)
    class_name = output[-1][-1]
    out_probs = output[0]
    probas.append(out_probs)
    pred.append(class_name)

  0%|          | 0/29620 [00:00<?, ?it/s]

## AutoFeatureExtractor

In [96]:
# config = AutoConfig.from_pretrained(WEIGHTS_PATH)
# print(config)
enc_model = AutoModelForAudioClassification.from_pretrained(WEIGHTS_PATH)
# enc_model.save_pretrained(Path("pretrained_models") / EXP_NAME)
enc_model.classifier = torch.nn.Linear(256, 256)
enc_model = enc_model.to(DEVICE)
print(enc_model)


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (5): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        )
   

In [100]:
audio_filepaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))

classes = []
embeddings = []
for audiofile in tqdm(audio_filepaths):
    class_name = audiofile.parts[-2]

    wav, sr = torchaudio.load(audiofile)
    wav = wav.to(DEVICE)
    embedding = enc_model(wav).logits[0]

    classes.append(class_name)
    embeddings.append(embedding.detach().cpu().numpy())

  0%|          | 0/88790 [00:00<?, ?it/s]

## Submit

In [7]:
train_df = pd.DataFrame(
    {
        "filepath": [str(ap) for ap in audio_filepaths],
        "category": [CLASSES[name] for name in classes],
        "embedding": embeddings 
    }
)

In [8]:
train_df.head()

Unnamed: 0,filepath,category,embedding
0,../data/train/down/0.wav,3,"[15.127331, 11.903127, 3.8200674, -0.08791563,..."
1,../data/train/down/1.wav,3,"[14.426889, 15.1516075, 3.523267, 2.2925537, 9..."
2,../data/train/down/10.wav,3,"[17.777142, 2.83203, 0.29956368, 9.412637, 12...."
3,../data/train/down/100.wav,3,"[12.743387, 12.060911, 0.9379118, 7.697887, 14..."
4,../data/train/down/1000.wav,3,"[14.446474, 8.608712, 8.189811, 10.2905855, 8...."


In [9]:
train_df.shape

(88790, 3)

In [None]:
# train_df.to_csv("../data/train_xvector.csv", index=False)

In [110]:
train_embeddings = torch.tensor(embeddings, device=DEVICE)

pred = []
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
for audiofile in tqdm(test_audio_filepaths):
    # wav = load_audio(audiofile).unsqueeze(0)
    # embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0)

    wav, sr = torchaudio.load(audiofile)
    wav = wav.to(DEVICE)
    embedding = enc_model(wav).logits
    
    similarities = cos_sim(embedding, train_embeddings)
    max_similar_idx = similarities.argmax()
    class_name = classes[max_similar_idx]

    pred.append(class_name)

  train_embeddings = torch.tensor(embeddings, device=DEVICE)


  0%|          | 0/29620 [00:00<?, ?it/s]

In [9]:
pd.Series(pred).value_counts()

down    29620
dtype: int64

In [13]:
pd.Series(pred).value_counts()

down     3208
stop     2967
left     2965
on       2954
up       2950
off      2932
yes      2932
no       2924
right    2916
go       2872
dtype: int64

In [111]:
sub = pd.DataFrame(data={
    "id": [a.stem for a in test_audio_filepaths],
    "category": pred,
})
sub.to_csv(f"submission_{EXP_NAME}_cos_sim.csv", index=False)

## KNN

In [144]:
# Train KNN
classifier.fit(embeddings, [label2id[name] for name in classes])
# save knn model
with open("knn_xvector.pkl", "wb") as fout:
    pickle.dump(classifier, fout)

In [141]:
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
test_embeddings = []
for audiofile in tqdm(test_audio_filepaths):
    wav = load_audio(audiofile).unsqueeze(0)
    embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0).squeeze(0)
    test_embeddings.append(embedding.cpu().numpy())

  0%|          | 0/29620 [00:00<?, ?it/s]

In [142]:
test_embeddings[0].shape, embeddings[0].shape

((512,), (256,))

## Voting

In [21]:
import torch.nn as nn
import torch
import torchaudio.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [54]:
def load_efficientnet(weights_path=None):
    # === Efficientnet ===
    model = torch.hub.load(
        "NVIDIA/DeepLearningExamples:torchhub",
        "nvidia_efficientnet_b0",
        pretrained=True,
    )
    model.stem.conv = nn.Conv2d(
        1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False
    )
    num_ftrs = model.classifier.fc.in_features
    model.classifier.fc = nn.Linear(num_ftrs, 10, bias=True)

    if weights_path:
        model.load_state_dict(torch.load(weights_path))

    return model

MAX_AUDIO_LEN = 16000
class MelCreator:
    def __init__(self) -> None:
        self.make_melspec = T.MelSpectrogram(
            sample_rate=16000,
            n_fft=1024,
            win_length=1024,
            hop_length=128,
            f_min=55.0,
            f_max=7600,
            pad=0,
            n_mels=128,
            window_fn=torch.hann_window,
            power=2.0,
            normalized=True,
            center=False,
            pad_mode="reflect",
            onesided=True,
            norm="slaney",  #'slaney',
            mel_scale="htk",
        )
        self.atdb = T.AmplitudeToDB(stype="power", top_db=80)

    def __call__(self, audio):
        melspec = self.atdb(self.make_melspec(audio))
        return melspec


def __pad_audio(audio):
    if MAX_AUDIO_LEN - audio.shape[-1] > 0:
        i = np.random.randint(0, MAX_AUDIO_LEN - audio.shape[-1])
    else:
        i = 0
    pad_patern = (i, MAX_AUDIO_LEN - audio.shape[-1] - i)
    audio = F.pad(audio, pad_patern, "constant").detach()

    return audio

mel_creator = MelCreator()

def preprocess(audio_path):
    audio, _ = torchaudio.load(audio_path)
    audio = __pad_audio(audio)
    audio = audio / audio.abs().max()
    melspec = mel_creator(audio)
    return melspec

class TestData(Dataset):
    def __init__(self, audio_dir: Path, markup_path) -> None:
        super().__init__()
        self.audio_len = MAX_AUDIO_LEN
        self.mel_creator = MelCreator()

        self.audio_paths = list()
        self.classes = list()
        markup = pd.read_csv(markup_path)
        for file_name, category in markup.values:
            audio, _ = torchaudio.load(audio_dir / f"{file_name}.wav")
            self.audio_paths.append(audio)
            self.classes.append(category)

    def __len__(self):
        return len(self.classes)

    def __getitem__(self, idx):
        audio = self.audio_paths[idx]
        audio = self.__pad_audio(audio)
        audio = audio / audio.abs().max()
        melspec = self.mel_creator(audio)

        return melspec, CLASSES.index(self.classes[idx])

    def __pad_audio(self, audio):
        if self.audio_len - audio.shape[-1] > 0:
            i = np.random.randint(0, self.audio_len - audio.shape[-1])
        else:
            i = 0
        pad_patern = (i, self.audio_len - audio.shape[-1] - i)
        audio = F.pad(audio, pad_patern, "constant").detach()

        return audio

In [55]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

# resnet = torch.jit.load()
eff_net = load_efficientnet("pretrained_models/efficientnet_40ep.pt")
eff_net = eff_net.to(DEVICE)
xvector = enc_classifier

# weights = [2, 1]

Using cache found in /home/and/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [74]:
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
knn_probas = []
for audiofile in tqdm(test_audio_filepaths):
    # with torch.no_grad():
        # xvector 
        wav = load_audio(audiofile).unsqueeze(0)
        embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0).squeeze(0)
        knn_proba = classifier.predict_proba([embedding.cpu().numpy()])[0]
        knn_probas.append(knn_proba)
        # # efficientnet
        # melspec = preprocess(audiofile).to(DEVICE)
        # eff_logits = eff_net(melspec.unsqueeze(0))
        # eff_probas = torch.softmax(eff_logits, dim=1)[0].cpu()

        # avg_probas = np.average(
        #     np.stack([knn_probas, eff_probas]), 
        #     axis=0,
        #     weights=[1, 2]
        # )

        # res = id2label[avg_probas.argmax()]
        # pred.append(res)


  0%|          | 0/29620 [00:00<?, ?it/s]

KeyboardInterrupt: 

## final

In [145]:
probas = classifier.predict_proba(test_embeddings)
pred = [id2label[np.argmax(p)] for p in probas]

In [148]:
probas = [torch.softmax(p, dim=1)[0].tolist() for p in probas]

TypeError: softmax() received an invalid combination of arguments - got (numpy.ndarray, dim=int), but expected one of:
 * (Tensor input, int dim, torch.dtype dtype)
 * (Tensor input, name dim, *, torch.dtype dtype)


In [134]:
sub = pd.DataFrame(data={
    "path": [str(a) for a in test_audio_filepaths],
    "category": pred,
    "proba0": [p[0] for p in probas],
    "proba1": [p[1] for p in probas],
    "proba2": [p[2] for p in probas],
    "proba3": [p[3] for p in probas],
    "proba4": [p[4] for p in probas],
    "proba5": [p[5] for p in probas],
    "proba6": [p[6] for p in probas],
    "proba7": [p[7] for p in probas],
    "proba8": [p[8] for p in probas],
    "proba9": [p[9] for p in probas],
})
sub.to_csv(f"probas_test_{EXP_NAME}-eer-0.025.csv", index=False)

In [147]:
sub = pd.DataFrame(data={
    "id": [a.stem for a in test_audio_filepaths],
    "category": pred,
})
sub.to_csv(f"submission_{EXP_NAME}-eer-0.025-knn20.csv", index=False)

In [None]:

from pathlib import Path

import pickle
import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from efficientnet_train import CLASSES, DEVICE
from efficientnet_train import TrainData as EfficientnetTrainData
from efficientnet_train import TestData
from efficientnet_train import load_model as load_efficientnet_model
from resnet_train import TrainData as ResnetTrainData
from resnet_train import load_model as load_resnet_model

softmax = nn.Softmax(dim=0)


def resnet_train_infer(model):
    noise_dir = DATA_PATH / "noises"
    train_dir = DATA_PATH / "hackaton_ds/train"
    train = ResnetTrainData(train_dir, noise_dir)
    train_loader = DataLoader(train, batch_size=1, shuffle=False, num_workers=0)

    train_answers = list()
    for X, _ in tqdm(train_loader):
        # X = X.unsqueeze(0)
        preds = model.forward(X.to(DEVICE))
        logists = softmax(preds[0]).cpu().data.numpy()
        train_answers.append(logists)

    return np.asfarray(train_answers), train.classes


def efficientnet_train_infer(model):
    noise_dir = PROJECT_DIR / "hackaton_ds/noises"
    train_dir = PROJECT_DIR / "hackaton_ds/train"
    train = EfficientnetTrainData(train_dir, noise_dir)
    train_loader = DataLoader(train, batch_size=1, shuffle=False, num_workers=0)

    train_answers = list()
    for X, _ in tqdm(train_loader):
        # X = X.unsqueeze(0)
        preds = model.forward(X.to(DEVICE))
        logists = softmax(preds[0]).cpu().data.numpy()
        train_answers.append(logists)

    return np.asfarray(train_answers), train.classes


if __name__ == "__main__":
    # test_dir = PROJECT_DIR / "hackaton_ds/test"
    # test_markup = PROJECT_DIR / "hackaton_ds/submission_xvector_cos_sim.csv"
    # test = TestData(test_dir, test_markup)
    # test_loader = DataLoader(test, batch_size=1, shuffle=False, num_workers=0)
    resnet = load_resnet_model().to(DEVICE)
    resnet.load_state_dict(
        torch.load(PROJECT_DIR / "models/Resnet_16/resnet16_95ep.pt")
    )
    resnet.eval()

    efficientnet = load_efficientnet_model().to(DEVICE)
    efficientnet.load_state_dict(
        torch.load(PROJECT_DIR / "models/Efficientnet/efficientnet_70ep.pt")
    )
    efficientnet.eval()

    resnet_train_answers, classes1 = resnet_train_infer(resnet)
    classes = [CLASSES.index(class_name) for class_name in classes1]