In [5]:
import transformers
import torch
from transformers import WhisperFeatureExtractor, set_seed
import datasets
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import pandas as pd
from tqdm import tqdm
import numpy as np
import librosa
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import noisereduce as nr
tqdm.pandas()
set_seed(55)

# Read and denoise audio

In [2]:
def read_audio(path):
    try:
        audio, sample_rate = librosa.load(path, sr=16000)
        return audio
    except:
        return np.array([])

In [3]:
def denose_audio(audio):
    return nr.reduce_noise(y=audio, sr=44100)

In [4]:
manifest = pd.read_excel('manifest_balanced.xlsx')
manifest = manifest[['path', 'Variant', 'text']]
manifest['audio'] = manifest['path'].progress_apply(read_audio)
manifest['sampling_rate'] = 16000
manifest['array'] = manifest['audio'].progress_apply(denose_audio)

100%|██████████████████████████████████████████████████████████████████████████| 55657/55657 [1:15:43<00:00, 12.25it/s]
100%|████████████████████████████████████████████████████████████████████████████| 55657/55657 [49:21<00:00, 18.79it/s]


# Split on train, validation and test

In [6]:
manifest = manifest[['path', 'Variant', 'array', 'sampling_rate']]
train = pd.DataFrame()
valid = pd.DataFrame()
test = pd.DataFrame()
for variant in list(manifest['Variant'].unique()):
    train_df, valid_test_df = train_test_split(manifest[manifest['Variant']==variant], test_size=0.3, random_state=55)
    valid_df, test_df = train_test_split(valid_test_df, test_size=0.5, random_state=55)
    train = pd.concat([train, train_df])
    valid = pd.concat([valid, valid_df])
    test = pd.concat([test, test_df])
train = train[train['array'].str.len()>0]
valid = valid[valid['array'].str.len()>0]
test = test[test['array'].str.len()>0]
try:
    del manifest
    del train_df
    del valid_test_df
    del valid_df
    del test_df
except:
    pass
gc.collect()

8760

In [7]:
train = train.rename(columns={'Variant': 'label'})
valid = valid.rename(columns={'Variant': 'label'})
test = test.rename(columns={'Variant': 'label'})

In [8]:
unique_labels = ['Arkhangelskie', 'Desninskie', 'Donskie', 'Kostromskie', 'Mezhzonalnie', 'Novgorodskie', 'Povolzkie',
                 'Pskovskie', 'Ryazanskie', 'Seligerskie']
label2id = {label: index for index, label in enumerate(unique_labels)}
id2label = {index: label for index, label in enumerate(unique_labels)}
print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'Arkhangelskie': 0, 'Desninskie': 1, 'Donskie': 2, 'Kostromskie': 3, 'Mezhzonalnie': 4, 'Novgorodskie': 5, 'Povolzkie': 6, 'Pskovskie': 7, 'Ryazanskie': 8, 'Seligerskie': 9}
id2label: {0: 'Arkhangelskie', 1: 'Desninskie', 2: 'Donskie', 3: 'Kostromskie', 4: 'Mezhzonalnie', 5: 'Novgorodskie', 6: 'Povolzkie', 7: 'Pskovskie', 8: 'Ryazanskie', 9: 'Seligerskie'}


# Configure the Dataset

In [9]:
ds = DatasetDict({
    'train': Dataset.from_pandas(train[['array', 'label']]).class_encode_column("label"),
    'test': Dataset.from_pandas(valid[['array', 'label']]).class_encode_column("label"),
    'valid': Dataset.from_pandas(test[['array', 'label']]).class_encode_column("label")})

Casting to class labels:   0%|          | 0/38022 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8129 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8152 [00:00<?, ? examples/s]

In [10]:
try:
    del train
    del valid
    del test
except:
    pass
gc.collect()

72

# Extract features from audio

In [11]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

In [12]:
def prepare_dataset(batch):
    batch["input_features"] = feature_extractor(batch["array"], sampling_rate=16000, return_tensors='pt').input_features[0]
    return batch

In [13]:
encoded_audio = ds.map(prepare_dataset, remove_columns="array")

Map:   0%|          | 0/38022 [00:00<?, ? examples/s]

Map:   0%|          | 0/8129 [00:00<?, ? examples/s]

Map:   0%|          | 0/8152 [00:00<?, ? examples/s]

In [14]:
encoded_audio = encoded_audio.remove_columns(['__index_level_0__'])

In [15]:
encoded_audio['train'].features

{'label': ClassLabel(names=['Arkhangelskie', 'Desninskie', 'Donskie', 'Kostromskie', 'Mezhzonalnie', 'Novgorodskie', 'Povolzkie', 'Pskovskie', 'Ryazanskie', 'Seligerskie'], id=None),
 'input_features': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None)}

# Define function for testing

In [16]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Upload model

In [18]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained("openai/whisper-tiny", num_labels=num_labels, label2id=label2id, id2label=id2label)
device = "cpu"
model.to(device)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-tiny and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


WhisperForAudioClassification(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 384)
    (layers): ModuleList(
      (0-3): 4 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
          (out_proj): Linear(in_features=384, out_features=384, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_

# Set hyperparameters and train

In [19]:
batch_size = 8
training_args = TrainingArguments(output_dir=f"whisper-tiny-finetuned-rudialect-denoised",
                                  eval_strategy ="epoch",
                                  save_strategy="epoch",
                                  learning_rate=3e-5,
                                  per_device_train_batch_size=batch_size,
                                  gradient_accumulation_steps=4,
                                  per_device_eval_batch_size=batch_size,
                                  num_train_epochs=5,
                                  warmup_ratio=0.1,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1", 
                                  use_cpu=True, 
                                  report_to="none")

In [20]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=encoded_audio["train"].shuffle(seed=42).with_format("torch"),
                  eval_dataset=encoded_audio["valid"].with_format("torch"),
                  tokenizer=feature_extractor,
                  compute_metrics=compute_metrics)

  trainer = Trainer(model=model,


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7648,0.438485,0.862856,0.849874
2,0.2816,0.239304,0.92738,0.920151
3,0.1276,0.199755,0.9421,0.936298
4,0.0161,0.166854,0.962954,0.959999


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

TrainOutput(global_step=5940, training_loss=0.3274283163475268, metrics={'train_runtime': 44406.4199, 'train_samples_per_second': 4.281, 'train_steps_per_second': 0.134, 'total_flos': 2.11562216015616e+18, 'train_loss': 0.3274283163475268, 'epoch': 4.996423311592678})

In [22]:
output_test = trainer.predict(encoded_audio["test"].with_format("torch"))

In [23]:
output_test.metrics

{'test_loss': 0.1427411437034607,
 'test_accuracy': 0.9689998769836389,
 'test_f1': 0.9665994429635025,
 'test_runtime': 582.2955,
 'test_samples_per_second': 13.96,
 'test_steps_per_second': 1.747}

In [24]:
trainer.save_model('/whisper-tiny-finetuned-rudialect-denoised/model')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
