In [27]:
import transformers
import torch
from transformers import WhisperFeatureExtractor, set_seed, AutoModelForAudioClassification, TrainingArguments, Trainer, WhisperConfig, EarlyStoppingCallback
import datasets
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import pandas as pd
from tqdm import tqdm
import numpy as np
import librosa
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
import seaborn as sns
tqdm.pandas()
set_seed(55)
np.random.seed(55)

# Read audio

In [3]:
def read_audio(path):
    try:
        audio, sample_rate = librosa.load(path, sr=16000)
        return audio
    except:
        return np.array([])

In [4]:
manifest = pd.read_excel('manifest_balanced.xlsx')
variants = ['Arkhangelskie', 'Novgorodskie', 'Povolzkie', 'Pskovskie', 'Ryazanskie']
manifest = manifest[manifest['Variant'].isin(variants)]
manifest = manifest[['path', 'Variant', 'text', 'informant']]
manifest['array'] = manifest['path'].progress_apply(read_audio)
manifest['sampling_rate'] = 16000

100%|████████████████████████████████████████████████████████████████████████████| 25806/25806 [23:55<00:00, 17.98it/s]


# Split on train, validation and test (speakers fixed)

In [5]:
manifest = manifest[['path', 'Variant', 'array', 'sampling_rate', 'informant']]
train = pd.DataFrame()
valid = pd.DataFrame()
test = pd.DataFrame()
variants = ['Arkhangelskie', 'Novgorodskie', 'Povolzkie', 'Pskovskie', 'Ryazanskie']
for variant in variants:
    speaker_counts = manifest[manifest['Variant']==variant]['informant'].value_counts().to_dict()
    total_records = len(manifest[manifest['Variant']==variant]['path'])
    train_target = 0.7 * total_records
    val_target = 0.165 * total_records
    test_target = total_records - train_target - val_target
    speaker_ids = list(speaker_counts.keys())
    train_speakers, val_speakers, test_speakers = [], [], []
    train_sum = val_sum = test_sum = 0
    for speaker in speaker_ids:
        count = speaker_counts[speaker]
        if train_sum + count <= train_target:
            train_speakers.append(speaker)
            train_sum += count
        elif val_sum + count <= val_target:
            val_speakers.append(speaker)
            val_sum += count
        else:
            test_speakers.append(speaker)
            test_sum += count
    train_df = manifest[(manifest['Variant']==variant)&(manifest['informant'].isin(train_speakers))]
    valid_df = manifest[(manifest['Variant']==variant)&(manifest['informant'].isin(val_speakers))]
    test_df = manifest[(manifest['Variant']==variant)&(manifest['informant'].isin(test_speakers))]
    train = pd.concat([train, train_df])
    valid = pd.concat([valid, valid_df])
    test = pd.concat([test, test_df])
train = train[train['array'].str.len()>0]
valid = valid[valid['array'].str.len()>0]
test = test[test['array'].str.len()>0]
try:
    del manifest
    del train_df
    del valid_df
    del test_df
except:
    pass
gc.collect()

12705

In [6]:
train = train.rename(columns={'Variant': 'label'})
valid = valid.rename(columns={'Variant': 'label'})
test = test.rename(columns={'Variant': 'label'})

In [7]:
unique_labels = ['Arkhangelskie', 'Novgorodskie', 'Povolzkie', 'Pskovskie', 'Ryazanskie']
label2id = {label: index for index, label in enumerate(unique_labels)}
id2label = {index: label for index, label in enumerate(unique_labels)}
print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'Arkhangelskie': 0, 'Novgorodskie': 1, 'Povolzkie': 2, 'Pskovskie': 3, 'Ryazanskie': 4}
id2label: {0: 'Arkhangelskie', 1: 'Novgorodskie', 2: 'Povolzkie', 3: 'Pskovskie', 4: 'Ryazanskie'}


# Configure the Dataset

In [8]:
ds = DatasetDict({
    'train': Dataset.from_pandas(train[['array', 'label']]).class_encode_column("label"),
    'test': Dataset.from_pandas(valid[['array', 'label']]).class_encode_column("label"),
    'valid': Dataset.from_pandas(test[['array', 'label']]).class_encode_column("label")})

Casting to class labels:   0%|          | 0/17347 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4194 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3990 [00:00<?, ? examples/s]

In [9]:
ds['train'].features["label"].str2int('Arkhangelskie')

0

In [10]:
try:
    del train
    del valid
    del test
except:
    pass
gc.collect()

144

# Extract features from audio

In [11]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

In [12]:
def prepare_dataset(batch):
    batch["input_features"] = feature_extractor(batch["array"], sampling_rate=16000, return_tensors='pt').input_features[0]
    return batch

In [13]:
encoded_audio = ds.map(prepare_dataset, remove_columns="array")

Map:   0%|          | 0/17347 [00:00<?, ? examples/s]

Map:   0%|          | 0/4194 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

In [14]:
encoded_audio = encoded_audio.remove_columns(['__index_level_0__'])

In [15]:
encoded_audio['train'].features

{'label': ClassLabel(names=['Arkhangelskie', 'Novgorodskie', 'Povolzkie', 'Pskovskie', 'Ryazanskie'], id=None),
 'input_features': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None)}

# Define function for testing

In [16]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Upload model

In [68]:
num_labels = len(id2label)
config = WhisperConfig.from_pretrained("openai/whisper-tiny", num_labels=num_labels, label2id=label2id, id2label=id2label)
config.classifier_dropout = 0.3
model = AutoModelForAudioClassification.from_pretrained("openai/whisper-tiny", config=config)
device = "cpu"
model.to(device)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-tiny and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


WhisperForAudioClassification(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 384)
    (layers): ModuleList(
      (0-3): 4 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
          (out_proj): Linear(in_features=384, out_features=384, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_

# Train classifier only 

In [69]:
def freeze_encoder(model):
    for param in model.encoder.parameters():
        param.requires_grad = False

In [70]:
freeze_encoder(model)

In [71]:
batch_size = 8
training_args_classifier = TrainingArguments(output_dir=f"whisper-tiny-finetuned-rudialect-fixed-speakers-classifier-only",
                                  eval_strategy ="steps",
                                  save_strategy="steps",
                                  learning_rate=1e-4,
                                  eval_steps=200,
                                  save_steps=200,
                                  per_device_train_batch_size=batch_size,
                                  gradient_accumulation_steps=4,
                                  per_device_eval_batch_size=batch_size,
                                  num_train_epochs=5,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  greater_is_better=True,
                                  use_cpu=True, 
                                  report_to="none")

In [72]:
trainer = Trainer(model=model,
                  args=training_args_classifier,
                  train_dataset=encoded_audio["train"].with_format("torch").shuffle(seed=42),
                  eval_dataset=encoded_audio["valid"].with_format("torch"),
                  tokenizer=feature_extractor,
                  compute_metrics=compute_metrics, 
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

  trainer = Trainer(model=model,


In [73]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
200,No log,1.516844,0.326566,0.17201
400,No log,1.505464,0.332331,0.23642
600,1.493700,1.505873,0.316541,0.255721
800,1.493700,1.4822,0.348622,0.291816
1000,1.377700,1.480709,0.35614,0.30008
1200,1.377700,1.44623,0.378446,0.321424
1400,1.377700,1.455946,0.384211,0.342372
1600,1.299300,1.443962,0.395489,0.357523
1800,1.299300,1.425205,0.403509,0.355646
2000,1.248200,1.404091,0.412531,0.354456


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

TrainOutput(global_step=2000, training_loss=1.3547186889648437, metrics={'train_runtime': 7311.4684, 'train_samples_per_second': 11.863, 'train_steps_per_second': 0.371, 'total_flos': 7.1163353099088e+17, 'train_loss': 1.3547186889648437, 'epoch': 3.6841862609497467})

In [74]:
output_test = trainer.predict(encoded_audio["test"].with_format("torch"))

In [75]:
output_test.metrics

{'test_loss': 1.4064407348632812,
 'test_accuracy': 0.42727706247019553,
 'test_f1': 0.3649119947207621,
 'test_runtime': 295.2187,
 'test_samples_per_second': 14.206,
 'test_steps_per_second': 1.778}

# Train encoder

In [76]:
def unfreeze_last_n_layers(model, n):
    total_layers = len(model.encoder.layers)
    for i in range(total_layers - n, total_layers):
        for param in model.encoder.layers[i].parameters():
            param.requires_grad = True

In [77]:
unfreeze_last_n_layers(model, 2)

In [78]:
training_args_encoder = TrainingArguments(output_dir=f"whisper-tiny-finetuned-rudialect-fixed-speakers-with-encoder",
                                          eval_strategy="steps",
                                          save_strategy="steps",
                                          eval_steps=300,
                                          save_steps=300,
                                          learning_rate=1e-5,
                                          per_device_train_batch_size=batch_size,
                                          gradient_accumulation_steps=4,
                                          per_device_eval_batch_size=batch_size,
                                          num_train_epochs=5,
                                          load_best_model_at_end=True,
                                          metric_for_best_model="f1",
                                          greater_is_better=True,
                                          use_cpu=True, 
                                          report_to="none")

In [79]:
trainer_encoder = Trainer(model=model,
                          args=training_args_encoder,
                          train_dataset=encoded_audio["train"].with_format("torch").shuffle(seed=42),
                          eval_dataset=encoded_audio["valid"].with_format("torch"),
                          tokenizer=feature_extractor,
                          compute_metrics=compute_metrics, 
                          callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

  trainer_encoder = Trainer(model=model,


In [80]:
trainer_encoder.train()

Step,Training Loss,Validation Loss,Accuracy,F1
300,No log,1.13478,0.574185,0.521195
600,0.929700,1.00283,0.634586,0.578388
900,0.929700,0.981909,0.649875,0.600095
1200,0.565000,0.938433,0.653383,0.595468
1500,0.415500,0.955443,0.667419,0.622709
1800,0.415500,0.907219,0.683459,0.632552
2100,0.357800,0.903619,0.69599,0.645265
2400,0.357800,0.907136,0.692732,0.64682
2700,0.314600,0.913585,0.685213,0.635663


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

TrainOutput(global_step=2710, training_loss=0.5001308933835188, metrics={'train_runtime': 14373.8735, 'train_samples_per_second': 6.034, 'train_steps_per_second': 0.189, 'total_flos': 9.6428442285504e+17, 'train_loss': 0.5001308933835188, 'epoch': 4.992162286768096})

In [81]:
output_test = trainer_encoder.predict(encoded_audio["test"].with_format("torch"))

In [82]:
output_test.metrics

{'test_loss': 1.3440494537353516,
 'test_accuracy': 0.5815450643776824,
 'test_f1': 0.5466198287395072,
 'test_runtime': 294.8818,
 'test_samples_per_second': 14.223,
 'test_steps_per_second': 1.78}

In [90]:
trainer.save_model('/whisper-tiny-finetuned-rudialect-fixed-speakers-with-encoder/model')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
