In [1]:
from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification, Trainer, TrainingArguments
import datasets
import torch

In [2]:
# Checkpoint shared by the config, the feature extractor and the model weights.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"

# The feature extractor does not depend on the model config, so load it first.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    pretrained_model,
    max_length=256,
)

# Start from the checkpoint's config, then override the input length and the
# number of output classes for this task.
config = ASTConfig.from_pretrained(pretrained_model)
config.max_length = 256
config.num_labels = 5

# ignore_mismatched_sizes=True lets loading continue although some tensor
# shapes differ from the checkpoint; per the loader's message those tensors
# (position embeddings and classifier head) are newly initialised.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- audio_spectrogram_transformer.embeddings.position_embeddings: found shape torch.Size([1, 1214, 768]) in the checkpoint and torch.Size([1, 302, 768]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Sanity check: the config and the instantiated model should report the same
# number of classes.
config.num_labels, model.num_labels

(5, 5)

In [33]:
# Schema of the CSV manifest: a file path string and an integer label per row.
features = datasets.Features(
    {
        "path": datasets.Value("string"),
        "label": datasets.Value("int64"),
    }
)

# Stream the CSV rows, then reinterpret the "path" column as audio using the
# feature extractor's sampling rate.
stream_dataset = datasets.load_dataset(
    "csv",
    data_files="test10.csv",
    split="train",
    streaming=True,
    features=features,
).cast_column(
    "path",
    datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
)

Using custom data configuration default-2e6732257563a6cc


In [34]:
# Pull one example from the stream just to read its column names; they are
# passed as `remove_columns` when the dataset is mapped later in the notebook.
columns = next(iter(stream_dataset)).keys()

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


In [35]:
# Fetch the first streamed example and keep its decoded waveform for the
# feature-extraction experiments below.
sample = next(iter(stream_dataset))
audio_array = sample["path"]["array"]
# Display the waveform's shape.
audio_array.shape

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


(3200,)

In [4]:
# Sampling rate configured on the feature extractor; the same value is used
# when casting the audio column and when extracting features.
feature_extractor.sampling_rate

16000

In [37]:
# Re-display the raw waveform shape right before feature extraction.
audio_array.shape

(3200,)

In [38]:
# Convert the raw waveform into model inputs, returned as PyTorch tensors.
inputs = feature_extractor(
    audio_array,
    sampling_rate=feature_extractor.sampling_rate,
    return_tensors="pt",
)
# Show which fields were produced and the shape of the extracted features.
print(inputs.keys(), inputs["input_values"].shape, sep="\n")

dict_keys(['input_values'])
torch.Size([1, 256, 128])


In [39]:
# get_shape(config) returns a pair, unpacked here as the frequency and time
# output dimensions of the embedding layer.
# NOTE(review): presumably frequency * time patches plus 2 special tokens give
# the position-embedding length shown in the next cell — confirm against the
# AST model documentation.
frequency_out_dimension, time_out_dimension = model.audio_spectrogram_transformer.embeddings.get_shape(config)
frequency_out_dimension, time_out_dimension

(12, 25)

In [40]:
# Shape of the position-embedding tensor of the instantiated model.
model.audio_spectrogram_transformer.embeddings.position_embeddings.shape

torch.Size([1, 302, 768])

In [41]:
# Forward pass in evaluation mode, without tracking gradients.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)

# Index of the highest logit for the single example, mapped to its label name.
predicted_class_ids = logits.argmax(dim=-1).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

torch.Size([1, 5])


'LABEL_2'

In [86]:
def label_transforms(batch):
    """Turn one streamed example into model inputs plus its label.

    Args:
        batch: A single example mapping with a decoded "path" audio entry
            (providing "array") and a "label" value.

    Returns:
        dict with "input_values" (first item of the feature extractor's
        "input_values" output) and "labels" (the example's label, unchanged).
    """
    waveform = batch["path"]["array"]
    extracted = feature_extractor(
        waveform,
        sampling_rate=feature_extractor.sampling_rate,
    )
    return {
        "input_values": extracted["input_values"][0],
        "labels": batch["label"],
    }

In [87]:
# Replace the raw columns with the "input_values"/"labels" produced by
# label_transforms.
# NOTE(review): this rebinds `stream_dataset`, so re-running the cell would map
# the already-mapped stream a second time — confirm cells are run once, in order.
stream_dataset = stream_dataset.map(label_transforms, remove_columns=columns)
# Peek at the features of the first transformed example.
next(iter(stream_dataset))["input_values"]

array (1024, 128)


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


array([[-0.8392106 , -1.1044025 , -0.72758013, ..., -0.43784383,
        -0.5127181 , -0.5110664 ],
       [-1.0823809 , -1.2709085 , -0.89408606, ..., -0.71892273,
        -0.730073  , -0.78606874],
       [-1.1481965 , -1.2775939 , -0.98967904, ..., -0.795274  ,
        -0.8601598 , -0.87460697],
       ...,
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237],
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237],
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237]], dtype=float32)

In [88]:
import evaluate
import numpy as np

# Accuracy metric object; compute_metrics below calls its `compute` method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (reference class ids).

    Returns:
        The result of `metric.compute` for the arg-max predictions against the
        reference labels.
    """
    # The highest-scoring class along axis 1 is the predicted class id.
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [89]:
# Short CPU-only run: 10 optimisation steps with batches of 2 for both
# training and evaluation; outputs go to the "audio_spectrogram" directory.
training_args = TrainingArguments(
    output_dir="audio_spectrogram",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    max_steps=10,
    no_cuda=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [92]:
from transformers import integrations
from transformers import DefaultFlowCallback

# Initialize our trainer.
# DefaultFlowCallback is deliberately not listed in `callbacks`: Trainer already
# installs it as one of its default callbacks, so passing it again only
# registers a duplicate (and triggers a warning).
# NOTE(review): with the default `report_to`, Trainer may also add its own
# TensorBoardCallback when tensorboard is installed — confirm whether the
# explicit one below is still wanted.
# NOTE(review): train and eval read the same stream, so the reported eval
# metrics are measured on the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=stream_dataset.with_format("torch"),
    eval_dataset=stream_dataset.with_format("torch"),
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    callbacks=[integrations.TensorBoardCallback],
)

In [93]:
# Run the training loop and keep the returned summary object.
train_result = trainer.train()
# Print the summary after training finishes.
print("train_result", train_result)

***** Running training *****
  Num examples = 20
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 86192645
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


array (1024, 128)
array (1024, 128)


Step,Training Loss


array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)




Training completed. Do not forget to share your model on huggingface.co/models =)




train_result TrainOutput(global_step=10, training_loss=2.116594696044922, metrics={'train_runtime': 95.2673, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.105, 'total_flos': 1287908429660160.0, 'train_loss': 2.116594696044922, 'epoch': 1.4})


In [94]:
# Evaluate on the trainer's eval_dataset and display the resulting metrics.
metrics = trainer.evaluate()
metrics

***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 2
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
array (1024, 128)
eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7f3d29c32940>


{'eval_loss': 0.9849345684051514,
 'eval_accuracy': 0.7272727272727273,
 'eval_runtime': 17.7526,
 'eval_samples_per_second': 0.62,
 'eval_steps_per_second': 0.338,
 'epoch': 1.4}

In [56]:
from transformers import AutoConfig, AutoModel, Wav2Vec2Processor, WavLMForSequenceClassification
import datasets
import torch
import numpy as np

In [57]:
# Alternative checkpoint kept for reference:
# model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
# The processor and the model weights come from the same checkpoint.
model_name = "patrickvonplaten/wavlm-libri-clean-100h-base-plus"
processor_name = model_name

processor = Wav2Vec2Processor.from_pretrained(processor_name)
# Load the checkpoint into a sequence-classification model with 5 output labels.
model = WavLMForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

Some weights of the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus were not used when initializing WavLMForSequenceClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus and are newly initialized: ['classifier.bias', 'projector.bias', 'classifier.weight', 'projector.weight']
You should probably TRAIN this model on a down-stream task to b

In [58]:
# Confirm the number of labels recorded in the loaded model's config.
model.config.num_labels

5

In [59]:
# Sampling rate configured on the processor's feature extractor; reused below
# when casting the audio column and when encoding examples.
processor.feature_extractor.sampling_rate

16000

In [60]:
# Schema of the CSV manifest: a file path string and an integer label per row.
features = datasets.Features(
    {
        "path": datasets.Value("string"),
        "label": datasets.Value("int64"),
    }
)

# Stream the CSV rows, then reinterpret the "path" column as audio using the
# sampling rate of the processor's feature extractor.
stream_dataset = datasets.load_dataset(
    "csv",
    data_files="test10.csv",
    split="train",
    streaming=True,
    features=features,
).cast_column(
    "path",
    datasets.features.Audio(sampling_rate=processor.feature_extractor.sampling_rate),
)

Using custom data configuration default-2e6732257563a6cc


In [61]:
# Display the first raw example (decoded audio entry plus its label).
# NOTE(review): the rendered output includes the file's absolute path — consider
# clearing it before sharing the notebook.
next(iter(stream_dataset))

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


{'path': {'path': '/data/tone_speech_cutwav/kaldi_cutwavs_new_2020/test/cutwav_word_dataset002_cutok_tones/1/ds002__line-03887__ind-015__len-0.2__gop-097__rr-100__pin-dan__tone-1----dān.wav',
  'array': array([ 0.00726318,  0.00769043,  0.00289917, ..., -0.05279541,
         -0.07974243, -0.08511353], dtype=float32),
  'sampling_rate': 16000},
 'label': 0}

In [62]:
def train_transforms(batch, max_seconds=1.5):
    """Encode one streamed example into fixed-length model inputs.

    Args:
        batch: A single example mapping with a decoded "path" audio entry
            (providing "array").
        max_seconds: Length of the padded/truncated window in seconds. The
            default of 1.5 reproduces the previous fixed window.

    Returns:
        dict with "input_values" and "attention_mask", each the first item of
        the corresponding processor output.
    """
    # Derive the window length from the processor's own sampling rate instead
    # of a hard-coded 16000, so the two can never drift apart.
    sampling_rate = processor.feature_extractor.sampling_rate
    encoded_features = processor(
        batch["path"]["array"],
        sampling_rate=sampling_rate,
        max_length=int(sampling_rate * max_seconds),
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    return {
        "input_values": encoded_features["input_values"][0],
        "attention_mask": encoded_features["attention_mask"][0],
    }

In [63]:
# Apply train_transforms to every example and drop the raw columns.
# NOTE(review): "label" is removed and train_transforms does not return it, so
# the mapped stream carries no targets — confirm this is intended.
stream_dataset = stream_dataset.map(train_transforms, remove_columns=["path", "label"])

In [64]:
# Inspect the attention mask of the first transformed example.
next(iter(stream_dataset))["attention_mask"]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [65]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Stack per-example arrays into batched tensors.

    Args:
        batch: Sequence of examples, each providing "input_values" and
            "attention_mask" entries that can be stacked across examples.

    Returns:
        dict with "input_values" as a float32 tensor and "attention_mask" as
        an int64 tensor, each with a new leading batch dimension.
    """
    stacked_values = np.stack([example["input_values"] for example in batch])
    stacked_mask = np.stack([example["attention_mask"] for example in batch])
    return {
        "input_values": torch.from_numpy(stacked_values).float(),
        "attention_mask": torch.from_numpy(stacked_mask).long(),
    }

In [66]:
# Batch the mapped stream two examples at a time using the custom collate_fn.
train_dataloader = DataLoader(
    stream_dataset.with_format("torch"),
    batch_size=2,
    collate_fn=collate_fn,
)

In [67]:
# Grab the first collated batch for quick forward-pass experiments.
test_batch = next(iter(train_dataloader))

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


In [68]:
# Shapes of the batched inputs and of their attention masks.
(
    test_batch["input_values"].shape,
    test_batch["attention_mask"].shape,
)

(torch.Size([2, 24000]), torch.Size([2, 24000]))

In [70]:
# List the fields present in the model's output for this batch.
model(**test_batch).keys()

odict_keys(['logits'])

In [31]:
# The classification model's output only exposes "logits", so the encoder's
# hidden states are read from the wrapped base model (`model.wavlm`) instead of
# from the classifier output, which has no `last_hidden_state` attribute.
model.wavlm(**test_batch).last_hidden_state.shape

torch.Size([2, 74, 768])

In [40]:
# Run the batch through the model and show the shape of its first output field
# next to two config values.
# NOTE(review): the recorded output (a 3-D first output and num_labels == 2)
# does not match the 5-label classification model loaded earlier in this
# notebook — it looks stale; re-run on a fresh kernel to confirm.
outputs = model(**test_batch)
outputs[0].shape, model.config.classifier_proj_size, model.config.num_labels

(torch.Size([2, 74, 768]), 256, 2)