# Inferencing Demo

This notebook shows how to use our models for inference. The inference functions defined here will be used in a web app.

### Setup

In [1]:
!pip install -q torch
!pip install -q datasets
!pip install -q transformers
!pip install -q huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m806.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.2/124.2 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import numpy as np
import datasets
from datasets import load_dataset, DatasetDict,  Audio
from huggingface_hub import PyTorchModelHubMixin

## Push Model to Hub

This section goes over the code to push a model to the hub. NOTE: you _should not_ need to run this code. This section is included for replicability and transparency on how we uploaded the model.

In [36]:
from getpass import getpass

In [39]:
# Hugging Face access token with write access.
# Read interactively with getpass() so the token is never written into the
# notebook source; it is later passed to push_to_hub(token=...).
access_token = getpass()

··········


In [56]:
# Define model class

class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
    """Whisper backbone followed by an MLP classification head.

    ``config`` is a dict read for two keys: ``"encoder"`` (checkpoint name
    handed to ``WhisperModel.from_pretrained``) and ``"num_labels"`` (output
    width of the last linear layer).
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = WhisperModel.from_pretrained(config["encoder"])

        # Head layout: hidden_size -> 4096 -> 2048 -> 1024 -> 512, a ReLU after
        # each of those layers, then a final projection to num_labels.
        # The layer order (and therefore the Sequential indices used in the
        # state-dict keys) is Linear, ReLU, ..., Linear.
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], config["num_labels"]))
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_features, decoder_input_ids):
        """Return classifier logits computed from position 0 of ``last_hidden_state``."""
        whisper_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        hidden_states = whisper_out['last_hidden_state']
        first_position = hidden_states[:, 0, :]
        return self.classifier(first_position)

In [3]:
# Checkpoint that supplies both the feature extractor and the Whisper model.
model_checkpoint = "openai/whisper-base"

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
# SpeechInferenceDataset reads encoder.config.decoder_start_token_id from this object.
encoder = WhisperModel.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

In [57]:
# Load the fine-tuned parameters from the local .pt file onto the CPU.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs and avoids executing arbitrary pickled code.
state_dict = torch.load('./whisper_best_model-tune.pt',
                        map_location=torch.device('cpu'),
                        weights_only=True)

num_labels = 2

# Build the model; num_labels is reused in the config so the two values
# cannot drift apart.
config = {"num_labels": num_labels, "encoder": "openai/whisper-base"}
model = SpeechClassifier(config).to(device)

# Last expression of the cell, so the key-matching report is displayed.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [58]:
# Config stored alongside the weights so that from_pretrained can rebuild the model.
config = {"num_labels": 2, "encoder": "openai/whisper-base"}

# Device handle only; this cell does not move the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Save locally, passing the same config that is pushed to the Hub so the
# local copy and the Hub copy are saved with identical arguments.
model.save_pretrained("whisper-hypernasality", config=config)

# Push to the Hub.
model.push_to_hub("vanderbilt-dsi/whisper-hypernasality", token=access_token, config=config)

# To reload later:
# model = SpeechClassifier.from_pretrained("vanderbilt-dsi/whisper-hypernasality")

CommitInfo(commit_url='https://huggingface.co/vanderbilt-dsi/whisper-hypernasality/commit/bb2dcd82579a633d59c98a7df63b2c5e302fecb3', commit_message='Push model using huggingface_hub.', commit_description='', oid='bb2dcd82579a633d59c98a7df63b2c5e302fecb3', pr_url=None, pr_revision=None, pr_num=None)

## Load Model from Hub

This section shows how you can load the model from the hub. Complete functions are included at the bottom of the notebook, in the next section.

In [None]:
# Define model class

class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
    """Whisper backbone followed by an MLP classification head.

    ``config`` is a dict read for two keys: ``"encoder"`` (checkpoint name
    handed to ``WhisperModel.from_pretrained``) and ``"num_labels"`` (output
    width of the last linear layer).
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = WhisperModel.from_pretrained(config["encoder"])

        # Head layout: hidden_size -> 4096 -> 2048 -> 1024 -> 512, a ReLU after
        # each of those layers, then a final projection to num_labels.
        # The layer order (and therefore the Sequential indices used in the
        # state-dict keys) is Linear, ReLU, ..., Linear.
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], config["num_labels"]))
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_features, decoder_input_ids):
        """Return classifier logits computed from position 0 of ``last_hidden_state``."""
        whisper_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        hidden_states = whisper_out['last_hidden_state']
        first_position = hidden_states[:, 0, :]
        return self.classifier(first_position)

In [None]:
# define data class
class SpeechInferenceDataset(torch.utils.data.Dataset):
    """Expose audio records as ``(input_features, decoder_input_ids)`` pairs.

    Args:
        audio_data: Indexable collection whose items provide
            ``item["audio"]["array"]`` and ``item["audio"]["sampling_rate"]``.
        text_processor: Callable feature extractor. It is called with the audio
            array plus ``return_tensors="pt"`` and ``sampling_rate=...`` and
            must return an object with an ``input_features`` attribute.
        decoder_start_token_id: Token id used to fill the two-token decoder
            input. When omitted it is read from the configuration of
            ``model_checkpoint``, so the dataset no longer depends on a
            notebook-level ``encoder`` variable.
        model_checkpoint: Whisper checkpoint whose configuration supplies the
            default ``decoder_start_token_id``.
    """

    def __init__(self, audio_data, text_processor,
                 decoder_start_token_id=None, model_checkpoint="openai/whisper-base"):
        self.audio_data = audio_data
        self.text_processor = text_processor

        if decoder_start_token_id is None:
            # Only the configuration is needed for this one integer, not the
            # full model weights.
            from transformers import WhisperConfig
            whisper_config = WhisperConfig.from_pretrained(model_checkpoint)
            decoder_start_token_id = whisper_config.decoder_start_token_id
        self.decoder_start_token_id = decoder_start_token_id

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        # Look the record up once; with a Hugging Face dataset each row access
        # can decode the audio file again.
        audio = self.audio_data[index]["audio"]

        inputs = self.text_processor(audio["array"],
                                     return_tensors="pt",
                                     sampling_rate=audio["sampling_rate"])
        input_features = inputs.input_features

        # Shape (1, 2): the start token repeated twice.
        decoder_input_ids = torch.tensor([[1, 1]]) * self.decoder_start_token_id

        return input_features, decoder_input_ids

In [None]:
# EXAMPLE
# for the app, use gradio to upload/record an audio file
# Kept as a one-element list: it is used as the "audio" column passed to
# datasets.Dataset.from_dict.
audio_file_path = ["./ACPA 2 more in grammar school, and out little 3 year old.wav"]

In [None]:
# Checkpoint that supplies both the feature extractor and the Whisper model.
model_checkpoint = "openai/whisper-base"

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
# SpeechInferenceDataset reads encoder.config.decoder_start_token_id from this object.
encoder = WhisperModel.from_pretrained(model_checkpoint)

# One-column dataset of file paths, cast to an Audio feature with sampling_rate=16_000.
inference_data = (
    datasets.Dataset
    .from_dict({"audio": audio_file_path})
    .cast_column("audio", Audio(sampling_rate=16_000))
)
inference_dataset = SpeechInferenceDataset(
    audio_data=inference_data,
    text_processor=feature_extractor,
)
inference_loader = DataLoader(dataset=inference_dataset, shuffle=False, batch_size=1)

In [59]:
# Reload the model from the Hub and place it on `device`, the same device the
# prediction cell moves its input tensors to.
model = SpeechClassifier.from_pretrained("vanderbilt-dsi/whisper-hypernasality").to(device)
# Put the model in evaluation mode before running predictions.
model.eval()

In [66]:
# run predictions. 1 is hypernasality, 0 is no hypernasality
# Inputs are sent to whichever device the model's weights are on, so the
# forward pass works whether the model sits on the CPU or the GPU.
model_device = next(model.parameters()).device

with torch.no_grad():
    input_features, decoder_input_ids = next(iter(inference_loader))
    # squeeze(1) removes the size-1 dimension at position 1 (for
    # decoder_input_ids this comes from the [[1, 1]] literal in the dataset)
    # that remains after the DataLoader adds its batch dimension.
    input_features = input_features.squeeze(1).to(model_device)
    decoder_input_ids = decoder_input_ids.squeeze(1).to(model_device)
    logits = model(input_features, decoder_input_ids)
    predicted_ids = torch.argmax(logits, dim=-1)

predicted_ids


tensor([1])

## Wrap it all in a function

For the user interface, include the classes and functions below, together with the libraries imported at the top of the notebook (repeated here for ease of use).

In [None]:
# install libraries:
# torch
# datasets
# transformers
# huggingface_hub

In [None]:
# libraries
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import numpy as np
import datasets
from datasets import load_dataset, DatasetDict,  Audio
from huggingface_hub import PyTorchModelHubMixin

In [None]:
# define data class
class SpeechInferenceDataset(torch.utils.data.Dataset):
    """Expose audio records as ``(input_features, decoder_input_ids)`` pairs.

    Args:
        audio_data: Indexable collection whose items provide
            ``item["audio"]["array"]`` and ``item["audio"]["sampling_rate"]``.
        text_processor: Callable feature extractor. It is called with the audio
            array plus ``return_tensors="pt"`` and ``sampling_rate=...`` and
            must return an object with an ``input_features`` attribute.
        decoder_start_token_id: Token id used to fill the two-token decoder
            input. When omitted it is read from the configuration of
            ``model_checkpoint``, so the dataset no longer depends on a
            notebook-level ``encoder`` variable.
        model_checkpoint: Whisper checkpoint whose configuration supplies the
            default ``decoder_start_token_id``.
    """

    def __init__(self, audio_data, text_processor,
                 decoder_start_token_id=None, model_checkpoint="openai/whisper-base"):
        self.audio_data = audio_data
        self.text_processor = text_processor

        if decoder_start_token_id is None:
            # Only the configuration is needed for this one integer, not the
            # full model weights.
            from transformers import WhisperConfig
            whisper_config = WhisperConfig.from_pretrained(model_checkpoint)
            decoder_start_token_id = whisper_config.decoder_start_token_id
        self.decoder_start_token_id = decoder_start_token_id

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        # Look the record up once; with a Hugging Face dataset each row access
        # can decode the audio file again.
        audio = self.audio_data[index]["audio"]

        inputs = self.text_processor(audio["array"],
                                     return_tensors="pt",
                                     sampling_rate=audio["sampling_rate"])
        input_features = inputs.input_features

        # Shape (1, 2): the start token repeated twice.
        decoder_input_ids = torch.tensor([[1, 1]]) * self.decoder_start_token_id

        return input_features, decoder_input_ids

In [None]:
# Define model class

class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
    """Whisper backbone followed by an MLP classification head.

    ``config`` is a dict read for two keys: ``"encoder"`` (checkpoint name
    handed to ``WhisperModel.from_pretrained``) and ``"num_labels"`` (output
    width of the last linear layer).
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = WhisperModel.from_pretrained(config["encoder"])

        # Head layout: hidden_size -> 4096 -> 2048 -> 1024 -> 512, a ReLU after
        # each of those layers, then a final projection to num_labels.
        # The layer order (and therefore the Sequential indices used in the
        # state-dict keys) is Linear, ReLU, ..., Linear.
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], config["num_labels"]))
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_features, decoder_input_ids):
        """Return classifier logits computed from position 0 of ``last_hidden_state``."""
        whisper_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        hidden_states = whisper_out['last_hidden_state']
        first_position = hidden_states[:, 0, :]
        return self.classifier(first_position)

In [67]:
# NOTE: audio_file_path is normally a list with a string of the path to the audio file;
#       a bare path string is also accepted and is wrapped into a one-element list.
# NOTE: ENSURE GRADIO AUDIO "TYPE" ARGUMENT IS "FILEPATH" : https://www.gradio.app/docs/audio
def prepare_data(audio_file_path, model_checkpoint = "openai/whisper-base", device = None):
    """Turn an audio file into the tensors expected by ``SpeechClassifier.forward``.

    Args:
        audio_file_path: List containing the path of the audio file, or the
            path itself as a string. Only the first entry is used, because a
            single batch of size 1 is drawn from the loader.
        model_checkpoint: Checkpoint from which the ``WhisperFeatureExtractor``
            is loaded.
        device: ``torch.device`` to place the tensors on. Defaults to CUDA when
            available and CPU otherwise, so the function does not rely on a
            notebook-level ``device`` variable.

    Returns:
        Tuple ``(input_features, decoder_input_ids)`` located on ``device``.

    Raises:
        ValueError: If ``audio_file_path`` is empty or ``None``.
    """
    if not audio_file_path:
        raise ValueError("audio_file_path must contain the path to an audio file")
    if isinstance(audio_file_path, str):
        audio_file_path = [audio_file_path]
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
    inference_data = datasets.Dataset.from_dict({"audio": audio_file_path}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))
    inference_dataset = SpeechInferenceDataset(inference_data, feature_extractor)
    inference_loader = DataLoader(inference_dataset, batch_size=1, shuffle=False)

    # A single batch of size 1: the first (normally only) file.
    input_features, decoder_input_ids = next(iter(inference_loader))

    # squeeze(1) removes the size-1 dimension left at position 1 once the
    # DataLoader has added its batch dimension.
    input_features = input_features.squeeze(1).to(device)
    decoder_input_ids = decoder_input_ids.squeeze(1).to(device)

    return input_features, decoder_input_ids


def predict(audio_file_path, config = None, model = None):
    """Classify one audio file with the hypernasality model.

    Args:
        audio_file_path: Passed unchanged to ``prepare_data``.
        config: Unused; kept so existing calls that pass it keep working. The
            default is ``None`` rather than the notebook-level ``config``, so
            defining this function no longer requires that variable to exist.
        model: Optional already-loaded ``SpeechClassifier``. When omitted, the
            model is loaded from "vanderbilt-dsi/whisper-hypernasality" on
            every call; pass a preloaded model to avoid repeated loading.
            A passed model is moved in place to the device of the inputs.

    Returns:
        int: Index of the largest logit (1 is hypernasality, 0 is no
        hypernasality).
    """
    input_features, decoder_input_ids = prepare_data(audio_file_path)

    if model is None:
        model = SpeechClassifier.from_pretrained("vanderbilt-dsi/whisper-hypernasality")

    # Keep the weights on the same device as the inputs; otherwise the forward
    # pass fails when prepare_data placed the tensors on the GPU.
    model = model.to(input_features.device)
    model.eval()

    with torch.no_grad():
        logits = model(input_features, decoder_input_ids)
        predicted_ids = int(torch.argmax(logits, dim=-1))

    return predicted_ids

In [69]:
# EXAMPLE
# for the app, use gradio to upload/record an audio file
# Kept as a one-element list: predict() forwards it to prepare_data(), which
# uses it as the "audio" column of datasets.Dataset.from_dict.
audio_file_path = ["./ACPA 2 more in grammar school, and out little 3 year old.wav"]

In [70]:
# To run inference, call predict with the list holding the audio file path.
# It returns an int: 1 is hypernasality, 0 is no hypernasality.
predict(audio_file_path)

1