In [2]:
# Cell 1: Install required libraries
!pip install transformers torchaudio jiwer datasets torch scikit-learn word2number



In [3]:
# Cell 2: Import necessary libraries
import os
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, BertModel, BertTokenizer
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import re
from tqdm.notebook import tqdm
from google.colab import drive
import tarfile
from IPython.display import Audio
from word2number import w2n

In [4]:
# Cell 3: Mount Google Drive and extract the Indian accent dataset
drive.mount('/content/drive')
tar_path = '/content/drive/My Drive/VIKRAM-DRDO/nptel-pure-set.tar.gz'
extraction_path = '/content/nptel-pure-set'
if not os.path.exists(extraction_path):
    os.makedirs(extraction_path)
with tarfile.open(tar_path, 'r:gz') as tar:
    tar.extractall(path=extraction_path)
print('Extraction completed.')


Mounted at /content/drive
Extraction completed.


In [5]:
# Cell 4: Define the ASR (Automatic Speech Recognition) module
class ASRModule:
    def __init__(self, model_name="facebook/wav2vec2-large-960h-lv60-self"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(self.device)

    def transcribe(self, waveform, sample_rate):
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=0)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.input_values.to(self.device)
        with torch.no_grad():
            logits = self.model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.decode(predicted_ids[0])
        return transcription

asr_module = ASRModule()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

In [6]:
# Cell 5: Define the Intent Classifier model
class IntentClassifier(nn.Module):
    def __init__(self, num_intents):
        super(IntentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_intents)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


In [7]:
# Cell 6: Define the Data Processor
class DataProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.le = LabelEncoder()

    def load_clinc150(self):
        dataset = load_dataset("clinc_oos", "plus")
        train_data = dataset['train']
        test_data = dataset['test']

        X_train = train_data['text']
        y_train = self.le.fit_transform(train_data['intent'])
        X_test = test_data['text']
        y_test = self.le.transform(test_data['intent'])

        return X_train, y_train, X_test, y_test, self.le.classes_

    def load_indian_accent_data(self, base_path):
        dataset = []
        wav_directory = os.path.join(base_path, 'nptel-pure', 'wav')
        txt_directory = os.path.join(base_path, 'nptel-pure', 'corrected_txt')
        for filename in os.listdir(wav_directory):
            if filename.lower().endswith('.wav'):
                audio_path = os.path.join(wav_directory, filename)
                transcript_path = os.path.join(txt_directory, filename.replace('.wav', '.txt'))
                try:
                    waveform, sample_rate = torchaudio.load(audio_path)
                    with open(transcript_path, 'r') as file:
                        transcript = file.read().strip()
                    if transcript:
                        dataset.append((waveform, sample_rate, transcript))
                except Exception as e:
                    print(f"Failed to load or process file {filename}: {e}")
        return dataset

    def prepare_intent_data(self, texts, labels):
        encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
        dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
        return DataLoader(dataset, batch_size=16, shuffle=True)

data_processor = DataProcessor()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Cell 7: Define utility functions
def extract_distances(instruction):
    pattern = r'(\w+|\d+)\s*(meter|metre|m|feet|ft)'
    matches = re.findall(pattern, instruction, re.IGNORECASE)
    distances = []
    for match in matches:
        try:
            number = w2n.word_to_num(match[0])
        except ValueError:
            try:
                number = float(match[0])
            except ValueError:
                continue
        distances.append(number)
    return distances


In [9]:
# Cell 8: Load and prepare CLINC150 data for intent classification
X_train, y_train, X_test, y_test, intents = data_processor.load_clinc150()
train_loader = data_processor.prepare_intent_data(X_train, y_train)
test_loader = data_processor.prepare_intent_data(X_test, y_test)

print(f"Number of intents: {len(intents)}")
print(f"Number of training samples: {len(X_train)}")
print(f"Number of test samples: {len(X_test)}")

Downloading readme:   0%|          | 0.00/24.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/136k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5500 [00:00<?, ? examples/s]

Number of intents: 151
Number of training samples: 15250
Number of test samples: 5500


In [10]:
# Cell 9: Initialize and train intent classifier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
intent_classifier = IntentClassifier(len(intents)).to(device)
optimizer = torch.optim.AdamW(intent_classifier.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    intent_classifier.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = intent_classifier(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

print("Training completed")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/5:   0%|          | 0/954 [00:00<?, ?it/s]

Epoch 1/5, Average Loss: 3.4743


Epoch 2/5:   0%|          | 0/954 [00:00<?, ?it/s]

Epoch 2/5, Average Loss: 1.0009


Epoch 3/5:   0%|          | 0/954 [00:00<?, ?it/s]

Epoch 3/5, Average Loss: 0.2741


Epoch 4/5:   0%|          | 0/954 [00:00<?, ?it/s]

Epoch 4/5, Average Loss: 0.1111


Epoch 5/5:   0%|          | 0/954 [00:00<?, ?it/s]

Epoch 5/5, Average Loss: 0.0577
Training completed


In [11]:
# Cell 10: Evaluate the intent classifier
intent_classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = intent_classifier(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.4f}%")

Evaluating:   0%|          | 0/344 [00:00<?, ?it/s]

Test Accuracy: 88.3636%


In [15]:
# Cell 11: Load Indian accent data
indian_accent_data = data_processor.load_indian_accent_data(extraction_path)
print(f"Loaded {len(indian_accent_data)} Indian accent audio samples")

Loaded 998 Indian accent audio samples


In [16]:
# Cell 12: Function to process audio and classify intent
def process_instruction(audio_waveform, sample_rate):
    transcription = asr_module.transcribe(audio_waveform, sample_rate)
    encoding = data_processor.tokenizer(transcription, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = intent_classifier(input_ids, attention_mask)

    _, pred = torch.max(outputs, dim=1)
    intent = intents[pred.item()]
    distances = extract_distances(transcription)

    return f"Transcription: {transcription}\nIntent: {intent}\nDistances: {distances}"


In [17]:
# Cell 13: Test with Indian accent data
for waveform, sample_rate, _ in indian_accent_data[:5]:
    result = process_instruction(waveform, sample_rate)
    print(result)
    print('-' * 50)

Transcription: THESE COUNTRIES AR THEY DON'T TEND TO PRODUCE THE WHOLE SET OF VARIETIES IN EUROPE YOU
Intent: 42
Distances: []
--------------------------------------------------
Transcription: GETRESULT TYPE WHICH IS A COMPILER FUNCTION WHICH GOES INTO THE SYMBOL TABLE AND
Intent: 122
Distances: []
--------------------------------------------------
Transcription: THE ORANGE SHADE INDICATES A ALUMINA WHEREAS THE LIGHT YELLOW
Intent: 73
Distances: []
--------------------------------------------------
Transcription: I AM ASSUMING THAT IT IS AREBATIC FLOW IF IT IS NON AREBATIC I CAN HAVE A SITUATION
Intent: 42
Distances: []
--------------------------------------------------
Transcription: ON REFRESHING YOUR CATLES STOCK REPLENISHING THE CATLEY STOCK SO THAT'S ALL THAT
Intent: 42
Distances: []
--------------------------------------------------


In [20]:
!pip install scipy numpy



In [22]:
# Cell 14: Main processing loop for user-uploaded audio
from google.colab import files
import scipy.io.wavfile as wavfile
import numpy as np

print("Audio File Processing")
print("Upload a WAV file with your spoken instruction.")

while True:
    uploaded = files.upload()
    if not uploaded:
        print("No file was uploaded.")
        break

    file_name = list(uploaded.keys())[0]
    sample_rate, audio = wavfile.read(file_name)
    audio = audio.astype(np.float32) / 32768.0

    print(f"Audio shape: {audio.shape}, Sample rate: {sample_rate}")
    display(Audio(audio, rate=sample_rate))

    result = process_instruction(torch.from_numpy(audio), sample_rate)
    print("\nResult:")
    print(result)
    print('-' * 50)

    choice = input("Do you want to process another audio file? (yes/no): ")
    if choice.lower() != 'yes':
        break

print("Processing completed.")

Audio File Processing
Upload a WAV file with your spoken instruction.


Saving recorded_audio_1.wav to recorded_audio_1.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: MOVE FORWARD BY FIVE METERS
Intent: 138
Distances: [5]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_2.wav to recorded_audio_2.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: MOVE FORWARD FIVE METERS
Intent: 138
Distances: [5]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_3.wav to recorded_audio_3 (1).wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: GO BACKWAR TWO METERS AND THEN MOVE UP ONE METER
Intent: 138
Distances: [2, 1]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_4.wav to recorded_audio_4 (2).wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE KITCHEN IS FIVE METERS TO THE LEFT OF THE LIVING ROOM
Intent: 10
Distances: [5]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_5.wav to recorded_audio_5.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THERE IS AN OBSTACLE TWO FEET AHEAD OF YOU
Intent: 17
Distances: [2]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_6.wav to recorded_audio_6.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE TARGET OBJECT IS LOCATED ON THE TOP SHELF ABOUT SIX FEET HIGH
Intent: 42
Distances: [6]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_7.wav to recorded_audio_7.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: YOU NEED TO NAVIGATE THROUGH THE NARROW CORRIDOR APPROXIMATELY ONE WINTER WIDE
Intent: 10
Distances: []
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_8.wav to recorded_audio_8.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE ROOM DIM DRATER INCREASES BY TWO DEGREES EVERY THREE METERS MO TOWARDS THE WINDOW
Intent: 10
Distances: [3]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_9.wav to recorded_audio_9.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THERE IS A GRADUAL INCREASE IN FIFTEEN DEGREES OVER THE NEXT TENMETERES
Intent: 138
Distances: [10]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_12.wav to recorded_audio_12.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE LIGHT INDEN CITYDIGASES BY FIFTEEN PER CENT EVERY TWO METERS TWO MORE AWAY FROM THE LAMP
Intent: 138
Distances: [2, 2]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_13.wav to recorded_audio_13.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE LIGHT INTENSITY DEGREES IS BY FIFTY METERS EVERY TWO METERS IMO AWAY FROM THE RA LAMP
Intent: 31
Distances: [50, 2]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_14.wav to recorded_audio_14.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE WIF SIGNAL STRENGTH IMPROSED SIGNIFICANTLY AFTER MOVING THREE METERS TO THE RIGHT
Intent: 150
Distances: [3]
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_15.wav to recorded_audio_15.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE AIR QUALITY SHOWS TWENTY PER CENT MOVEMENT OF EVERY METER YOU ASCENDER
Intent: 142
Distances: []
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_16.wav to recorded_audio_16.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE MAGNETIC FIES RENT INCREASES AS YOU APPROACH THE NORTH WOLE
Intent: 24
Distances: []
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_17.wav to recorded_audio_17.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE MAGNETIC FIELD TRENGTHING RESE ASNE ACROSH THE NORTH WALL PICKING AT ABOUT SEVENINTES
Intent: 42
Distances: []
--------------------------------------------------
Do you want to process another audio file? (yes/no): yes


Saving recorded_audio_18.wav to recorded_audio_18.wav
Audio shape: (79872,), Sample rate: 16000



Result:
Transcription: THE UMIDIATE LEEL DROPS FIVE BUTS INTO EVERY METERMO TOWARDS THE UM
Intent: 31
Distances: []
--------------------------------------------------
Do you want to process another audio file? (yes/no): no
Processing completed.
