In [2]:
import sounddevice as sd
from scipy.io.wavfile import write

def record_audio(filename='user_input.wav', duration=3, fs=16000):
    """Record a mono clip from the default input device and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        fs: Sampling rate in Hz, used both for capture and for the WAV header.

    Returns:
        The ``filename`` that was written, so the call can be chained into
        the feature-extraction step.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    recording = sd.rec(frame_count, samplerate=fs, channels=1, dtype='float32')
    # Block until the capture buffer has been completely filled.
    sd.wait()
    write(filename, fs, recording)
    print(f"Recording finished and saved as {filename}")
    return filename



In [3]:
import librosa
import torch

def extract_features_from_audio(audio_path, processor, wav2vec_model, device):
    """Turn one audio file into a sequence of Wav2Vec2 hidden states.

    Args:
        audio_path: Path of the audio file to load (resampled to 16 kHz).
        processor: Callable that converts the raw waveform into model inputs
            exposing ``input_values``.
        wav2vec_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the model inputs are moved to before the forward pass.

    Returns:
        A tuple ``(features, length)`` where ``features`` is the hidden-state
        tensor moved to the CPU with its leading batch axis squeezed away, and
        ``length`` is the size of its first remaining axis (presumably the
        number of time frames -- confirm against the encoder in use).
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    encoded = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    model_input = encoded.input_values.to(device)
    with torch.no_grad():
        encoder_output = wav2vec_model(model_input)
        # A single utterance is processed, so axis 0 (batch) has size 1.
        features = encoder_output.last_hidden_state.squeeze(0)
    num_steps = features.shape[0]
    return features.cpu(), num_steps  # features, length


In [4]:
import pandas as pd

# Path of the training CSV. add_intent_column() below reads this file and
# writes it back in place with an extra 'intent' column.
train_csv = 'train_data.csv'


def add_intent_column(csv_path):
    """Add an 'intent' column to a CSV file and save it back in place.

    The intent is built by joining the 'action', 'object' and 'location'
    columns with underscores (missing values count as empty strings) and then
    removing any leading/trailing underscores from the result.

    Args:
        csv_path: Path of the CSV to read; the same file is overwritten.

    Returns:
        The DataFrame that was written, including the new 'intent' column.
    """
    frame = pd.read_csv(csv_path)
    joined = frame['action'].fillna('') + '_'
    joined = joined + frame['object'].fillna('') + '_'
    joined = joined + frame['location'].fillna('')
    # Only the outer underscores are trimmed; an empty middle slot still
    # leaves a double underscore inside the label.
    frame['intent'] = joined.str.strip('_')
    frame.to_csv(csv_path, index=False)
    return frame

# Persist the 'intent' column into the CSV on disk (side effect of the helper).
train_df = add_intent_column(train_csv)

# For training data
# NOTE(review): the frame is re-read and 'intent' is rebuilt here WITHOUT the
# strip('_') that add_intent_column applies, so labels with an empty slot can
# differ from the column saved in the CSV. Left as-is because the label set
# has to match whatever the saved checkpoint was trained with -- confirm.
train_df = pd.read_csv(train_csv)
train_df['intent'] = (
    train_df['action'].fillna('') + "_"
    + train_df['object'].fillna('') + "_"
    + train_df['location'].fillna('')
)

# Class indices follow first-appearance order of the labels in the CSV.
all_labels = train_df['intent'].unique().tolist()
label2idx = dict(zip(all_labels, range(len(all_labels))))
idx2label = dict(enumerate(all_labels))
num_classes = len(label2idx)

In [5]:
# Show how many distinct intent labels were derived from the training CSV.
num_classes

31

In [6]:
import torch.nn as nn

class CNN_GRU_Model(nn.Module):
    """Intent classifier: two 1-D conv layers, a bidirectional GRU, a linear head.

    Args:
        input_dim: Size of each input feature vector (channels fed to the
            first convolution).
        num_classes: Number of output logits.

    NOTE(review): the readout takes the GRU output at index ``length - 1``.
    For the backward direction that position holds the state after only the
    final frame has been consumed; the GRU's returned final hidden state
    (discarded in ``forward``) would cover the whole sequence in both
    directions. Do not change this without retraining -- a saved checkpoint
    presumably depends on this exact readout.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Positions inside the Sequential determine the state_dict keys:
        # 'cnn.0.*', 'cnn.2.*', 'cnn.3.*' and 'cnn.5.*'.
        conv_stack = [
            nn.Conv1d(input_dim, 256, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Conv1d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
        ]
        self.cnn = nn.Sequential(*conv_stack)
        self.gru = nn.GRU(128, 128, batch_first=True, bidirectional=True)
        # Forward and backward GRU outputs are concatenated, hence 2 * 128.
        self.classifier = nn.Linear(2 * 128, num_classes)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Tensor laid out as [batch, time, feat].
            lengths: Integer tensor with the valid length of each sequence.

        Returns:
            Logits tensor with one row per batch element.
        """
        # Conv1d expects channels before time: [batch, feat, time].
        conv_in = x.transpose(1, 2)
        conv_out = self.cnn(conv_in)
        # Back to [batch, time, feat] for the batch_first GRU.
        sequence = conv_out.transpose(1, 2)
        # pack_padded_sequence needs the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            sequence, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.gru(packed)
        padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # Pick, for every sequence, the output at its last valid time step.
        batch_rows = torch.arange(padded_out.size(0))
        final_steps = lengths - 1
        return self.classifier(padded_out[batch_rows, final_steps])


In [7]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)


Using device: cpu


In [8]:
# Reconstruct your model architecture
# The architecture must match the one the checkpoint was saved from;
# load_state_dict raises if parameter names or shapes disagree.
input_dim = 768  # Wav2Vec2 output dim (check your model)
# NOTE(review): label2idx is rebuilt from the CSV at run time; its index order
# is assumed to match the order used when the checkpoint was trained -- confirm.
num_classes = len(label2idx)  # You already have this

model = CNN_GRU_Model(input_dim, num_classes).to(device)
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while it is being loaded.
model.load_state_dict(
    torch.load('best_cnn_gru_model_2.pt', map_location=device, weights_only=True)
)
# Inference mode: BatchNorm layers use their stored running statistics.
model.eval()


CNN_GRU_Model(
  (cnn): Sequential(
    (0): Conv1d(768, 256, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): ReLU()
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (gru): GRU(128, 128, batch_first=True, bidirectional=True)
  (classifier): Linear(in_features=256, out_features=31, bias=True)
)

In [9]:
def predict_intent(audio_path, processor, wav2vec_model, model, device, idx2label):
    """Predict the intent label for a single audio file.

    Args:
        audio_path: Path of the audio file to classify.
        processor: Input processor passed through to feature extraction.
        wav2vec_model: Encoder passed through to feature extraction.
        model: Classifier called as ``model(features, lengths)`` that returns
            one row of logits per batch element.
        device: Device the classifier inputs are moved to.
        idx2label: Mapping from class index to label string.

    Returns:
        The label whose logit is largest.
    """
    frame_features, num_frames = extract_features_from_audio(
        audio_path, processor, wav2vec_model, device
    )
    # Wrap the single utterance as a batch of one: [1, time, feat].
    batch = frame_features.unsqueeze(0).to(device)
    seq_lengths = torch.tensor([num_frames]).to(device)
    with torch.no_grad():
        scores = model(batch, seq_lengths)
    best_class = scores.argmax(dim=1).item()
    return idx2label[best_class]

In [10]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# Load the pretrained processor and encoder from the same checkpoint name.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = wav2vec_model.to(device)
# Switch to inference mode; as the cell's last expression this also displays the model.
wav2vec_model.eval()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [28]:
# Step 1: capture three seconds of microphone audio into the default WAV file.
audio_path = record_audio(duration=3)

# Step 2: classify the recording and report the predicted label.
intent = predict_intent(
    audio_path=audio_path,
    processor=processor,
    wav2vec_model=wav2vec_model,
    model=model,
    device=device,
    idx2label=idx2label,
)
print("Predicted intent:", intent)


Recording...
Recording finished and saved as user_input.wav
Predicted intent: decrease_volume_none
