In [None]:
!pip install torch transformers soundfile librosa gensim
!sudo apt install ffmpeg -y


In [None]:
# Install dependencies (run once)
# pip install torch torchaudio transformers

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Step 1: Load the pre-trained Wav2Vec2 tokenizer and CTC model.
# Both objects are created from the same checkpoint name.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

# Step 2: Load audio file
def load_audio(file_path):
    """Read the audio file at ``file_path`` using ``torchaudio.load``.

    Args:
        file_path: Location of the audio file, forwarded unchanged to
            ``torchaudio.load``.

    Returns:
        The ``(waveform, sample_rate)`` pair produced by ``torchaudio.load``.
    """
    audio, rate = torchaudio.load(file_path)
    loaded = (audio, rate)
    return loaded

# Step 3: Resample audio if needed
def preprocess_audio(waveform, sample_rate, target_sample_rate=16000):
    """Resample ``waveform`` to ``target_sample_rate`` when the rates differ.

    Args:
        waveform: Audio tensor in a layout accepted by
            ``torchaudio.transforms.Resample``.
        sample_rate: Sampling rate of ``waveform``.
        target_sample_rate: Desired sampling rate. Defaults to 16000, the
            value this function previously hard-coded, so existing two-argument
            calls behave exactly as before.

    Returns:
        The input tensor itself (no copy) when ``sample_rate`` already equals
        ``target_sample_rate``; otherwise the resampled tensor.
    """
    # Nothing to do when the audio is already at the requested rate.
    if sample_rate == target_sample_rate:
        return waveform

    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate,
        new_freq=target_sample_rate,
    )
    return resampler(waveform)

# Step 4: Convert speech → text
def speech_to_text(wav_file):
    """Transcribe an audio file to lower-case text with the Wav2Vec2 CTC model.

    The file is loaded with ``load_audio``, passed through
    ``preprocess_audio``, downmixed to a single channel if necessary, and then
    decoded greedily (arg-max over the model logits).

    Args:
        wav_file: Path of the audio file, forwarded to ``load_audio``.

    Returns:
        The decoded transcription, lower-cased.
    """
    waveform, sample_rate = load_audio(wav_file)
    waveform = preprocess_audio(waveform, sample_rate)

    # squeeze() below only removes size-1 dimensions, so multi-channel audio
    # would stay 2-D instead of becoming one 1-D utterance, and only
    # predicted_ids[0] is decoded. Average the channels into one first.
    # NOTE(review): assumes a channels-first (channels, time) layout, which is
    # the torchaudio.load default — confirm if load_audio ever changes.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    inputs = tokenizer(waveform.squeeze().numpy(), return_tensors="pt", padding=True)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy decoding: most likely token id per frame; the tokenizer's
    # decode() turns the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    return transcription.lower()

# Step 5: Run on your file
# Runs as soon as this cell/module executes: transcribes the file named in
# `wav_file` and prints the result.
wav_file = "1.wav"  # Replace with your own .wav file
text = speech_to_text(wav_file)  # lower-cased transcription returned by speech_to_text
print(" Transcription:", text)
