In [None]:
# Import necessary libraries
import speech_recognition as sr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Naive Bayes Classifier is chosen for its simplicity and effectiveness in text classification tasks.
# Other options to explore include traditional models (logistic regression, support vector machines, decision trees, or random forests), deep learning models (RNNs, CNNs, Transformers), or pre-trained models (Google Speech API, DeepSpeech, wav2vec 2.0), depending on the complexity of the task and the dataset size.

# Step 1: Load pre-recorded audio and convert it to text
def load_audio(file_path):
    """Transcribe the audio file at ``file_path`` with Google Speech Recognition.

    On success the transcription is printed and returned. If the recognizer
    raises ``sr.UnknownValueError`` or ``sr.RequestError``, a message is
    printed and ``None`` is returned. Any other exception (for example a
    missing file) is not caught here and propagates to the caller.
    """
    rec = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as audio_source:
            # Read the whole file, then send it to the Google recognizer.
            captured = rec.record(audio_source)
            transcript = rec.recognize_google(captured)
    except sr.UnknownValueError:
        print("Sorry, could not understand the audio.")
        return None
    except sr.RequestError:
        print("Request error from Google Speech Recognition service.")
        return None

    # Success path: report the transcription and hand it back.
    print(f"Audio transcription: {transcript}")
    return transcript

# Step 2: Prepare training data
# Toy dataset of (text, label) pairs: three single-word samples, each with a
# different label.
data = [
    ("so", "adverb"),
    ("we", "pronoun"),
    ("hour", "noun"),
]

# Unzip the pairs into two parallel tuples: the texts and their labels.
texts, labels = zip(*data)

# Step 3: Convert text data into features
# The vectorizer is fitted on ALL texts here, i.e. before the train/test split
# below. This same fitted `vectorizer` is reused later by classify_text().
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
y = labels

# Step 4: Train a simple machine learning model
# NOTE(review): every label occurs exactly once in `data`, so whichever sample
# is held out for testing carries a label that is absent from the training
# split. The accuracy reported in Step 5 is therefore presumably not a
# meaningful score -- more examples per label are needed. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 5: Test the model
# Predict on the held-out split and print the fraction of correct labels.
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Step 6: Use the model to classify the transcribed speech
def classify_text(text):
    """Print the label the trained model predicts for ``text``.

    Falsy input (``None`` or an empty string) is reported with
    "No text to classify." and no prediction is made. Relies on the
    module-level ``vectorizer`` and ``model``.
    """
    # Guard clause: nothing to do without text.
    if not text:
        print("No text to classify.")
        return

    # Vectorize the single text and take the first (only) prediction.
    predicted = model.predict(vectorizer.transform([text]))
    print(f"Predicted label: {predicted[0]}")

# Load audio file and classify
file_path = "../data/asr_testing/hour.wav"  # Replace with the actual file path
# load_audio() returns None when transcription fails; classify_text() then
# prints "No text to classify." instead of making a prediction.
spoken_text = load_audio(file_path)
classify_text(spoken_text)

Accuracy: 0.0
Sorry, could not understand the audio.
No text to classify.
