In [23]:
import numpy as np
import librosa
import deeplake
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE

In [24]:
def standardize_mfcc_length(mfcc_list, target_length=100):
    """Standardize MFCC matrices to a fixed length"""
    result = []
    for mfcc in mfcc_list:
        current_length = mfcc.shape[1]
        if current_length > target_length:
            # Truncate if longer than target
            standardized = mfcc[:, :target_length]
        else:
            # Pad with zeros if shorter
            padding = np.zeros((mfcc.shape[0], target_length - current_length))
            standardized = np.hstack([mfcc, padding])
        
        # Flatten the matrix for traditional ML models
        result.append(standardized.flatten())
    
    return np.array(result)

In [25]:
TARGET_LABELS = {"stop", "go", "up", "down", "forward", "backward"}

def extract_filtered_mfcc_features(dataset, n_mfcc=13, sample_rate=16000):
    """
    Extracts MFCC features only for the selected words.
    
    Parameters:
    - dataset: The Deeplake dataset containing audio tensors.
    - n_mfcc: Number of MFCC coefficients to extract.
    - sample_rate: Target sample rate for librosa processing.
    
    Returns:
    - X: NumPy array of MFCC features (num_samples, n_mfcc)
    - y: NumPy array of corresponding labels
    """
    
    X, y = [], []
    
    for i in range(len(dataset)):
        point_label = dataset.labels[i].data()['text'][0]  

        if point_label in TARGET_LABELS:  # Keep only target samples
            
            audio = dataset['audios'][i].numpy().squeeze()  # Extract audio data
            
            # Compute MFCC
            mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
            # mfcc_mean = np.mean(mfcc, axis=1)  # second method: Can take mean of factors for more simplistic feature set
            
            X.append(mfcc)
            y.append(point_label)
            
    return standardize_mfcc_length(X), np.array(y)

In [26]:

# Load dataset
dataset = deeplake.load('hub://activeloop/speech-commands-train')



Opening dataset in read-only mode as you don't have write permissions.


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/speech-commands-train



\

hub://activeloop/speech-commands-train loaded successfully.



 

In [None]:
# Extract features
# X, y = extract_filtered_mfcc_features(dataset)

# Scale/ normalize features 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balance dataset out / generate synthetic samples for under-represented classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)



In [28]:
# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# RandomizedSearchCV (GridSearchCV can take a long long time when not using average mfcc's )
# Hypertuning Parameters
param_distributions = {
    'C': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
    'solver': ['newton-cg', 'sag', 'saga', 'liblinear'],  
    'penalty': ['l2', None],  # Regularization method
}


search = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42, tol=1e-4, class_weight='balanced'),
    param_distributions=param_distributions,
    verbose = 2,  # Show logs as it runs
    n_iter=6,  # Only sample 6 combinations
    cv=3,  # Folds for cross validation
    n_jobs=-1  # Use all CPU cores
)


# Grid search 
# param_grid = {
#     'C': [0.1, 1.0, 10.0],  # Reduced from 4 to 3 values
#     'solver': ['newton-cg', 'saga', 'sag'],  # Reduced from 4 to 2 solvers
#     'penalty': ['l2']  # Reduced to just l2 penalty
# }

# search = GridSearchCV(
#     LogisticRegression(max_iter=1000, random_state=42, tol=1e-4, class_weight='balanced'),
#     param_grid,
#     verbose=2,  # Show logs as it runs
#     cv=3,  # Folds for cross validation
#     n_jobs=-1  # Use all CPU cores
# )


# Fit the model
print("Training model...")
search.fit(X_train, y_train)

print(f"Best parameters found: {search.best_params_}")





Training model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Toazt\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Toazt\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Toazt\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 441, in _check_solver
    raise ValueError(
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got None.



Best parameters found: {'solver': 'sag', 'penalty': 'l2', 'C': 0.01}


In [29]:
# Evaluate model
y_pred = search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Logistic Regression Accuracy: {accuracy:.4f}")

print("\nModel Performance:")
print(classification_report(y_test, y_pred))

# 68 is both random and grid so far with all mffc's, not average used

Optimized Logistic Regression Accuracy: 0.6854

Model Performance:
              precision    recall  f1-score   support

        down       0.70      0.68      0.69       476
          go       0.66      0.70      0.68       476
        stop       0.74      0.70      0.72       476
          up       0.64      0.66      0.65       476

    accuracy                           0.69      1904
   macro avg       0.69      0.69      0.69      1904
weighted avg       0.69      0.69      0.69      1904

