In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset paths
# used in the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the training data on the mounted Drive. Its immediate
# sub-directories are listed later and used as class labels.
data_dir = '/content/drive/My Drive/SpeechCommand'


In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Every immediate sub-directory of data_dir is treated as one class label
# (plain files in data_dir are ignored).
classestoclassify = [
    entry
    for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
]

def get_wav_paths(classestoclassify):
    """Return the paths of the '.wav' files directly inside one class folder.

    Args:
        classestoclassify: Name of a single class sub-directory; it is joined
            onto the module-level ``data_dir``. (The parameter shares its name
            with the global list of classes but is one folder name here.)

    Returns:
        A list of file paths, in the order ``os.listdir`` reports the entries.
    """
    folder = os.path.join(data_dir, classestoclassify)
    collected = []
    for entry in os.listdir(folder):
        if entry.endswith('.wav'):
            collected.append(os.path.join(folder, entry))
    return collected

# Show the discovered class folder names as a sanity check.
print(classestoclassify)

['happy', 'dog', 'bed', 'tree', 'wow', 'cat', 'eight', 'no', 'go', 'right', 'nine', '_background_noise_', 'seven', 'sheila', 'stop', 'zero', 'one', 'bird', 'three', 'left', 'down', 'up', 'marvin', 'two', 'five', 'six', 'off', 'yes', 'on', 'house', 'four']


In [None]:
def extract_features(audio_path):
    """Compute 13 MFCCs for one audio file and return them transposed.

    The file is loaded with ``sr=None``, i.e. at whatever sample rate
    ``librosa.load`` reports for it, and that rate is passed on to the MFCC
    computation.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The transpose of the ``librosa.feature.mfcc`` output. With librosa's
        (n_mfcc, frames) layout this is one row per frame and 13 columns; no
        averaging over frames is performed.
    """
    signal, native_rate = librosa.load(audio_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=native_rate, n_mfcc=13)
    return coefficients.T

In [None]:
# Disabled experiment: the function below sits inside a triple-quoted string, so
# running this cell defines nothing (extract_spectral_features does not exist at
# runtime). The text describes a mean-over-axis-1 summary of several spectral features.
'''def extract_spectral_features(wav_path, sample_rate=16000):
    audio, _ = librosa.load(wav_path, sr=sample_rate, mono=True)
    stft = np.abs(librosa.stft(audio))
    centroid = librosa.feature.spectral_centroid(S=stft, sr=sample_rate)
    bandwidth = librosa.feature.spectral_bandwidth(S=stft, sr=sample_rate)
    rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sample_rate)
    flatness = librosa.feature.spectral_flatness(S=stft)
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sample_rate)

    spectral_features = np.concatenate((centroid, bandwidth, rolloff, flatness, contrast), axis=0)
    spectral_features_mean = np.mean(spectral_features, axis=1)
    return spectral_features_mean'''


In [None]:
# Disabled alternative to extract_features, kept inside a string literal so it is
# never executed. NOTE(review): if re-enabled it would shadow the active
# extract_features and return np.mean(..., axis=1) of the stacked features (a 1-D
# vector) instead of the per-frame matrix the active version returns.
'''def extract_features(audio_path, sample_rate=16000):
    audio, sr = librosa.load(audio_path, sr=sample_rate, mono=True)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)

    features = np.concatenate((mfccs, spectral_centroid, spectral_bandwidth), axis=0)
    features_mean = np.mean(features, axis=1)

    return features_mean'''

In [None]:
# Disabled noise-augmentation experiment: everything below is a string literal and
# is never executed.
# NOTE(review): as written, noise_signals would be built with
# extract_spectral_features, which is itself only present inside a disabled string,
# and would hold feature summaries rather than raw signals - confirm before re-enabling.
'''def mix_background_noise(speech_signal, noise_signal, noise_weight=0.1):
    start_idx = np.random.randint(0, len(noise_signal) - len(speech_signal))
    segment_of_noise = noise_signal[start_idx : start_idx + len(speech_signal)]
    return np.add(speech_signal, segment_of_noise * noise_weight)

noise_dir = os.path.join(data_dir, '_background_noise_')
noise_paths = get_wav_paths('_background_noise_')
noise_signals = [extract_spectral_features(noise_path) for noise_path in noise_paths]
'''

In [None]:
# Build the training set as (feature_matrix, class_name) pairs, one pair per
# .wav file. The '_background_noise_' folder is not a spoken word, so it is
# excluded from the labelled data.
speech_classes = [name for name in classestoclassify if name != '_background_noise_']
features_with_labels = [
    (extract_features(clip_path), class_name)
    for class_name in speech_classes
    for clip_path in get_wav_paths(class_name)
]

In [None]:
from sklearn.mixture import GaussianMixture
# Fixed class-name -> integer target mapping used for the submitted predictions.
class_mapping = {'right': 0, 'eight': 1, 'cat': 2, 'tree': 3, 'bed': 4, 'happy': 5, 'go': 6, 'dog': 7, 'no': 8, 'wow': 9, 'nine': 10, 'left': 11, 'stop': 12, 'three': 13, 'sheila': 14, 'one': 15, 'bird': 16, 'zero': 17, 'seven': 18, 'up': 19, 'marvin': 20, 'two': 21, 'house': 22, 'down': 23, 'six': 24, 'yes': 25, 'on': 26, 'five': 27, 'off': 28, 'four': 29}
gmms = {}

n_components = 5
# GaussianMixture initialisation is random; a fixed seed makes a
# "Restart & Run All" reproduce the same models and predictions.
GMM_SEED = 0

# Group the per-file feature matrices by label in one pass instead of
# rescanning the whole training list once per class.
features_by_label = {}
for features, label in features_with_labels:
    features_by_label.setdefault(label, []).append(features)

# Fit one diagonal-covariance GMM per class on all frames of that class,
# keyed by the class's integer target.
for subclass, index in class_mapping.items():
    class_matrices = features_by_label.get(subclass)
    if not class_matrices:
        # np.vstack([]) would fail with an opaque message; name the class instead.
        raise ValueError(
            'No training features found for class {!r}; check data_dir and class_mapping'.format(subclass)
        )
    subclass_features = np.vstack(class_matrices)
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type='diag',
        n_init=3,
        random_state=GMM_SEED,
    )
    gmm.fit(subclass_features)

    gmms[index] = gmm



In [None]:
import pandas as pd
import os
import numpy as np

# Relies on names defined in earlier cells:
# - extract_features: returns the transposed MFCC matrix of one file (one row per
#   frame; it does NOT average over frames)
# - gmms: dictionary of trained GMMs, one per class index

test_data_folder = '/content/drive/My Drive/SpeechCommandTest'
test_audios = []
for entry in os.listdir(test_data_folder):
    if entry.endswith('.wav'):
        test_audios.append(entry)

# Lookup table from audio file name to the submission ID listed in test.csv.
mapping_csv = '/content/drive/My Drive/test.csv'
mapping_df = pd.read_csv(mapping_csv)
file_to_id = {
    file_name: file_id
    for file_name, file_id in zip(mapping_df['AUDIO_FILE'], mapping_df['ID'])
}

# One feature matrix per test file, in the same order as test_audios.
test_features = [
    extract_features(os.path.join(test_data_folder, file_name))
    for file_name in test_audios
]

# Row-wise stack of every file's matrix. Note that the rows of X_test are
# individual frames from all files concatenated, not one row per file.
X_test = np.vstack(test_features)

def predict_gmm_class(features, gmms):
    """Return the key of the model in ``gmms`` that scores ``features`` highest.

    Args:
        features: Array of samples to score. A 1-D array is treated as a single
            sample and reshaped to one row; any other array is passed through
            unchanged.
        gmms: Mapping from class index to an object exposing ``score(X)``.

    Returns:
        The class index whose model returned the largest score. On a tie the
        key that comes first in ``gmms`` wins.

    Raises:
        ValueError: If ``gmms`` is empty.
    """
    samples = features.reshape(1, -1) if features.ndim == 1 else features
    # Score against every model first, then pick the best (first one wins ties).
    scored = [(gmm.score(samples), class_idx) for class_idx, gmm in gmms.items()]
    _, best_class = max(scored, key=lambda pair: pair[0])
    return best_class

# Classify each test FILE from all of its frames. test_features holds one
# per-frame feature matrix per file (same order as test_audios), and
# GaussianMixture.score averages the per-sample log-likelihood, so scoring the
# whole matrix gives one decision per file.
# Iterating over the row-stacked X_test instead yields one prediction per frame;
# zip() below would then silently truncate and pair file i with global frame i.
predictions = [predict_gmm_class(file_features, gmms) for file_features in test_features]
assert len(predictions) == len(test_audios), 'expected exactly one prediction per test file'

# Mapping predictions to file IDs (raises KeyError for a file missing from test.csv)
idpredictions = [(file_to_id[audio_file], prediction) for audio_file, prediction in zip(test_audios, predictions)]

# Create a DataFrame and save to CSV
output_df = pd.DataFrame(idpredictions, columns=['ID', 'TARGET'])
outcsv_path = '/content/drive/My Drive/output_predictions.csv'
output_df.to_csv(outcsv_path, index=False)

print(output_df)


        ID  TARGET
0     6663       2
1     4727      14
2      200       3
3     6368      14
4     3436      20
...    ...     ...
6830   739      19
6831  5961      19
6832  3700      10
6833  1056      24
6834  3225      13

[6835 rows x 2 columns]


In [None]:
#####################################################################################################################################