In [1]:
import librosa
import os
import pickle
from scipy.io import wavfile
import numpy as np
import pandas as pd
import random
from pydub import AudioSegment
from tqdm import tqdm

from preprocessing import perform_vad, resample_wav, padding, cut_wav_into_clips, extract_features




In [2]:
# Root folder of the audio clips; each sub-folder name is used as a label.
# Built with os.path.join so the notebook is not tied to Windows separators
# (on Windows the resulting string is unchanged).
wav_dir = os.path.join("train", "audio")

## Get full training list

In [40]:
# Validation list to read and training list to generate.
# os.path.join keeps both paths valid on any OS (identical strings on Windows).
labels_path = os.path.join('train', 'validation_list.txt')
output_labels_path = os.path.join('train', 'training_list.txt')

# Load the validation entries, one relative path per line.
with open(labels_path, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths = [line.strip() for line in lines]

In [43]:
# Write every clip that is not part of the validation list to the training list.
# A set makes each membership test O(1); scanning the list for every one of the
# tens of thousands of clips would be quadratic overall.
validation_paths = set(file_paths)

with open(output_labels_path, "w") as f1:
    for folder in os.listdir(wav_dir):
        # These two folders are handled separately by the silence cells.
        if folder in ('_background_noise_', 'silence'):
            continue
        folder_path = os.path.join(wav_dir, folder)
        # Skip stray files so os.listdir is only ever called on directories.
        if not os.path.isdir(folder_path):
            continue
        for file in os.listdir(folder_path):
            # List entries always use '/' so they compare equal to the
            # entries read from the validation list.
            path = f"{folder}/{file}"
            if path not in validation_paths:
                f1.write(path + '\n')

In [None]:


# Add the silence clips in `part1` to the training list.
# NOTE(review): `part1` is only created by the silence-split cell further down
# (its first 80% share of the shuffled clips); run that cell before this one.
# Append mode is used because output_labels_path has just been filled with the
# non-silence entries; opening it with "w" would erase them.
# Re-running this cell appends the same entries again.
with open(output_labels_path, "a") as f1:
    for file in part1:
        f1.write("silence/" + file + "\n")

# Feature extraction

## `unknown` detection / `label` classification

### Training part

In [44]:
# Training list: one "<label>/<file>" entry per line.
# os.path.join keeps the path valid on any OS (identical string on Windows).
labels_path = os.path.join('train', 'training_list.txt')
with open(labels_path, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths_train = [line.strip() for line in lines]


In [53]:
features_train = []
# Scratch file that every preprocessing step overwrites in turn.
wav_file2 = 'working_sample.wav'

for file in tqdm(file_paths_train, total=len(file_paths_train), leave=True):
    # Entries look like "<label>/<clip>"; the folder name doubles as the label.
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Preprocessing chain on the scratch file: perform_vad, then padding, then
    # resample_wav (1000 and 8000 are presumably ms and Hz — see preprocessing).
    perform_vad(wav_file, wav_file2)
    padding(wav_file2, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # Store the extracted features together with the label.
    features_train.append([extract_features(wav_file2), label])

100%|██████████| 57923/57923 [1:12:09<00:00, 13.38it/s]


In [56]:
# Persist the [features, label] pairs so the long extraction run need not be repeated.
features_path = os.path.join('extracted_features', 'features_training.pkl')
# Create the target folder if needed; otherwise open() would fail only after
# the extraction has already finished.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features_train, f)

### Validation part

In [9]:
# Validation list: one "<label>/<file>" entry per line.
# os.path.join keeps the path valid on any OS (identical string on Windows).
labels_path = os.path.join('train', 'validation_list.txt')
with open(labels_path, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths_val = [line.strip() for line in lines]

In [15]:
features_val = []
# Scratch file that every preprocessing step overwrites in turn.
wav_file2 = 'working_sample.wav'

for file in tqdm(file_paths_val, total=len(file_paths_val), leave=True):
    # Entries look like "<label>/<clip>"; the folder name doubles as the label.
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Same preprocessing chain as for the training clips:
    # perform_vad, then padding, then resample_wav, all on the scratch file.
    perform_vad(wav_file, wav_file2)
    padding(wav_file2, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # Store the extracted features together with the label.
    features_val.append([extract_features(wav_file2), label])

100%|██████████| 6798/6798 [08:02<00:00, 14.08it/s]


In [4]:
# Persist the validation [features, label] pairs.
features_path = os.path.join('extracted_features', 'features_validation.pkl')
# Create the target folder if needed so the dump cannot fail on a missing directory.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features_val, f)

## `silence` detection

### Silence clips list creation

In [3]:
# Source recordings and the folder that receives the clips cut from them.
# os.path.join keeps the paths valid on any OS (identical strings on Windows).
input_folder = os.path.join("train", "audio", "_background_noise_")
output_folder = os.path.join("train", "audio", "silence")

# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Only .wav files are processed; anything else in the folder is ignored.
    if filename.endswith(".wav"):
        # clip_duration_ms=1000 presumably yields one-second clips — see preprocessing.
        cut_wav_into_clips(os.path.join(input_folder, filename), output_folder, filename, clip_duration_ms=1000)

In [23]:
dir_path = os.path.join("train", "audio", "silence")

# os.listdir returns entries in arbitrary order; sorting first and shuffling
# with a fixed seed makes the split reproducible across runs and machines.
# A private Random instance leaves the global random state untouched.
file_list = sorted(os.listdir(dir_path))
random.Random(42).shuffle(file_list)

# Split the list into two parts: 80% for training, 20% for validation
split_index = int(len(file_list) * 0.8)
part1 = file_list[:split_index]
part2 = file_list[split_index:]

# Define the output file paths.
# The training list is named to match the file the silence training cell
# opens ('silence_training_list.txt').
file1_path = os.path.join("train", "silence_training_list.txt")
file2_path = os.path.join("train", "silence_validation_list.txt")

# Write the clip paths, prefixed with their label folder, to the list files
for list_path, clips in ((file1_path, part1), (file2_path, part2)):
    with open(list_path, "w") as list_file:
        for file in clips:
            list_file.write("silence/" + file + "\n")

### Training part

#### Training part of non-silence (without VAD)

In [9]:
# Training list: one "<label>/<file>" entry per line.
# os.path.join keeps the path valid on any OS (identical string on Windows).
labels_path = os.path.join('train', 'training_list.txt')
with open(labels_path, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths_train = [line.strip() for line in lines]

In [11]:
features_train = []
# Scratch file that every preprocessing step overwrites in turn.
wav_file2 = 'working_sample.wav'

for file in tqdm(file_paths_train, total=len(file_paths_train), leave=True):
    # Entries look like "<label>/<clip>"; the folder name doubles as the label.
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Preprocessing without the VAD step: padding straight from the source
    # clip into the scratch file, then resample_wav in place.
    padding(wav_file, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # Store the extracted features together with the label.
    features_train.append([extract_features(wav_file2), label])

100%|██████████| 57923/57923 [52:34<00:00, 18.36it/s]  


In [12]:
# Persist the non-silence [features, label] pairs extracted without VAD.
features_path = os.path.join('extracted_features', 'non_silence_training.pkl')
# Create the target folder if needed so the dump cannot fail on a missing directory.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features_train, f)

#### Training part of silence

In [3]:
# Silence training list: one "silence/<clip>" entry per line.
# os.path.join keeps the path valid on any OS (identical string on Windows).
silence_train_list_path = os.path.join('train', 'silence_training_list.txt')
with open(silence_train_list_path, 'r') as file:
    lines = file.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths_train_silence = [line.strip() for line in lines]

In [5]:
features_train_silence = []
# Scratch file that every preprocessing step overwrites in turn.
wav_file2 = 'working_sample.wav'

for file in tqdm(file_paths_train_silence, total=len(file_paths_train_silence), leave=True):
    # Entries look like "<label>/<clip>"; the folder name doubles as the label.
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Preprocessing without the VAD step: padding straight from the source
    # clip into the scratch file, then resample_wav in place.
    padding(wav_file, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # Store the extracted features together with the label.
    features_train_silence.append([extract_features(wav_file2), label])

100%|██████████| 321/321 [00:17<00:00, 18.44it/s]


#### Augment silence to 5000 samples

In [6]:
# Grow features_train_silence to target_num_clips entries by adding noisy,
# slightly pitch-shifted variants of the original silence clips.
target_num_clips = 5000
n_sources = len(file_paths_train_silence)

# With no source clips the loop below could never make progress.
if n_sources == 0 and len(features_train_silence) < target_num_clips:
    raise ValueError("file_paths_train_silence is empty; nothing to augment")

wav_file2 = 'working_sample.wav'
noise_level = 0.1
# Seeded generator so the augmented set is reproducible from run to run.
rng = np.random.default_rng(42)

k = 0  # number of augmented clips produced so far
while len(features_train_silence) < target_num_clips:
    # Cycle through the source clips in order, wrapping around as often as needed.
    file = file_paths_train_silence[k % n_sources]
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Preprocess the data (without VAD)
    padding(wav_file, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # sr=None keeps the sampling rate of the file just written by resample_wav.
    # Forcing sr=16000 here would hand extract_features augmented clips at a
    # different rate than the non-augmented clips, which are read as-is.
    audio, sr = librosa.load(wav_file2, sr=None)

    # Additive Gaussian noise followed by a random shift of at most one semitone.
    noise = rng.standard_normal(len(audio))
    audio_noise = audio + noise_level * noise
    pitch_shift = rng.uniform(-100, 100)
    audio_pitch = librosa.effects.pitch_shift(audio_noise, sr=sr, n_steps=pitch_shift / 100.0)
    wavfile.write(wav_file2, sr, audio_pitch.astype(np.float32))

    features = extract_features(wav_file2)

    # Add to the list
    features_train_silence.append([features, label])
    k += 1

In [7]:
# Display how many silence feature entries exist after the augmentation cell.
len(features_train_silence)

5000

In [8]:
# Persist the original plus augmented silence [features, label] pairs.
features_path = os.path.join('extracted_features', 'silence_augmented_training.pkl')
# Create the target folder if needed so the dump cannot fail on a missing directory.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features_train_silence, f)

#### Concatenate results

In [16]:
# Merge the non-silence and augmented-silence training features into one file.
# os.path.join keeps the paths valid on any OS (identical strings on Windows).
features_dir = 'extracted_features'
non_silence_path = os.path.join(features_dir, 'non_silence_training.pkl')
silence_path = os.path.join(features_dir, 'silence_augmented_training.pkl')
combined_path = os.path.join(features_dir, 'silence_detection_training.pkl')

# Both pickles are written by earlier cells of this notebook; pickle.load must
# not be pointed at files from an untrusted source.
with open(non_silence_path, 'rb') as f1, open(silence_path, 'rb') as f2:
    non_silence = pickle.load(f1)
    silence = pickle.load(f2)

# Both objects are lists of [features, label], so "+" concatenates them.
with open(combined_path, 'wb') as f:
    pickle.dump(non_silence + silence, f)

### Validation part

In [16]:
# Silence validation list: one "silence/<clip>" entry per line.
silence_val_list_path = os.path.join('train', 'silence_validation_list.txt')
with open(silence_val_list_path, 'r') as file:
    # One readlines() call consumes the whole file; a second call on the same
    # handle only returns an empty list, so a single call is all that is needed.
    lines = file.readlines()
# Strip surrounding whitespace (including the newline) from every entry.
file_paths_val_silence = [line.strip() for line in lines]

In [17]:
features_val_silence = []
# Scratch file that every preprocessing step overwrites in turn.
wav_file2 = 'working_sample.wav'

for file in tqdm(file_paths_val_silence, total=len(file_paths_val_silence), leave=True):
    # Entries look like "<label>/<clip>"; the folder name doubles as the label.
    parts = file.split('/')
    label = parts[0]
    wav_file = os.path.join(wav_dir, label, parts[1])

    # Preprocessing without the VAD step: padding straight from the source
    # clip into the scratch file, then resample_wav in place.
    padding(wav_file, wav_file2, 1000)
    resample_wav(wav_file2, wav_file2, 8000)

    # Store the extracted features together with the label.
    features_val_silence.append([extract_features(wav_file2), label])

100%|██████████| 81/81 [00:04<00:00, 18.88it/s]


In [18]:
# Persist the silence-detection validation set: non-silence validation features
# followed by the silence validation features.
# NOTE(review): features_val is produced by the earlier validation cell, which
# runs perform_vad, whereas the non-silence training features for this task are
# extracted without VAD — confirm this mismatch is intended.
features_path = os.path.join('extracted_features', 'silence_detection_validation.pkl')
# Create the target folder if needed so the dump cannot fail on a missing directory.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features_val + features_val_silence, f)