In [9]:
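# Dependencies used by this notebook (uncomment to install into the kernel's environment):
# %pip install musdb numpy librosa tensorflow pandas pydub SpeechRecognition
# %pip install pyaudio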

In [11]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, UpSampling1D, concatenate
from tensorflow.keras.models import Model

# Magnitude spectrogram helper
def audio_to_spectrogram(audio, sr=44100, n_fft=2048, hop_length=512):
    """Return the magnitude of the short-time Fourier transform of ``audio``.

    Args:
        audio: Signal passed straight to ``librosa.stft``.
        sr: Accepted for call-site symmetry; it is not read by this function.
        n_fft: FFT window size forwarded to ``librosa.stft``.
        hop_length: Hop between successive frames forwarded to ``librosa.stft``.

    Returns:
        ``np.abs`` of the complex STFT (phase is discarded).
    """
    return np.abs(librosa.stft(audio, hop_length=hop_length, n_fft=n_fft))

# Function to prepare data from the MUSDB18-HQ dataset
def prepare_data(folder_path, subsets=('train', 'test')):
    """Build stacked magnitude-spectrogram arrays from a MUSDB18-HQ style folder tree.

    Every track directory ``<folder_path>/<subset>/<track>/`` is expected to hold
    ``mixture.wav`` and ``vocals.wav``. The accompaniment is derived as
    ``mixture - vocals`` in the time domain.

    Args:
        folder_path: Root folder that contains the subset directories.
        subsets: Names of the subset directories to read. Defaults to both
            ``'train'`` and ``'test'``, matching the original behaviour.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(tracks, freq_bins, frames, 2)`` (stereo
        mixture magnitudes, channels last) and ``Y`` has shape
        ``(tracks, freq_bins, frames, 2, 2)`` with the last axis ordered
        ``(vocals, accompaniment)``. Both are empty arrays when no track is found.

    Notes:
        Tracks differ in length, and a ragged list cannot be turned into a dense
        array (NumPy >= 1.24 raises ``ValueError``). All tracks are therefore
        cropped to the frame count of the shortest one before stacking.
    """
    import warnings

    def _stereo_spectrogram(signal):
        # librosa yields a 1-D array for mono files; duplicate it so that two
        # channels always exist and indexing [0]/[1] is safe.
        signal = np.atleast_2d(signal)
        if signal.shape[0] == 1:
            signal = np.repeat(signal, 2, axis=0)
        # Stack channels last: (freq_bins, frames, 2).
        return np.stack(
            [audio_to_spectrogram(signal[0]), audio_to_spectrogram(signal[1])],
            axis=-1,
        )

    X = []
    Y = []
    for subset in subsets:
        subset_path = os.path.join(folder_path, subset)
        # Sorted so the sample order (and thus Keras' validation split) is reproducible.
        for track_folder in sorted(os.listdir(subset_path)):
            track_path = os.path.join(subset_path, track_folder)
            if not os.path.isdir(track_path):
                continue

            mix_path = os.path.join(track_path, 'mixture.wav')
            vocal_path = os.path.join(track_path, 'vocals.wav')
            if not (os.path.isfile(mix_path) and os.path.isfile(vocal_path)):
                # e.g. `.ipynb_checkpoints` or a partially downloaded track
                warnings.warn(f"Skipping {track_path}: mixture.wav or vocals.wav is missing")
                continue

            mix, _ = librosa.load(mix_path, sr=None, mono=False)
            vocal, _ = librosa.load(vocal_path, sr=None, mono=False)

            # Align lengths before subtracting so a one-sample mismatch cannot
            # raise a broadcasting error.
            mix = np.atleast_2d(mix)
            vocal = np.atleast_2d(vocal)
            n_samples = min(mix.shape[-1], vocal.shape[-1])
            mix = mix[..., :n_samples]
            vocal = vocal[..., :n_samples]
            accompaniment = mix - vocal

            mix_spec = _stereo_spectrogram(mix)
            vocal_spec = _stereo_spectrogram(vocal)
            accompaniment_spec = _stereo_spectrogram(accompaniment)

            X.append(mix_spec)
            Y.append(np.stack((vocal_spec, accompaniment_spec), axis=-1))

    if not X:
        return np.array(X), np.array(Y)

    # Crop every track to the shortest frame count so the result is a dense array.
    n_frames = min(spec.shape[1] for spec in X)
    X = np.stack([spec[:, :n_frames] for spec in X])
    Y = np.stack([spec[:, :n_frames] for spec in Y])
    return X, Y

# Load dataset from folder.
# `folder_path` is a relative path, so it is resolved against the notebook's
# working directory; prepare_data() looks for `train/` and `test/` inside it.
# NOTE(review): prepare_data() walks BOTH subsets by default, so despite their
# names X_train / Y_train also contain the test tracks — no held-out test set
# remains after this cell.
folder_path = 'musdb18hq'
X_train, Y_train = prepare_data(folder_path)

# Model Definition
def wave_unet_model(input_size=(44100, 2)):
    """Build and compile a 1-D U-Net with four strided-convolution encoder levels.

    Each encoder level is a width-15 ``Conv1D`` followed by a stride-2 ``Conv1D``
    that halves the time axis; the decoder mirrors it with ``UpSampling1D``, a
    skip concatenation and another width-15 ``Conv1D``.

    Args:
        input_size: ``(time_steps, channels)`` of one input example.

    Returns:
        A compiled ``Model`` (Adam optimiser, MSE loss, MAE metric) whose output
        has shape ``(batch, time_steps, 2)``.

    Notes:
        * A stride-2 'same' convolution yields ``ceil(n / 2)`` steps, so after an
          odd-length level the upsampled tensor is one step longer than its skip
          connection (with the default 44100 samples: 2757 * 2 = 5514 vs 5513).
          The surplus step is cropped so the concatenation is valid for any
          statically known length, not only multiples of 16.
        * The metric is MAE; classification accuracy carries no information for
          a regression trained with MSE.
        * NOTE(review): the sigmoid output is confined to [0, 1]. Confirm that
          the training targets live in that range (e.g. masks); raw waveforms in
          [-1, 1] would need ``tanh`` or a linear output instead.
    """
    encoder_filters = (24, 48, 96, 192)
    kernel_size = 15

    def _match_length(upsampled, skip):
        # Crop trailing steps so `upsampled` has the same time length as `skip`.
        up_len, skip_len = upsampled.shape[1], skip.shape[1]
        # Lengths are None for variable-length inputs; nothing can be fixed statically then.
        if up_len is not None and skip_len is not None and up_len > skip_len:
            upsampled = tf.keras.layers.Cropping1D((0, up_len - skip_len))(upsampled)
        return upsampled

    inputs = Input(input_size)

    # Encoder: keep the pre-downsampling activations for the skip connections.
    skips = []
    x = inputs
    for filters in encoder_filters:
        skip = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)
        skips.append(skip)
        x = Conv1D(filters, kernel_size, strides=2, activation='relu', padding='same')(skip)

    # Bottleneck
    x = Conv1D(2 * encoder_filters[-1], kernel_size, activation='relu', padding='same')(x)

    # Decoder: walk the encoder levels in reverse.
    for filters, skip in zip(reversed(encoder_filters), reversed(skips)):
        x = UpSampling1D(size=2)(x)
        x = _match_length(x, skip)
        x = concatenate([x, skip], axis=-1)
        x = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)

    outputs = Conv1D(2, 1, activation='sigmoid', padding='same')(x)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model

# Build the network with its default (44100, 2) input and print the layer table.
model = wave_unet_model()
model.summary()

# Training the Model
# validation_split=0.2 holds out 20% of the samples for validation.
# NOTE(review): X_train / Y_train come from prepare_data(), which stacks
# per-track magnitude spectrograms (channels last), whereas the model above is
# built for (44100, 2) inputs and emits 2 output channels — confirm the array
# shapes agree with the model before relying on this call.
model.fit(X_train, Y_train, epochs=50, batch_size=8, validation_split=0.2)


: 

: 

In [None]:
from pydub import AudioSegment
import speech_recognition as sr
import pandas as pd

# Function to read bad words from CSV with filtering options
def read_bad_words(csv_file, categories=None, min_severity=None):
    """Load the set of words to censor from a profanity CSV.

    The CSV must provide the columns ``text``, ``canonical_form_1..3``,
    ``category_1..3`` and ``severity_rating``.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        categories: Optional collection of category names (a single string is
            treated as one category). A row is kept when any of its three
            category columns equals one of them. ``None`` keeps every row.
        min_severity: Optional lower bound (inclusive) on ``severity_rating``.
            ``None`` disables the severity filter.

    Returns:
        A ``set`` with every non-null ``text`` and canonical form of the rows
        that pass both filters.
    """
    df = pd.read_csv(csv_file)

    if categories is not None:
        if isinstance(categories, str):
            categories = [categories]
        # Drop NaN so empty category cells can never count as a match.
        wanted = [category for category in categories if pd.notna(category)]
        category_columns = ['category_1', 'category_2', 'category_3']
        df = df[df[category_columns].isin(wanted).any(axis=1)]

    if min_severity is not None:
        # Compare against the requested threshold (was hard-coded to 0, which
        # made the argument a no-op).
        df = df[df['severity_rating'] >= min_severity]

    bad_words = set()
    for column in ('text', 'canonical_form_1', 'canonical_form_2', 'canonical_form_3'):
        bad_words.update(df[column].dropna().tolist())
    return bad_words

# Speech-to-text helper
def transcribe_audio(file_path):
    """Transcribe an audio file with the Google recogniser of ``speech_recognition``.

    Args:
        file_path: Location of the audio file handed to ``sr.AudioFile``.

    Returns:
        Whatever ``Recognizer.recognize_google`` returns for the recorded audio.
        Errors raised by the recogniser are not caught here.
    """
    engine = sr.Recognizer()
    # Read the complete file into memory before sending it off for recognition.
    with sr.AudioFile(file_path) as stream:
        recording = engine.record(stream)
    return engine.recognize_google(recording)

# Function to censor bad words in the audio
def censor_audio(input_audio_path, output_audio_path, bad_words_csv, categories=None, min_severity=None, beep_duration_ms=500):
    """Mute the estimated position of every listed word and export the result as WAV.

    Word positions are only *estimated*: the transcription carries no timestamps,
    so each word is assumed to last ``len(word) * (beep_duration_ms // 5)``
    milliseconds and words are laid out back to back from the start of the file.

    Args:
        input_audio_path: Audio file to transcribe and censor.
        output_audio_path: Destination of the censored WAV file.
        bad_words_csv: CSV passed to ``read_bad_words``.
        categories: Category filter forwarded to ``read_bad_words``.
        min_severity: Severity filter forwarded to ``read_bad_words``.
        beep_duration_ms: Controls the per-character duration estimate
            (``beep_duration_ms // 5`` milliseconds per character).

    Notes:
        * A censored span is replaced by silence of exactly the same length.
          Inserting a fixed-length clip changed the total duration and shifted
          every later word offset.
        * Spans are clamped to the audio length so the output never grows.
        * Matching is case-insensitive and ignores punctuation attached to the
          transcribed word.
    """
    import string

    # Read bad words from CSV with filtering; normalise case once up front.
    bad_words = {
        str(entry).strip().lower()
        for entry in read_bad_words(bad_words_csv, categories, min_severity)
    }

    # Transcribe the audio
    transcription = transcribe_audio(input_audio_path)
    words = transcription.split()

    # Load the audio file
    audio = AudioSegment.from_file(input_audio_path)
    total_ms = len(audio)

    ms_per_char = beep_duration_ms // 5

    # Running estimate of where the current word starts, in milliseconds.
    start_time = 0

    for word in words:
        duration = len(word) * ms_per_char  # Approximate word duration
        lowered = word.lower()
        if lowered in bad_words or lowered.strip(string.punctuation) in bad_words:
            begin = min(start_time, total_ms)
            end = min(start_time + duration, total_ms)
            if end > begin:
                mute = AudioSegment.silent(duration=end - begin, frame_rate=audio.frame_rate)
                audio = audio[:begin] + mute + audio[end:]
        start_time += duration

    # Export the censored audio
    audio.export(output_audio_path, format="wav")

# Example usage
# NOTE(review): this call executes as soon as the cell runs and reads
# "isolated_vocal.wav", which this notebook only writes in a later cell — on a
# fresh kernel the file must already exist on disk for this cell to succeed.
input_audio_path = "isolated_vocal.wav"
output_audio_path = "censored_vocal.wav"
bad_words_csv = "profanity_en.csv"
categories = None  # None to include all categories
min_severity = 2.0
censor_audio(input_audio_path, output_audio_path, bad_words_csv, categories, min_severity)


In [None]:
import librosa
import numpy as np
from scipy.io import wavfile

# Load the separated vocal stem (resampled to 44.1 kHz; librosa downmixes to mono by default).
# The rate is bound to `sample_rate`, NOT `sr`: `sr` is the alias of the
# speech_recognition module used by transcribe_audio(), and rebinding it to an
# integer breaks every later censor_audio() call.
predicted_vocals, sample_rate = librosa.load('predicted_vocals.wav', sr=44100)

# `librosa.output.write_wav` was removed in librosa 0.8. Write 16-bit PCM with
# SciPy instead; PCM (rather than float) WAV is what the stdlib `wave` reader
# behind speech_recognition's AudioFile can open.
pcm16 = (np.clip(predicted_vocals, -1.0, 1.0) * 32767).astype(np.int16)
wavfile.write("isolated_vocal.wav", sample_rate, pcm16)

# Censor the isolated vocal file
input_audio_path = "isolated_vocal.wav"
output_audio_path = "censored_vocal.wav"
bad_words_csv = "profanity_en.csv"
categories = None  # None to include all categories
min_severity = 0
censor_audio(input_audio_path, output_audio_path, bad_words_csv, categories, min_severity)
