In [45]:
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

mask_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

# Load and add fake batch dimension
noisy = mask_model.load_audio(
    "speechbrain/metricgan-plus-voicebank/example.wav"
).unsqueeze(0)

# Add relative length tensor
enhanced = mask_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

# Saving enhanced signal on disk
torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)


In [4]:
import os
import json
import librosa


noisy_folder = 'C:/Users/vesha/PycharmProjects/pC/HOH/noisy'
denoised_folder = 'C:/Users/vesha/PycharmProjects/pC/HOH/clean'
output_file = 'C:/Users/vesha/PycharmProjects/pC/output.json'

dataset = {}

# Iterate over the files in the noisy folder
for file_name in os.listdir(noisy_folder):
    if file_name.endswith('.wav'):  # Adjust the file extension if necessary
        file_path_noisy = os.path.join(noisy_folder, file_name)
        file_path_denoised = os.path.join(denoised_folder, file_name)
        length = int(librosa.get_duration(path=file_path_noisy) * 1000)

        data_entry = {
            
            'noisy_path': file_path_noisy,
            'denoised_path': file_path_denoised,
            'length': length
        }

        dataset[file_name] = data_entry

# Save the dataset as a JSON file
with open(output_file, 'w') as json_file:
    json.dump(dataset, json_file, indent=4)

KeyboardInterrupt: 

In [46]:
import speechbrain
import torch
from speechbrain.dataio.dataset import DynamicItemDataset

dataset_test = DynamicItemDataset.from_json("output2.json")

@speechbrain.utils.data_pipeline.takes("noisy_path")
@speechbrain.utils.data_pipeline.provides("noisy_signal", "enhance")
def audio_pipeline_noisy(noisy_path):
      noisy_signal = mask_model.load_audio(noisy_path).unsqueeze(0)
      enhance = mask_model.enhance_batch(noisy_signal, lengths=torch.tensor([1.]))
      return noisy_signal, enhance
@speechbrain.utils.data_pipeline.takes("denoised_path")
@speechbrain.utils.data_pipeline.provides("clean_signal")
def audio_pipeline_clean(denoised_path):
        clean_signal = mask_model.load_audio(denoised_path).unsqueeze(0)
        return clean_signal

In [47]:
dataset_test.add_dynamic_item(audio_pipeline_noisy)
dataset_test.add_dynamic_item(audio_pipeline_clean)
dataset_test.set_output_keys(["id", "noisy_signal", "clean_signal", "enhance"])
dataset_test[0]["enhance"]

tensor([[ 6.1155e-04,  3.4434e-03, -8.1838e-07,  ...,  1.0411e-01,
          9.9643e-02,  9.0381e-02]])

In [49]:
len(dataset_test)
#print(dataset[0]["noisy_signal"].size(), dataset[0]["clean_signal"].size())

8

In [50]:

print(dataset_test[0]["noisy_signal"].size())
print(dataset_test[0]["clean_signal"].size())
print(dataset_test[0]["enhance"].size())

torch.Size([1, 29916])
torch.Size([1, 29916])
torch.Size([1, 29916])


In [1]:

import pesq
import numpy as np
clean_signal = dataset_test[0]["clean_signal"].numpy().squeeze()
#noisy_signal = noisy_signal.numpy()
denoised_signal = dataset_test[0]["enhance"].numpy().squeeze()
print(denoised_signal.shape)
pesq_score = pesq.pesq(16000, denoised_signal, clean_signal, 'wb')
print(pesq_score)

NameError: name 'dataset_test' is not defined

In [53]:
import pesq

In [2]:
total_pesq = 0
pesqs=[]
output_file = 'C:/Users/vesha/PycharmProjects//logs/origin_pesq.txt'
for i in dataset_test:
    clean_signal =i["clean_signal"].numpy().squeeze()
    denoised_signal = i["enhance"].numpy().squeeze()
    pesq_score = pesq.pesq(16000, clean_signal, denoised_signal, 'wb')
    pesqs.append(pesq_score)
    total_pesq=pesq_score+total_pesq
    print(pesq_score)
    
average_pesq = total_pesq / len(dataset_test)
print(f"Average pesq: {average_pesq}")

with open(output_file, 'w') as file:
    for score in pesqs:
        file.write(str(score) + '\n')

NameError: name 'dataset_test' is not defined

In [64]:
mask_model.mods.keys()

odict_keys(['enhance_model'])

In [65]:
mask_model.mods.enhance_model

EnhancementGenerator(
  (activation): LeakyReLU(negative_slope=0.3)
  (blstm): LSTM(
    (rnn): LSTM(257, 200, num_layers=2, batch_first=True, bidirectional=True)
  )
  (linear1): Linear(in_features=400, out_features=300, bias=True)
  (linear2): Linear(in_features=300, out_features=257, bias=True)
  (Learnable_sigmoid): Learnable_sigmoid()
  (sigmoid): Sigmoid()
)

In [66]:
dir(mask_model.hparams)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'compute_istft',
 'compute_stft',
 'enhance_model',
 'hop_length',
 'modules',
 'n_fft',
 'pretrainer',
 'resynth',
 'sample_rate',
 'spectral_magnitude',
 'win_length',
 'window_fn']

In [67]:
mask_model.mods

ModuleDict(
  (enhance_model): EnhancementGenerator(
    (activation): LeakyReLU(negative_slope=0.3)
    (blstm): LSTM(
      (rnn): LSTM(257, 200, num_layers=2, batch_first=True, bidirectional=True)
    )
    (linear1): Linear(in_features=400, out_features=300, bias=True)
    (linear2): Linear(in_features=300, out_features=257, bias=True)
    (Learnable_sigmoid): Learnable_sigmoid()
    (sigmoid): Sigmoid()
  )
)

In [68]:
import speechbrain
import torch
from speechbrain.dataio.dataset import DynamicItemDataset

dataset_train = DynamicItemDataset.from_json("output1.json")
dataset_test = DynamicItemDataset.from_json("output2.json")
#dataset_test = DynamicItemDataset.from_json("output3.json")
@speechbrain.utils.data_pipeline.takes("noisy_path","denoised_path" )
@speechbrain.utils.data_pipeline.provides("signal", "enhanced_signal")
def audio_pipeline(noisy_path,denoised_path):
    signal = mask_model.load_audio(noisy_path)
    yield signal
    enhanced_signal = mask_model.load_audio(denoised_path)
    yield enhanced_signal
      

In [69]:
dataset_train.add_dynamic_item(audio_pipeline)
dataset_train.set_output_keys(["id", "signal", "enhanced_signal"])

dataset_test.add_dynamic_item(audio_pipeline)
dataset_test.set_output_keys(["id", "signal", "enhanced_signal"])

In [70]:
print(len(dataset_test), len(dataset_train))
print(dataset_test[0]["enhanced_signal"].size())


8 56
torch.Size([29916])


In [71]:
from speechbrain.processing.features import spectral_magnitude
from speechbrain.processing.signal_processing import resynthesize
import librosa
import  speechbrain as sb
import os
import json
import numpy as np
import pesq
class MaskEnhancementFineTune(sb.Brain):
    def on_stage_start(self, stage, epoch):
        # enable grad for all modules we want to fine-tune
        if stage == sb.Stage.TRAIN:
            for module in [self.modules.mask_model, self.modules.compute_stft,self.modules.compute_istft ]:
                for p in module.parameters():
                    p.requires_grad = True
        if stage == sb.Stage.TRAIN:
            self.train_loss_history = []
            self.train_pesq_history = []
        if stage == sb.Stage.TEST:
            self.test_loss_history = []
            self.test_pesq_history = []
        
            
            
    def compute_feats(self, wavs):
        """Feature computation pipeline"""
        feats = self.modules.compute_stft(wavs)
        feats = spectral_magnitude(feats, power=0.7)
        feats = torch.log1p(feats)
        return feats
    def compute_forward(self, batch, stage):
        """Forward computations from the input signal to the enhanced signal."""
        
        batch = batch.to(self.device)
        noisy_signal, noisy_lens = batch.signal
        # # Forward pass
        noisy_spec = self.compute_feats(noisy_signal)        
        mask = self.modules.mask_model.mods.enhance_model(noisy_spec, noisy_lens)
        predict_spec = torch.mul(mask, noisy_spec)
        predict_wav = self.modules.mask_model.hparams.resynth(torch.expm1(predict_spec), noisy_signal)

        return predict_wav
    def on_stage_end(self, stage, stage_loss, epoch):
        if stage == sb.Stage.TRAIN:
            self.save_training_logs("logs/train")
        elif stage == sb.Stage.TEST:
            self.save_validation_logs("logs/test")
        
    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss (MSE) given predictions and targets."""
       
        predict_wav = predictions
        predict_spec = self.compute_feats(predict_wav)
        clean_wav, lens = batch.enhanced_signal
        clean_spec = self.compute_feats(clean_wav)
        # Adjust the shape of the enhanced signals and targets
        loss = self.hparams.mse_loss(predict_spec, clean_spec)
        predict_wav = predict_wav.detach().cpu().numpy()
        clean_wav = clean_wav.detach().cpu().numpy()
        pesq_score = np.mean([pesq.pesq(fs=16000, ref=clean_wav[i], deg=predict_wav[i], mode='wb') for i in range(len(clean_wav))])
        print(pesq_score)
        print(loss)
        return loss, pesq_score
    
    def fit_batch(self, batch):
        
        """Train the parameters given a single batch in input"""
       
        predictions = self.compute_forward(batch, 
              sb.Stage.TRAIN)

        loss, pesq_score  = self.compute_objectives(predictions, batch, sb.Stage.TRAIN )
        
        loss.backward()

        if self.check_gradients(loss):
            self.optimizer.step()
        self.optimizer.zero_grad()
        self.train_loss_history.append(loss.detach())
        self.train_pesq_history.append(pesq_score)
        
        return loss
        
    def evaluate_batch(self, batch, stage):
        predictions = self.compute_forward(batch, 
              sb.Stage.TRAIN)

        loss, pesq_score  = self.compute_objectives(predictions, batch, sb.Stage.TRAIN )
        self.test_loss_history.append(loss.detach())
        self.test_pesq_history.append(pesq_score)
        return loss
    
        
    def save_training_logs(self, logs_dir):
        train_loss_path = os.path.join("C:/Users/vesha/PycharmProjects/pC/logs", "train_loss.txt")  
        train_pesq_path = os.path.join("C:/Users/vesha/PycharmProjects/pC/logs", "train_pesq.txt")
        with open(train_loss_path, "w") as f:
            f.write("\n".join(str(loss) for loss in self.train_loss_history))
             
        with open(train_pesq_path, "w") as f:
            f.write("\n".join(str(pesq) for pesq in self.train_pesq_history))
            
    def save_validation_logs(self, logs_dir):
        valid_loss_path = os.path.join("C:/Users/vesha/PycharmProjects/pC/logs", "val_loss.txt", )
        valid_pesq_path = os.path.join("C:/Users/vesha/PycharmProjects/pC/logs", "val_pesq.txt")
        with open(valid_loss_path, "w") as f:
            f.write("\n".join(str(loss) for loss in self.test_loss_history))
             
        with open(valid_pesq_path, "w") as f:
            f.write("\n".join(str(pesq) for pesq in self.test_pesq_history))

In [72]:
import pesq
modules = {"mask_model": mask_model,
           "compute_stft": mask_model.hparams.compute_stft,
           "compute_istft": mask_model.hparams.compute_istft
           }
hparams = {"mse_loss": torch.nn.MSELoss(),
           "pesq": pesq}
MyModel = MaskEnhancementFineTune(modules, hparams=hparams, opt_class=lambda x: torch.optim.SGD(x, 1e-5))

In [73]:
MyModel.fit(range(2),
    #valid_set=dataset_test,
    train_set=dataset_train,
    train_loader_kwargs={"batch_size": 3, "drop_last": True, "shuffle": False},                     #valid_loader_kwargs={"batch_size": 1, "drop_last": True, "shuffle": False}
 )
#print(eval_stats)

  0%|          | 0/18 [00:00<?, ?it/s]

1.6561397314071655
tensor(0.0201, grad_fn=<MseLossBackward0>)


  6%|▌         | 1/18 [00:06<01:55,  6.77s/it, train_loss=0.0201]

1.2050107320149739
tensor(0.0705, grad_fn=<MseLossBackward0>)


 11%|█         | 2/18 [00:11<01:27,  5.47s/it, train_loss=0.0453]

1.6141529480616252
tensor(0.0292, grad_fn=<MseLossBackward0>)


 17%|█▋        | 3/18 [00:16<01:19,  5.32s/it, train_loss=0.0399]

1.4009600083033245
tensor(0.0279, grad_fn=<MseLossBackward0>)


 22%|██▏       | 4/18 [00:22<01:19,  5.67s/it, train_loss=0.0369]

1.6452123324076335
tensor(0.0203, grad_fn=<MseLossBackward0>)


 28%|██▊       | 5/18 [00:36<01:51,  8.54s/it, train_loss=0.0336]

1.5832857688268025
tensor(0.0365, grad_fn=<MseLossBackward0>)


 33%|███▎      | 6/18 [00:40<01:25,  7.12s/it, train_loss=0.0341]

1.7570412556330364
tensor(0.0338, grad_fn=<MseLossBackward0>)


 39%|███▉      | 7/18 [00:44<01:06,  6.08s/it, train_loss=0.034] 

1.7323226531346638
tensor(0.0305, grad_fn=<MseLossBackward0>)


 44%|████▍     | 8/18 [00:49<00:58,  5.83s/it, train_loss=0.0336]

1.2773929437001545
tensor(0.0440, grad_fn=<MseLossBackward0>)


 50%|█████     | 9/18 [00:54<00:48,  5.38s/it, train_loss=0.0347]

1.5704311927159627
tensor(0.0284, grad_fn=<MseLossBackward0>)


 56%|█████▌    | 10/18 [00:58<00:40,  5.03s/it, train_loss=0.0341]

1.7897350788116455
tensor(0.0303, grad_fn=<MseLossBackward0>)


 61%|██████    | 11/18 [01:01<00:31,  4.51s/it, train_loss=0.0338]

1.7454382578531902
tensor(0.0223, grad_fn=<MseLossBackward0>)


 67%|██████▋   | 12/18 [01:05<00:26,  4.36s/it, train_loss=0.0328]

1.895368218421936
tensor(0.0239, grad_fn=<MseLossBackward0>)


 72%|███████▏  | 13/18 [01:10<00:22,  4.53s/it, train_loss=0.0321]

1.9185400009155273
tensor(0.0275, grad_fn=<MseLossBackward0>)


 78%|███████▊  | 14/18 [01:15<00:18,  4.72s/it, train_loss=0.0318]

1.9788245757420857
tensor(0.0296, grad_fn=<MseLossBackward0>)


 83%|████████▎ | 15/18 [01:21<00:14,  4.91s/it, train_loss=0.0316]

1.91318941116333
tensor(0.0199, grad_fn=<MseLossBackward0>)


 89%|████████▉ | 16/18 [01:28<00:11,  5.64s/it, train_loss=0.0309]

1.435445745786031
tensor(0.0215, grad_fn=<MseLossBackward0>)


 94%|█████████▍| 17/18 [01:35<00:06,  6.14s/it, train_loss=0.0304]

1.7011317412058513
tensor(0.0310, grad_fn=<MseLossBackward0>)


100%|██████████| 18/18 [01:45<00:00,  5.88s/it, train_loss=0.0304]
  0%|          | 0/18 [00:00<?, ?it/s]

1.6561453342437744
tensor(0.0201, grad_fn=<MseLossBackward0>)


  6%|▌         | 1/18 [00:09<02:35,  9.16s/it, train_loss=0.0201]

1.2050151824951172
tensor(0.0705, grad_fn=<MseLossBackward0>)


 11%|█         | 2/18 [00:16<02:08,  8.01s/it, train_loss=0.0453]

1.6141589482625325
tensor(0.0292, grad_fn=<MseLossBackward0>)


 17%|█▋        | 3/18 [00:22<01:50,  7.37s/it, train_loss=0.0399]

1.4009619156519573
tensor(0.0279, grad_fn=<MseLossBackward0>)


 22%|██▏       | 4/18 [00:34<02:07,  9.11s/it, train_loss=0.0369]

1.645210901896159
tensor(0.0203, grad_fn=<MseLossBackward0>)


 28%|██▊       | 5/18 [00:48<02:20, 10.81s/it, train_loss=0.0336]

1.5832861264546711
tensor(0.0365, grad_fn=<MseLossBackward0>)


 33%|███▎      | 6/18 [00:52<01:39,  8.31s/it, train_loss=0.0341]

1.7570470968882244
tensor(0.0338, grad_fn=<MseLossBackward0>)


 39%|███▉      | 7/18 [00:56<01:18,  7.13s/it, train_loss=0.034] 

1.7323216199874878
tensor(0.0305, grad_fn=<MseLossBackward0>)


 44%|████▍     | 8/18 [01:02<01:08,  6.83s/it, train_loss=0.0336]

1.277393897374471
tensor(0.0440, grad_fn=<MseLossBackward0>)


 50%|█████     | 9/18 [01:08<00:57,  6.38s/it, train_loss=0.0347]

1.5704344908396404
tensor(0.0284, grad_fn=<MseLossBackward0>)


 56%|█████▌    | 10/18 [01:12<00:45,  5.74s/it, train_loss=0.0341]

1.7897495031356812
tensor(0.0303, grad_fn=<MseLossBackward0>)


 61%|██████    | 11/18 [01:15<00:34,  4.97s/it, train_loss=0.0338]

1.7454412778218586
tensor(0.0223, grad_fn=<MseLossBackward0>)


 67%|██████▋   | 12/18 [01:19<00:27,  4.60s/it, train_loss=0.0328]

1.8953773180643718
tensor(0.0239, grad_fn=<MseLossBackward0>)


 72%|███████▏  | 13/18 [01:25<00:24,  4.88s/it, train_loss=0.0321]

1.9185267289479573
tensor(0.0275, grad_fn=<MseLossBackward0>)


 78%|███████▊  | 14/18 [01:30<00:20,  5.03s/it, train_loss=0.0318]

1.9788331985473633
tensor(0.0296, grad_fn=<MseLossBackward0>)


 83%|████████▎ | 15/18 [01:35<00:15,  5.08s/it, train_loss=0.0316]

1.913197676340739
tensor(0.0199, grad_fn=<MseLossBackward0>)


 89%|████████▉ | 16/18 [01:43<00:11,  5.86s/it, train_loss=0.0309]

1.4354498783747356
tensor(0.0215, grad_fn=<MseLossBackward0>)


 94%|█████████▍| 17/18 [01:50<00:06,  6.26s/it, train_loss=0.0304]

1.7011350393295288
tensor(0.0310, grad_fn=<MseLossBackward0>)


100%|██████████| 18/18 [01:56<00:00,  6.48s/it, train_loss=0.0304]


In [74]:
MyModel.evaluate(test_set= dataset_test,
        test_loader_kwargs={"batch_size": 1, "drop_last": True, "shuffle": False})

 12%|█▎        | 1/8 [00:00<00:06,  1.01it/s]

2.1754937171936035
tensor(0.0455)


 25%|██▌       | 2/8 [00:02<00:08,  1.34s/it]

1.6018834114074707
tensor(0.0298)


 38%|███▊      | 3/8 [00:04<00:07,  1.55s/it]

1.2756019830703735
tensor(0.0466)


 50%|█████     | 4/8 [00:05<00:05,  1.40s/it]

1.03309965133667
tensor(0.0753)


 62%|██████▎   | 5/8 [00:07<00:04,  1.64s/it]

1.249719262123108
tensor(0.0614)


 75%|███████▌  | 6/8 [00:08<00:02,  1.39s/it]

1.2876050472259521
tensor(0.0676)


 88%|████████▊ | 7/8 [00:09<00:01,  1.22s/it]

1.3024944067001343
tensor(0.0795)


100%|██████████| 8/8 [00:10<00:00,  1.27s/it]

1.140517234802246
tensor(0.0948)





0.06255585281178355

In [75]:
import torch
model = MaskEnhancementFineTune()
# Save the model parameters
torch.save(model, 'C:/Users/vesha/PycharmProjects/pC/model/FineTuned.pth')