In [1]:
# make sure in the project's roots' directory
%cd ../
%pwd

/Users/haozhezh/Documents/Research/CMU/AudioFeaturizationAttack/Kirigami_Publish/Kirigami-private-audio


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'/Users/haozhezh/Documents/Research/CMU/AudioFeaturizationAttack/Kirigami_Publish/Kirigami-private-audio'

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence, unpad_sequence
from torch import Tensor
import numpy as np
import pandas as pd
import scipy as sp
import soundfile
import os

## Background Detector

### Prepare dataset

In [3]:
def fft_features_with_tags(wav_form, bg_start, bg_end, window_size=256, non_overlap=128):
  _, _, stft = sp.signal.stft(x=wav_form, fs=16000, nperseg=window_size, noverlap=non_overlap)
  speech_start_window = (bg_start - window_size + non_overlap) // non_overlap
  speech_end_window = (bg_end-window_size+non_overlap) // non_overlap
  features = []
  stft = stft.transpose((1, 0))
  tags = np.zeros(len(stft))
  tags[:] = 0
  tags[speech_start_window:speech_end_window] = 1
  stft = np.abs(stft)
  for fft in stft:
    features.append(fft)
  return features, tags

def load_audio(file: str):
    # Load Audio # lower sampling rate
    audio_samples, sp = soundfile.read(file)
    return audio_samples

class AudiosetDataset(Dataset):
    def __init__(self, bf_detect_df):
        self.audio_files_list = []
        self.start_end_list = []    # 3-tuple: (wrd_start, wrd_end, wrd_start2); wrd_end2 is the end of audio
        for idx, row in bf_detect_df.iterrows():
          self.audio_files_list.append(row['wav'])
          # self.audio_files_list.append(interleave_wavs_path + row['wav'].replace('./timit_background_detector_training_one_level_snr', ''))
          self.start_end_list.append((row['bg_start'],row['bg_end']))

    def __getitem__(self, index):
        audio = load_audio(self.audio_files_list[index])
        wrd_start_loc, wrd_end_loc = self.start_end_list[index]
        audio_features, audio_tags = fft_features_with_tags(audio, wrd_start_loc, wrd_end_loc)
        return Tensor(np.asarray(audio_features)).to(DEVICE), Tensor(audio_tags).to(DEVICE)

    def __len__(self):
        return len(self.audio_files_list)

def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    y_lens = [len(y) for y in yy]
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)
    return xx_pad, yy_pad, torch.tensor(x_lens, dtype=torch.float32), torch.tensor(y_lens, dtype=torch.float32)

def get_data_prep_dataset_dataloader(bf_detect_df, batch_size=32):
  dataset = AudiosetDataset(bf_detect_df)
  train_set, val_set, test_set = random_split(dataset, [0.8, 0.1, 0.1])
  train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
  val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
  test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
  return train_loader, val_loader, test_loader

In [4]:
BF_WAV_PATH = './datasets/background_detect/'
BF_CSV_PATH = './datasets/background_detect/background_detector_dataset.csv'

batch_size = 32
model_path = "./results/bf_detect/"
if not os.path.exists(model_path):
    os.makedirs(model_path)

my_bf_detect_df = pd.read_csv(BF_CSV_PATH)
train_loader, val_loader, test_loader = get_data_prep_dataset_dataloader(my_bf_detect_df, batch_size=batch_size)

In [5]:
class LogisticRegressionClassifier(torch.nn.Module):
    def __init__(self, feature_dim=129):
        super(LogisticRegressionClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(feature_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()
    def forward(self, xx):
        xx = self.linear1(torch.nn.functional.normalize(xx, p=1.0, dim = 1))
        return self.sigmoid(xx)

### Train the speech detector model

In [9]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_model = LogisticRegressionClassifier(feature_dim=129).to(DEVICE)

In [10]:
bce = torch.nn.BCELoss()
optim =  torch.optim.Adam(my_model.parameters(), lr=0.001)

In [11]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
max_epoch=20

validating_frequency = 1
epoch = 0

s = tqdm(range(0, int(max_epoch)),desc='Training Epochs')

for epoch in s:
    my_model.train()
    total_loss = 0
    total_samples = 0
    current_count_train = 0
    for x_padded, y_padded, x_lens, y_lens in train_loader:
        current_count_train = current_count_train + 1
        training_percent = 100 * current_count_train / len(train_loader)

        unpadded_x = unpad_sequence(x_padded, x_lens, batch_first=True,)
        unpadded_y = unpad_sequence(y_padded, y_lens, batch_first=True,)

        concat_x = torch.cat(unpadded_x, dim=0)
        concat_y = torch.cat(unpadded_y, dim=0)
        # print(concat_x.dtype)
        concat_y_pred = my_model(concat_x)[:, 0]

        current_loss = bce(concat_y_pred, concat_y)
        current_loss.backward()
        optim.step()
        total_loss = total_loss + (current_loss.detach().cpu().numpy()) * concat_x.shape[0]
        total_samples = total_samples + concat_x.shape[0]
        batch_train_accuracy_privacy = (torch.sum((concat_y == 1) & (concat_y_pred >= 0.5)) + torch.sum((concat_y  == 0) & (concat_y_pred < 0.5)))/ (concat_y.shape[0])

        s.set_description(f"Training...{training_percent:.2f}%, Acc{batch_train_accuracy_privacy:.2f}")

    if (epoch+1) % validating_frequency == 0:
        # train profile assigment DT model using the optimal training set membership
        my_model.eval()

        total_y_pred = []
        total_y_truth = []
        for x_padded, y_padded, x_lens, y_lens in val_loader:
          unpadded_x = unpad_sequence(x_padded, x_lens, batch_first=True,)
          unpadded_y = unpad_sequence(y_padded, y_lens, batch_first=True,)
          concat_x = torch.cat(unpadded_x, dim=0)
          concat_y = torch.cat(unpadded_y, dim=0)
          concat_y_pred = my_model(concat_x)[:, 0] > 0.5

          total_y_pred.extend(concat_y_pred.detach().cpu().numpy())
          total_y_truth.extend(concat_y.detach().cpu().numpy())

        valid_accuracy = accuracy_score(total_y_truth, total_y_pred)
        s.set_postfix(validation_accuracy = valid_accuracy)
        torch.save(my_model.state_dict(), f"{model_path}/background_detector{epoch}.ckpt")


Training...66.67%, Acc0.68:  55%|█████▌    | 11/20 [05:38<04:36, 30.76s/it, validation_accuracy=0.705] 


KeyboardInterrupt: 

In [12]:
my_model.eval()
total_y_pred = []
total_y_truth = []
total_y_prob = []
for x_padded, y_padded, x_lens, y_lens in test_loader:
  unpadded_x = unpad_sequence(x_padded, x_lens, batch_first=True)
  unpadded_y = unpad_sequence(y_padded, y_lens, batch_first=True)
  concat_x = torch.cat(unpadded_x, dim=0)
  concat_y = torch.cat(unpadded_y, dim=0)
  concat_y_pred = my_model(concat_x)[:, 0] > 0.5
  concat_y_prob = my_model(concat_x)[:, 0]
  total_y_pred.extend(concat_y_pred.detach().cpu().numpy())
  total_y_truth.extend(concat_y.detach().cpu().numpy())
test_accuracy = accuracy_score(total_y_truth, total_y_pred)
print("Test_Accuracy", test_accuracy)

Test_Accuracy 0.7056482633941394
