In [1]:
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor
import torch
import os
import gc
import pickle
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRUTHFUL_PATH = '../../data/Acoustic/Truthful/'
DECEPTIVE_PATH = '../../data/Acoustic/Deceptive/'
TRUTHFUL_FILES = os.listdir(TRUTHFUL_PATH)
DECEPTIVE_FILES = os.listdir(DECEPTIVE_PATH)

In [7]:
def get_embeddings(model, processor):
    embeddings_map = {}
    for truthful_file in tqdm(TRUTHFUL_FILES):
        speech, rate = librosa.load(TRUTHFUL_PATH + truthful_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
        embeddings_map[truthful_file] = features
        gc.collect()
    for deceptive_file in tqdm(DECEPTIVE_FILES):
        speech, rate = librosa.load(DECEPTIVE_PATH + deceptive_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
        embeddings_map[deceptive_file] = features
        gc.collect()

100%|██████████| 60/60 [20:34<00:00, 20.57s/it]   
100%|██████████| 61/61 [17:39<00:00, 17.38s/it] 


In [8]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")  
embeddings_map = get_embeddings(model, processor)
with open('../../embeddings/acoustic_features_1.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

In [16]:
EMBEDDINGS_PATH = '../../embeddings/AudioEmbeddings/'
EMBEDDINGS_FILES = os.listdir(EMBEDDINGS_PATH)
embeddings_map = {}
for file in EMBEDDINGS_FILES:
    with open(EMBEDDINGS_PATH + file, 'rb') as f:
        embeddings_map[file.split('.')[0] + '.wav'] = pickle.load(f).squeeze().mean(0).detach().numpy()
with open('../../embeddings/acoustic_features_2.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

In [31]:
processor = Wav2Vec2Processor.from_pretrained("superb/hubert-base-superb-er")
model = HubertForCTC.from_pretrained("superb/hubert-base-superb-er")  
embeddings_map = get_embeddings(model, processor)
with open('../../embeddings/acoustic_features_3.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

Downloading (…)rocessor_config.json: 100%|██████████| 213/213 [00:00<00:00, 19.4kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|██████████| 1.66k/1.66k [00:00<00:00, 142kB/s]


OSError: Can't load tokenizer for 'superb/hubert-base-superb-er'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'superb/hubert-base-superb-er' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [4]:
class AudioClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 64)
        self.linear2 = torch.nn.Linear(64, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [5]:
class AudioClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [6]:
class AudioDataset(Dataset):
    def __init__(self, data, embeddings_file):
        with open(embeddings_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.data = [embeddings_map[file] for file in data]
        self.labels = [0 if file.split('_')[1] == 'truth' else 1 for file in data]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [7]:
class AudioDatasetPCA(Dataset):
    def __init__(self, data, embeddings_file, pca, sc, test=False):
        with open(embeddings_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.data = np.array([embeddings_map[file] for file in data])
        if not test:
            self.data = sc.fit_transform(self.data)
            self.data = pca.fit_transform(self.data)
        else:
            self.data = sc.transform(self.data)
            self.data = pca.transform(self.data)
        self.data = torch.from_numpy(self.data).float()
        self.labels = [0 if file.split('_')[1] == 'truth' else 1 for file in data]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
def train(model, train_loader, criterion, optimizer, num_epochs):
    for _ in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [10]:
def eval(model, val_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [11]:
def kfold(embeddings_map_file, train_batch_size, do_pca, embedding_size):
    truthful_data = list(TRUTHFUL_FILES)
    deceptive_data = list(DECEPTIVE_FILES)
    for i in range(10):
        val_data = truthful_data[i*6:(i+1)*6] + deceptive_data[i*6:(i+1)*6]
        train_data = truthful_data[:i*6] + truthful_data[(i+1)*6:] + deceptive_data[:i*6] + deceptive_data[(i+1)*6:]
        if not do_pca:
            train_dataset = AudioDataset(train_data, embeddings_map_file)
            val_dataset = AudioDataset(val_data, embeddings_map_file)
        else:
            sc = StandardScaler()
            pca = PCA(n_components=embedding_size)
            train_dataset = AudioDatasetPCA(train_data, embeddings_map_file, pca, sc)
            val_dataset = AudioDatasetPCA(val_data, embeddings_map_file, pca, sc, test=True)
        train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=12)
        yield train_loader, val_loader

In [12]:
def run_one_fold(train_loader, val_loader, num_epochs, embedding_size, do_pca):
    if not do_pca:
        model = AudioClassifier(embedding_size)
    else:
        model = AudioClassifierPCA(embedding_size)
    model.to(device)
    learning_rate = 1e-3
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train(model, train_loader, criterion, optimizer, num_epochs)
    return eval(model, val_loader)

In [13]:
def get_model_accuracy(embeddings_map_file, train_batch_size, num_epochs, embedding_size = 1024, do_pca = False):
    accuracies = []
    for train_loader, val_loader in kfold(embeddings_map_file, train_batch_size, do_pca, embedding_size):
        accuracies.append(run_one_fold(train_loader, val_loader, num_epochs, embedding_size, do_pca))
    return sum(accuracies) / len(accuracies)

In [102]:
# Model accuracy for hubert-large-ls960-ft 
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 4, 40)

0.5333333333333333

In [27]:
# Model accuracy for hubert-large-ls960-ft with pca
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 2, 40, 64, True)

0.625

In [29]:
# Model accuracy for hubert-large-ls960-ft with pca
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 2, 30, 64, True)

0.5666666666666667

In [30]:
# Model accuracy for hubert-large-ls960-ft with pca
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 4, 30, 64, True)

0.5833333333333333

In [26]:
# Model accuracy for hubert-large-ls960-ft with pca
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 4, 40, 64, True)

0.625

In [27]:
# Model accuracy for vggish
get_model_accuracy('../../embeddings/acoustic_features_2.pkl', 4, 30, 128)

0.5833333333333333

In [28]:
# Model accuracy for vggish with pca
get_model_accuracy('../../embeddings/acoustic_features_2.pkl', 4, 30, 64, True)

0.5833333333333333

In [29]:
# Model accuracy for vggish
get_model_accuracy('../../embeddings/acoustic_features_2.pkl', 4, 40, 128)

0.6000000000000001

In [30]:
# Model accuracy for vggish with pca
get_model_accuracy('../../embeddings/acoustic_features_2.pkl', 4, 40, 64, True)

0.5916666666666667