In [1]:
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor
import torch
import os
import gc
import pickle
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").cuda()  

In [5]:
TRUTHFUL_PATH = '../../Dataset2/Audio/Truth/'
DECEPTIVE_PATH = '../../Dataset2/Audio/Lies/'
TRUTHFUL_FILES = os.listdir(TRUTHFUL_PATH)
DECEPTIVE_FILES = os.listdir(DECEPTIVE_PATH)

In [6]:
gc.collect()
embeddings_map = {}
for truthful_file in tqdm(TRUTHFUL_FILES):
    speech, rate = librosa.load(TRUTHFUL_PATH + truthful_file, sr=16000)
    input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
    input_values = input_values.cuda()
    features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
    embeddings_map[truthful_file] = features
    gc.collect()
for deceptive_file in tqdm(DECEPTIVE_FILES):
    speech, rate = librosa.load(DECEPTIVE_PATH + deceptive_file, sr=16000)
    input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
    input_values = input_values.cuda()
    features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
    embeddings_map[deceptive_file] = features
    gc.collect()

  0%|          | 0/74 [00:03<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.83 GiB (GPU 0; 4.00 GiB total capacity; 1.20 GiB already allocated; 2.02 GiB free; 1.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
with open('../../embeddings2/acoustic_features_1.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

In [22]:
DIR = '../../embeddings2/AudioFeatures/'
files = os.listdir(DIR)
embeddings_map = {}
for file in tqdm(files):
    with open(DIR + file, 'rb') as f:
        key = file.split('.')[0]
        key = '-'.join(key.split('-')[:3])
        embeddings_map[key] = pickle.load(f).squeeze().mean(0).detach().numpy()
with open('../../embeddings2/acoustic_features_2.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

100%|██████████| 149/149 [00:00<00:00, 1112.03it/s]


In [8]:
class AudioClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 64)
        self.linear2 = torch.nn.Linear(64, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [9]:
class AudioClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [10]:
class AudioDataset(Dataset):
    def __init__(self, data, embeddings_file):
        with open(embeddings_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.labels = [x[1] for x in data]
        data = [x[0] for x in data]
        self.data = [embeddings_map[file] for file in data]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [11]:
class AudioDatasetPCA(Dataset):
    def __init__(self, data, embeddings_file, pca, sc, test=False):
        with open(embeddings_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.labels = [x[1] for x in data]
        data = [x[0] for x in data]
        self.data = np.array([embeddings_map[file] for file in data])
        if not test:
            self.data = sc.fit_transform(self.data)
            self.data = pca.fit_transform(self.data)
        else:
            self.data = sc.transform(self.data)
            self.data = pca.transform(self.data)
        self.data = torch.from_numpy(self.data).float()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
def train(model, train_loader, criterion, optimizer, num_epochs):
    for _ in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [14]:
def eval(model, test_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [18]:
train_data = []
for file in os.listdir('../../data/Acoustic/Truthful/'):
    train_data.append((file, 0))
for file in os.listdir('../../data/Acoustic/Deceptive/'):
    train_data.append((file, 1))

test_data = []
for file in os.listdir('../../Dataset2/Audio/Truth/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_data.append((key, 0))
for file in os.listdir('../../Dataset2/Audio/Lies/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_data.append((key, 1))

In [19]:
def get_accuracy(train_embedding_file, test_embedding_file, do_pca, embedding_size, train_batch_size, num_epochs):
    if not do_pca:
        train_dataset = AudioDataset(train_data, train_embedding_file)
        test_dataset = AudioDataset(test_data, test_embedding_file)
        model = AudioClassifier(embedding_size).to(device)
    else:
        pca = PCA(n_components=embedding_size)
        sc = StandardScaler()
        train_dataset = AudioDatasetPCA(train_data, train_embedding_file, pca, sc, False)
        test_dataset = AudioDatasetPCA(test_data, test_embedding_file, pca, sc, True)
        model = AudioClassifierPCA(embedding_size).to(device)
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    learning_rate = 1e-3
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train(model, train_loader, criterion, optimizer, num_epochs)
    return eval(model, test_loader)

In [23]:
# vggish
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings2/acoustic_features_2.pkl', False, 128, 4, 40)

0.5771812080536913

In [24]:
# vggish with pca
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings2/acoustic_features_2.pkl', True, 64, 4, 40)

0.5167785234899329