In [5]:
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor
import torch
import os
import gc
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [6]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")  

In [7]:
embeddings_map = {}
TRUTHFUL_PATH = '../../data/Acoustic/Truthful/'
DECEPTIVE_PATH = '../../data/Acoustic/Deceptive/'
TRUTHFUL_FILES = os.listdir(TRUTHFUL_PATH)
DECEPTIVE_FILES = os.listdir(DECEPTIVE_PATH)
for truthful_file in tqdm(TRUTHFUL_FILES):
    speech, rate = librosa.load(TRUTHFUL_PATH + truthful_file, sr=16000)
    input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
    features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
    embeddings_map[truthful_file] = features
    gc.collect()
for deceptive_file in tqdm(DECEPTIVE_FILES):
    speech, rate = librosa.load(DECEPTIVE_PATH + deceptive_file, sr=16000)
    input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
    features = model(input_values, output_hidden_states=True).hidden_states[-1].squeeze(0).mean(0).detach().numpy()
    embeddings_map[deceptive_file] = features
    gc.collect()

100%|██████████| 60/60 [20:34<00:00, 20.57s/it]   
100%|██████████| 61/61 [17:39<00:00, 17.38s/it] 


In [8]:
with open('../../embeddings/acoustic_features_1.pkl', 'wb') as f:
    pickle.dump(embeddings_map, f)

In [20]:
class AudioClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifier, self).__init__()
        self.linear = torch.nn.Linear(embedding_size, 2)
        # Xavier initialize the linear layer
        torch.nn.init.xavier_uniform_(self.linear.weight)
    
    def forward(self, x):
        return self.linear(torch.nn.functional.relu(x))

In [18]:
class AudioDataset(Dataset):
    def __init__(self, data, embeddings_file):
        with open(embeddings_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.data = [embeddings_map[file] for file in data]
        self.labels = [0 if file.split('_')[0] == 'truthful' else 1 for file in data]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
def train(model, train_loader, criterion, optimizer, num_epochs):
    for _ in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [13]:
def eval(model, val_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [38]:
def kfold(embeddings_map_file, train_batch_size):
    truthful_data = list(TRUTHFUL_FILES)
    deceptive_data = list(DECEPTIVE_FILES)
    for i in range(10):
        val_data = truthful_data[i*6:(i+1)*6] + deceptive_data[i*6:(i+1)*6]
        train_data = truthful_data[:i*6] + truthful_data[(i+1)*6:] + deceptive_data[:i*6] + deceptive_data[(i+1)*6:]
        train_dataset = AudioDataset(train_data, embeddings_map_file)
        val_dataset = AudioDataset(val_data, embeddings_map_file)
        train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=12, shuffle=True)
        yield train_loader, val_loader

In [45]:
def run_one_fold(train_loader, val_loader, num_epochs, embedding_size):
    model = AudioClassifier(embedding_size)
    model.to(device)
    learning_rate = 1e-3
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train(model, train_loader, criterion, optimizer, num_epochs)
    return eval(model, val_loader)

In [46]:
def get_model_accuracy(embeddings_map_file, train_batch_size, num_epochs, embedding_size = 1024):
    accuracies = []
    for train_loader, val_loader in kfold(embeddings_map_file, train_batch_size):
        accuracies.append(run_one_fold(train_loader, val_loader, num_epochs, embedding_size))
    return sum(accuracies) / len(accuracies)

In [47]:
# Model accuracy for hubert-large-ls960-ft 
get_model_accuracy('../../embeddings/acoustic_features_1.pkl', 2, 10)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


1.0