In [1]:
import torch
import os
import pickle
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
class AcousticAndVideoClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AcousticAndVideoClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [3]:
class AcousticAndVideoDataset(Dataset):
    def __init__(self, acoustic_data, video_data, labels, acoustic_file, video_file, pca_a, sc_a, pca_v, sc_v, test):
        with open(acoustic_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.acoustic_embeddings = np.array([embeddings_map[file] for file in acoustic_data])
        with open(video_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.video_embeddings = np.array([embeddings_map[file.split('.')[0]].numpy() for file in video_data])
        if not test:
            self.acoustic_embeddings = sc_a.fit_transform(self.acoustic_embeddings)
            self.acoustic_embeddings = pca_a.fit_transform(self.acoustic_embeddings)
            self.video_embeddings = sc_v.fit_transform(self.video_embeddings)
            self.video_embeddings = pca_v.fit_transform(self.video_embeddings)
        else:
            self.acoustic_embeddings = sc_a.transform(self.acoustic_embeddings)
            self.acoustic_embeddings = pca_a.transform(self.acoustic_embeddings)
            self.video_embeddings = sc_v.transform(self.video_embeddings)
            self.video_embeddings = pca_v.transform(self.video_embeddings)
        self.acoustic_embeddings = torch.from_numpy(self.acoustic_embeddings).float()
        self.video_embeddings = torch.from_numpy(self.video_embeddings).float()
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return torch.concat((self.acoustic_embeddings[index], self.video_embeddings[index])), self.labels[index]

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def train(model, train_loader, criterion, optimizer, num_epochs):
    for _ in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [6]:
def eval(model, val_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [7]:
train_acoustic_data = []
train_labels = []
for file in os.listdir('../../data/Acoustic/Truthful/'):
    train_acoustic_data.append(file)
    train_labels.append(0)
for file in os.listdir('../../data/Acoustic/Deceptive/'):
    train_acoustic_data.append(file)
    train_labels.append(1)

test_acoustic_data = []
test_labels = []
for file in os.listdir('../../Dataset2/Audio/Truth/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_acoustic_data.append(key)
    test_labels.append(0)
for file in os.listdir('../../Dataset2/Audio/Lies/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_acoustic_data.append(key)
    test_labels.append(1)

train_video_data = []
for file in os.listdir('../../data/Clips/Truthful/'):
    train_video_data.append(file.split('.')[0])
for file in os.listdir('../../data/Clips/Deceptive/'):
    train_video_data.append(file.split('.')[0])

test_video_data = []
for file in os.listdir('../../Dataset2/Videos/Truth/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_video_data.append(key)
for file in os.listdir('../../Dataset2/Videos/Lie/'):
    key = file.split('.')[0]
    key = '-'.join(key.split('-')[:3])
    test_video_data.append(key)


In [8]:
def get_accuracy(train_acoustic_file, train_video_file, test_acoustic_file, test_video_file, embedding_size, train_batch_size, num_epochs):
    sc_v = StandardScaler()
    pca_v = PCA(n_components=embedding_size // 2)
    sc_a = StandardScaler()
    pca_a = PCA(n_components=embedding_size // 2)
    train_dataset = AcousticAndVideoDataset(train_acoustic_data, train_video_data, train_labels, train_acoustic_file, train_video_file, pca_a, sc_a, pca_v, sc_v, False)
    test_dataset = AcousticAndVideoDataset(test_acoustic_data, test_video_data, test_labels, test_acoustic_file, test_video_file, pca_a, sc_a, pca_v, sc_v, True)
    model = AcousticAndVideoClassifier(embedding_size).to(device)
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    learning_rate = 1e-3
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train(model, train_loader, criterion, optimizer, num_epochs)
    return eval(model, test_loader)

In [18]:
# Model accuracy for vggish and resnet-3d-18
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features.pkl', 64, 4, 20)

0.4899328859060403

In [21]:
# Model accuracy for vggish and resnet-3d-18
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features.pkl', 96, 4, 20)

0.5100671140939598

In [9]:
# Model accuracy for vggish and deep id face
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features_2.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features_2.pkl', 96, 4, 20)

0.5436241610738255

In [10]:
# Model accuracy for vggish and facenet
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features_3.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features_3.pkl', 96, 4, 20)

0.5302013422818792

In [12]:
# Model accuracy for vggish and openface
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features_4.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features_4.pkl', 96, 4, 20)

0.5570469798657718

In [13]:
# Model accuracy for vggish and vggface
get_accuracy('../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features_5.pkl', '../../embeddings2/acoustic_features_2.pkl', '../../embeddings2/visual_features_5.pkl', 96, 4, 20)

0.5369127516778524