In [1]:
import torch
import os
import pickle
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
TRUTHFUL_VIDEO_PATH = '../../data/Clips/Truthful/'
DECEPTIVE_VIDEO_PATH = '../../data/Clips/Deceptive/'
TRUTHFUL_VIDEO_FILES = os.listdir(TRUTHFUL_VIDEO_PATH)
DECEPTIVE_VIDEO_FILES = os.listdir(DECEPTIVE_VIDEO_PATH)

In [3]:
DECEPTIVE_LEXICAL_PATH = '../../data/Transcription/Deceptive/'
DECEPTIVE_LEXICAL_FILES = os.listdir(DECEPTIVE_LEXICAL_PATH)
TRUTHFUL_LEXICAL_PATH = '../../data/Transcription/Truthful/'
TRUTHFUL_LEXICAL_FILES = os.listdir(TRUTHFUL_LEXICAL_PATH)

In [4]:
TRUTHFUL_ACOUSTIC_PATH = '../../data/Acoustic/Truthful/'
DECEPTIVE_ACOUSTIC_PATH = '../../data/Acoustic/Deceptive/'
TRUTHFUL_ACOUSTIC_FILES = os.listdir(TRUTHFUL_ACOUSTIC_PATH)
DECEPTIVE_ACOUSTIC_FILES = os.listdir(DECEPTIVE_ACOUSTIC_PATH)

In [5]:
class LexicalAcousticVideoClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalAcousticVideoClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [6]:
class AudioClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(AudioClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [7]:
class LexicalBertClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalBertClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [8]:
class VisualClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(VisualClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        # Xavier initialize the linear layer
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [9]:
class LexicalAcousticVideoLateFusion(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalAcousticVideoLateFusion, self).__init__()
        self.acoustic = AudioClassifierPCA(embedding_size // 3)
        self.lexical = LexicalBertClassifierPCA(embedding_size // 3)
        self.visual = VisualClassifierPCA(embedding_size // 3)
    
    def forward(self, x):
        # Split x into 3 parts
        x = torch.split(x, x.shape[1] // 3, dim=1)
        return (self.lexical(x[0]), self.acoustic(x[1]), self.visual(x[2]))

In [10]:
class LexicalAcousticVideoDataset(Dataset):
    def __init__(self, data, files, pca, test):
        lexical_data, acoustic_data, video_data = data
        lexical_file, acoustic_file, video_file = files
        pca_l, sc_l, pca_a, sc_a, pca_v, sc_v = pca
        self.lexical_embeddings = []
        with open(lexical_file, 'rb') as f:
            self.embedding_map = pickle.load(f)
        for file in lexical_data:
            embedding = self.embedding_map[file.split('.')[0]]
            if type(embedding) == np.ndarray:
                self.lexical_embeddings.append(embedding)
            else:
                self.lexical_embeddings.append(embedding.numpy())
        self.lexical_embeddings = np.array(self.lexical_embeddings)
        with open(video_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.video_embeddings = np.array([embeddings_map[file.split('.')[0]].numpy() for file in video_data])
        with open(acoustic_file, 'rb') as f:
            embeddings_map = pickle.load(f)
        self.acoustic_embeddings = np.array([embeddings_map[file] for file in acoustic_data])
        if not test:
            self.lexical_embeddings = sc_l.fit_transform(self.lexical_embeddings)
            self.lexical_embeddings = pca_l.fit_transform(self.lexical_embeddings)
            self.acoustic_embeddings = sc_a.fit_transform(self.acoustic_embeddings)
            self.acoustic_embeddings = pca_a.fit_transform(self.acoustic_embeddings)
            self.video_embeddings = sc_v.fit_transform(self.video_embeddings)
            self.video_embeddings = pca_v.fit_transform(self.video_embeddings)
        else:
            self.lexical_embeddings = sc_l.transform(self.lexical_embeddings)
            self.lexical_embeddings = pca_l.transform(self.lexical_embeddings)
            self.acoustic_embeddings = sc_a.transform(self.acoustic_embeddings)
            self.acoustic_embeddings = pca_a.transform(self.acoustic_embeddings)
            self.video_embeddings = sc_v.transform(self.video_embeddings)
            self.video_embeddings = pca_v.transform(self.video_embeddings)
        self.lexical_embeddings = torch.from_numpy(self.lexical_embeddings).float()
        self.acoustic_embeddings = torch.from_numpy(self.acoustic_embeddings).float()
        self.video_embeddings = torch.from_numpy(self.video_embeddings).float()
        self.labels = [0 if file.split('_')[1] == 'truth' else 1 for file in video_data]
    
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return torch.concat((self.lexical_embeddings[index], self.acoustic_embeddings[index], self.video_embeddings[index])), self.labels[index]

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
def train(model, train_loader, criterion, optimizer, num_epochs, late_fusion):
    for _ in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            if late_fusion:
                loss = criterion(out[0], y) + criterion(out[1], y) + criterion(out[2], y)
            else:
                loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [13]:
def eval(model, val_loader, late_fusion):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            if not late_fusion:
                _, predicted = torch.max(out.data, 1)
            else:
                _, p0 = torch.max(out[0].data, 1)
                _, p1 = torch.max(out[1].data, 1)
                _, p2 = torch.max(out[2].data, 1)
                predicted = p0
                predicted[p1 == p2] = p1[p1 == p2]
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [14]:
def kfold(lexical_file, acoustic_file, video_file, train_batch_size, embedding_size):
    truthful_acoustic_data = list(TRUTHFUL_ACOUSTIC_FILES)
    deceptive_acoustic_data = list(DECEPTIVE_ACOUSTIC_FILES)
    truthful_video_data = list(TRUTHFUL_VIDEO_FILES)
    deceptive_video_data = list(DECEPTIVE_VIDEO_FILES)
    truthful_lexical_data = list(TRUTHFUL_LEXICAL_FILES)
    deceptive_lexical_data = list(DECEPTIVE_LEXICAL_FILES)
    for i in range(10):
        val_acoustic_data = truthful_acoustic_data[i*6:(i+1)*6] + deceptive_acoustic_data[i*6:(i+1)*6]
        train_acoustic_data = truthful_acoustic_data[:i*6] + truthful_acoustic_data[(i+1)*6:] + deceptive_acoustic_data[:i*6] + deceptive_acoustic_data[(i+1)*6:]
        val_video_data = truthful_video_data[i*6:(i+1)*6] + deceptive_video_data[i*6:(i+1)*6]
        train_video_data = truthful_video_data[:i*6] + truthful_video_data[(i+1)*6:] + deceptive_video_data[:i*6] + deceptive_video_data[(i+1)*6:]
        val_lexical_data = truthful_lexical_data[i*6:(i+1)*6] + deceptive_lexical_data[i*6:(i+1)*6]
        train_lexical_data = truthful_lexical_data[:i*6] + truthful_lexical_data[(i+1)*6:] + deceptive_lexical_data[:i*6] + deceptive_lexical_data[(i+1)*6:]
        sc_a = StandardScaler()
        pca_a = PCA(n_components=embedding_size // 3)
        sc_v = StandardScaler()
        pca_v = PCA(n_components=embedding_size // 3)
        sc_l = StandardScaler()
        pca_l = PCA(n_components=embedding_size // 3)
        train_dataset = LexicalAcousticVideoDataset((train_lexical_data, train_acoustic_data, train_video_data), (lexical_file, acoustic_file, video_file), (pca_l, sc_l, pca_a, sc_a, pca_v, sc_v), False)
        val_dataset = LexicalAcousticVideoDataset((val_lexical_data, val_acoustic_data, val_video_data), (lexical_file, acoustic_file, video_file), (pca_l, sc_l, pca_a, sc_a, pca_v, sc_v), True)
        train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=12)
        yield train_loader, val_loader

In [15]:
def run_one_fold(train_loader, val_loader, num_epochs, embedding_size, late_fusion):
    if not late_fusion:
        model = LexicalAcousticVideoClassifier(embedding_size)
    else:
        model = LexicalAcousticVideoLateFusion(embedding_size)
    model.to(device)
    learning_rate = 1e-3
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train(model, train_loader, criterion, optimizer, num_epochs, late_fusion)
    return eval(model, val_loader, late_fusion)

In [16]:
def get_model_accuracy(lexical_file, acoustic_file, video_file, train_batch_size, num_epochs, embedding_size = 96, late_fusion = False):
    accuracies = []
    for train_loader, val_loader in kfold(lexical_file, acoustic_file, video_file, train_batch_size, embedding_size):
        accuracies.append(run_one_fold(train_loader, val_loader, num_epochs, embedding_size, late_fusion))
    return sum(accuracies) / len(accuracies)

In [22]:
# Model accuracy for bert-base-uncased, hubert-large-ls960-ft and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.55

In [23]:
# Model accuracy for bert-base-cased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.5583333333333333

In [24]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.6

In [25]:
# Model accuracy for mpNet and hubert-large-ls960-ft and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.5666666666666667

In [26]:
# Model accuracy for bert-base-uncased, hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.6249999999999999

In [27]:
# Model accuracy for bert-base-uncased, hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.6166666666666666

In [28]:
# Model accuracy for bert-base-cased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.55

In [29]:
# Model accuracy for bert-base-cased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.5416666666666666

In [30]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.6416666666666666

In [31]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.6583333333333333

In [32]:
# Model accuracy for mpNet and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.7083333333333334

In [33]:
# Model accuracy for mpNet and hubert-large-ls960-ft and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_1.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.65

In [34]:
# Model accuracy for bert-base-uncased, vggish and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.6416666666666667

In [35]:
# Model accuracy for bert-base-cased-finetuned-emotion and vggish and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.5750000000000001

In [19]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and vggish and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.6333333333333333

In [20]:
# Model accuracy for mpNet and vggish and resnet-3d-18
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96)

0.6

In [21]:
# Model accuracy for bert-base-uncased, vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.65

In [36]:
# Model accuracy for bert-base-uncased, vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_1.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.5916666666666666

In [37]:
# Model accuracy for bert-base-cased-finetuned-emotion and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.625

In [38]:
# Model accuracy for bert-base-cased-finetuned-emotion and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_2.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.6333333333333334

In [39]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.5916666666666666

In [40]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_3.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.5999999999999999

In [41]:
# Model accuracy for mpNet and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 96, True)

0.6833333333333333

In [42]:
# Model accuracy for mpNet and vggish and resnet-3d-18 Late Fusion
get_model_accuracy('../../embeddings/transcript_features_6.pkl', '../../embeddings/acoustic_features_2.pkl', '../../embeddings/visual_features.pkl', 4, 20, 192, True)

0.575