In [42]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, DistilBertModel, DistilBertTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
import os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
DECEPTIVE_DIR = '../../data/Transcription/Deceptive/'
deceptive = os.listdir(DECEPTIVE_DIR)
TRUTHFUL_DIR = '../../data/Transcription/Truthful/'
truthful = os.listdir(TRUTHFUL_DIR)
print('Deceptive size:', len(deceptive))
print('Truthful size:', len(truthful))

Deceptive size: 61
Truthful size: 60


In [3]:
def prepare_text(text, tokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensors

In [4]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensor)
        if type(model) is DistilBertModel:
            return outputs[0][0, 0, :]
        else:
            hidden_states = outputs[2]
            token_embeddings = (hidden_states[-1][0, 0, :] + hidden_states[-2][0, 0, :] + hidden_states[-3][0, 0, :] + hidden_states[-4][0, 0, :]) / 4
            return token_embeddings

In [5]:
def save_embeddings(bert, tokenizer, output_file):
    embedding_map = {}
    for file in truthful:
        with open(TRUTHFUL_DIR + file, encoding='utf8') as f:
            text = f.read()
            tokenized_text, tokens_tensor, segments_tensor = prepare_text(text, tokenizer)
            embeddings = get_bert_embeddings(tokens_tensor, segments_tensor, bert)
            embedding_map[file.split('.')[0]] = embeddings
    for file in deceptive:
        with open(DECEPTIVE_DIR + file, encoding='utf8') as f:
            text = f.read()
            tokenized_text, tokens_tensor, segments_tensor = prepare_text(text, tokenizer)
            embeddings = get_bert_embeddings(tokens_tensor, segments_tensor, bert)
            embedding_map[file.split('.')[0]] = embeddings
    with open(output_file, 'wb') as f:
        pickle.dump(embedding_map, f)

In [85]:
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
bert.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
save_embeddings(bert, tokenizer, '../../embeddings/transcript_features_1.pkl')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
bert_emotion = AutoModelForSequenceClassification.from_pretrained("ncduy/bert-base-cased-finetuned-emotion", output_hidden_states = True).bert
bert_emotion.eval()
tokenizer = AutoTokenizer.from_pretrained("ncduy/bert-base-cased-finetuned-emotion")
save_embeddings(bert_emotion, tokenizer, '../../embeddings/transcript_features_2.pkl')

In [89]:
distil_bert_emotion = AutoModel.from_pretrained("transformersbook/distilbert-base-uncased-finetuned-emotion", output_hidden_states = True)
distil_bert_emotion.eval()
tokenizer = AutoTokenizer.from_pretrained("transformersbook/distilbert-base-uncased-finetuned-emotion")
save_embeddings(distil_bert_emotion, tokenizer, '../../embeddings/transcript_features_3.pkl')

Some weights of the model checkpoint at transformersbook/distilbert-base-uncased-finetuned-emotion were not used when initializing DistilBertModel: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [91]:
distil_bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
distil_bert_model.eval()
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
save_embeddings(distil_bert_model, tokenizer, '../../embeddings/transcript_features_4.pkl')

Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 121kB/s]
Downloading pytorch_model.bin: 100%|██████████| 268M/268M [00:23<00:00, 11.5MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 222kB/s]
Downloading (…

In [6]:
def save_sentence_transformer_embeddings(model, output_file):
    embedding_map = {}
    for file in truthful:
        with open(TRUTHFUL_DIR + file, encoding='utf8') as f:
            text = f.read()
            embeddings = model.encode(text)
            embedding_map[file.split('.')[0]] = embeddings
    for file in deceptive:
        with open(DECEPTIVE_DIR + file, encoding='utf8') as f:
            text = f.read()
            embeddings = model.encode(text)
            embedding_map[file.split('.')[0]] = embeddings
    with open(output_file, 'wb') as f:
        pickle.dump(embedding_map, f)

In [31]:
miniLM_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
save_sentence_transformer_embeddings(miniLM_model, '../../embeddings/transcript_features_5.pkl')

NameError: name 'save_sentence_transformer_embeddings' is not defined

In [32]:
mpnet_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
save_sentence_transformer_embeddings(mpnet_model, '../../embeddings/transcript_features_6.pkl')

NameError: name 'save_sentence_transformer_embeddings' is not defined

In [59]:
bert = BertModel.from_pretrained('bert-large-uncased', output_hidden_states = True)
bert.eval()
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
save_embeddings(bert, tokenizer, '../../embeddings/transcript_features_7.pkl')

Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 47.5kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [02:01<00:00, 11.0MB/s]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenc

In [20]:
class LexicalBertClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalBertClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 64)
        self.linear2 = torch.nn.Linear(64, 2)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [21]:
class LexicalContrastiveClassifier(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalContrastiveClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 256)
        self.linear2 = torch.nn.Linear(256, 64)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [77]:
class LexicalBertClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalBertClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 16)
        self.linear2 = torch.nn.Linear(16, 2)
        torch.nn.init.kaiming_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
    
    def forward(self, x):
        x = self.linear1(torch.nn.functional.relu(x))
        return self.linear2(torch.nn.functional.relu(x))

In [78]:
class LexicalContrastiveClassifierPCA(torch.nn.Module):
    def __init__(self, embedding_size):
        super(LexicalContrastiveClassifierPCA, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_size, 32)
        torch.nn.init.xavier_uniform_(self.linear1.weight)
    
    def forward(self, x):
        return self.linear1(torch.nn.functional.relu(x))

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
class LexicalBertDataset(Dataset):
    def __init__(self, data, embedding_map_file) -> None:
        self.data = data
        with open(embedding_map_file, 'rb') as f:
            self.embedding_map = pickle.load(f)
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        embeddings = self.embedding_map[self.data[index][0].split('.')[0]]
        return embeddings, self.data[index][1]

In [76]:
class LexicalBertDatasetPCA(Dataset):
    def __init__(self, data, embedding_map_file, pca, sc, test) -> None:
        self.embeddings = []
        self.labels = []
        with open(embedding_map_file, 'rb') as f:
            self.embedding_map = pickle.load(f)
        for file, label in data:
            embedding = self.embedding_map[file.split('.')[0]]
            if type(embedding) == np.ndarray:
                self.embeddings.append(embedding)
            else:
                self.embeddings.append(embedding.numpy())
            self.labels.append(label)
        self.embeddings = np.array(self.embeddings)
        if not test:
            self.embeddings = sc.fit_transform(self.embeddings)
            self.embeddings = pca.fit_transform(self.embeddings)
        else:
            self.embeddings = sc.transform(self.embeddings)
            self.embeddings = pca.transform(self.embeddings)
        self.embeddings = torch.from_numpy(self.embeddings).float()

    
    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, index):
        return self.embeddings[index], self.labels[index]

In [8]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [9]:
def train(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [10]:
def eval(model, val_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [11]:
def contrast(out, y, truth_out, deception_out, margin):
    truth_similarity = torch.nn.functional.cosine_similarity(out, truth_out, dim=1)
    deception_similarity = torch.nn.functional.cosine_similarity(out, deception_out, dim=1)
    loss = torch.sum(y * torch.nn.functional.relu(margin - truth_similarity + deception_similarity) + (1 - y) * torch.nn.functional.relu(margin - deception_similarity + truth_similarity))
    return loss

In [12]:
def train_contrastive(model, train_loader, optimizer, num_epochs, truth_statement, deception_statement, margin):
    for epoch in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            truth_out = model(truth_statement)
            deception_out = model(deception_statement)
            loss = contrast(out, y, truth_out, deception_out, margin)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [13]:
def eval_contrastive(model, val_loader, truth_statement, deception_statement):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            out = model(x)
            truth_out = model(truth_statement)
            deception_out = model(deception_statement)
            truth_similarity = torch.nn.functional.cosine_similarity(out, truth_out, dim=1)
            deception_similarity = torch.nn.functional.cosine_similarity(out, deception_out, dim=1)
            _, predicted = torch.max(torch.stack((truth_similarity, deception_similarity), dim=1).data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return correct / total

In [92]:
def kfold(embeddings_map_file, train_batch_size, do_pca, embedding_size):
    truthful_data = [(x, 0) for x in truthful]
    deceptive_data = [(x, 1) for x in deceptive]
    for i in range(10):
        val_data = truthful_data[i*6:(i+1)*6] + deceptive_data[i*6:(i+1)*6]
        train_data = truthful_data[:i*6] + truthful_data[(i+1)*6:] + deceptive_data[:i*6] + deceptive_data[(i+1)*6:]
        if not do_pca:
            train_dataset = LexicalBertDataset(train_data, embeddings_map_file)
            val_dataset = LexicalBertDataset(val_data, embeddings_map_file)
            train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=12, shuffle=True)
            yield train_loader, val_loader, None
        else:
            sc = StandardScaler()
            pca = PCA(n_components=embedding_size)
            train_dataset = LexicalBertDatasetPCA(train_data, embeddings_map_file, pca, sc, False)
            val_dataset = LexicalBertDatasetPCA(val_data, embeddings_map_file, pca, sc, True)
            train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=12, shuffle=True)
            yield train_loader, val_loader, (pca, sc)

In [93]:
def run_one_fold(train_loader, val_loader, num_epochs, embedding_size, contrastive, truth_statement, deception_statement, margin, do_pca):
    if not contrastive:
        if not do_pca:
            model = LexicalBertClassifier(embedding_size)
        else:
            model = LexicalBertClassifierPCA(embedding_size)
        model.to(device)
        learning_rate = 1e-3
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        train(model, train_loader, criterion, optimizer, num_epochs)
        return eval(model, val_loader)
    if contrastive:
        if not do_pca:
            model = LexicalContrastiveClassifier(embedding_size)
        else:
            model = LexicalContrastiveClassifierPCA(embedding_size)
        model.to(device)
        learning_rate = 1e-3
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        train_contrastive(model, train_loader, optimizer, num_epochs, truth_statement, deception_statement, margin)
        return eval_contrastive(model, val_loader, truth_statement, deception_statement)

In [109]:
def get_model_accuracy(embeddings_map_file, train_batch_size, num_epochs, embedding_size = 768, contrastive = False, truth_statement = None, deception_statement = None, margin = 0, do_pca = False):
    accuracies = []
    for train_loader, val_loader, args in kfold(embeddings_map_file, train_batch_size, do_pca, embedding_size):
        if do_pca and contrastive:
            pca, sc = args
            truth_statement_copy = torch.from_numpy(pca.transform(sc.transform([truth_statement.cpu().numpy()]))[0]).float().to(device)
            deception_statement_copy = torch.from_numpy(pca.transform(sc.transform([deception_statement.cpu().numpy()]))[0]).float().to(device)
        else:
            truth_statement_copy = truth_statement
            deception_statement_copy = deception_statement
        accuracies.append(run_one_fold(train_loader, val_loader, num_epochs, embedding_size, contrastive, truth_statement_copy, deception_statement_copy, margin, do_pca))
    print(accuracies)
    return sum(accuracies) / len(accuracies)

In [24]:
# Model accuracy for bert-base-uncased
print(get_model_accuracy('../../embeddings/transcript_features_1.pkl', train_batch_size = 4, num_epochs = 40))

[0.6666666666666666, 0.6666666666666666, 0.5, 0.5833333333333334, 0.6666666666666666, 0.3333333333333333, 0.5833333333333334, 0.75, 0.6666666666666666, 0.6666666666666666]
0.6083333333333334


In [25]:
# Model accuracy for bert-base-cased-finetuned-emotion
print(get_model_accuracy('../../embeddings/transcript_features_2.pkl', train_batch_size = 4, num_epochs = 40))

[0.6666666666666666, 0.5833333333333334, 0.25, 0.8333333333333334, 0.6666666666666666, 0.5, 0.6666666666666666, 0.75, 0.75, 0.5833333333333334]
0.625


In [26]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion
print(get_model_accuracy('../../embeddings/transcript_features_3.pkl', train_batch_size = 4, num_epochs = 40))

[0.5, 0.5, 0.5833333333333334, 0.75, 0.75, 0.75, 0.5, 0.5833333333333334, 0.75, 0.3333333333333333]
0.6


In [27]:
# Model accuracy for distil-bert-base-uncased
print(get_model_accuracy('../../embeddings/transcript_features_4.pkl', train_batch_size = 4, num_epochs = 40))

[0.3333333333333333, 0.5, 0.3333333333333333, 0.5833333333333334, 0.5833333333333334, 0.5, 0.5833333333333334, 0.75, 0.75, 0.4166666666666667]
0.5333333333333334


In [28]:
# Model accuracy for sentence-transformer/all-MiniLM-L6-v2
print(get_model_accuracy('../../embeddings/transcript_features_5.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 384))

[0.5, 0.5, 0.25, 0.75, 0.5833333333333334, 0.5833333333333334, 0.3333333333333333, 0.6666666666666666, 0.5833333333333334, 0.5]
0.525


In [29]:
# Model accuracy for sentence-transformer/all-mpnet-base-v2
print(get_model_accuracy('../../embeddings/transcript_features_6.pkl', train_batch_size = 4, num_epochs = 40))

[0.5833333333333334, 0.5, 0.25, 0.8333333333333334, 0.5833333333333334, 0.5, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.4166666666666667]
0.5666666666666668


In [33]:
# Cosine similarity check for miniLM
with open('../../embeddings/transcript_features_5.pkl', 'rb') as f:
    embedding_map = pickle.load(f)
truth_statement = miniLM_model.encode('This is the truth')
deception_statement = miniLM_model.encode('This is a lie')
count = 0
for i in truthful:
    if cosine_similarity(embedding_map[i.split('.')[0]], truth_statement) > cosine_similarity(embedding_map[i.split('.')[0]], deception_statement):
        count += 1
for i in deceptive:
    if cosine_similarity(embedding_map[i.split('.')[0]], deception_statement) > cosine_similarity(embedding_map[i.split('.')[0]], truth_statement):
        count += 1
print(count / (len(truthful) + len(deceptive)))

0.5206611570247934


In [34]:
# Cosine similarity check for mpNet
with open('../../embeddings/transcript_features_6.pkl', 'rb') as f:
    embedding_map = pickle.load(f)
truth_statement = mpnet_model.encode('This is the truth')
deception_statement = mpnet_model.encode('This is a lie')
count = 0
for i in truthful:
    if cosine_similarity(embedding_map[i.split('.')[0]], truth_statement) > cosine_similarity(embedding_map[i.split('.')[0]], deception_statement):
        count += 1
for i in deceptive:
    if cosine_similarity(embedding_map[i.split('.')[0]], deception_statement) > cosine_similarity(embedding_map[i.split('.')[0]], truth_statement):
        count += 1
print(count / (len(truthful) + len(deceptive)))

0.5289256198347108


In [35]:
# Contrastive model accuracy for miniLM
print(get_model_accuracy('../../embeddings/transcript_features_5.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 384, contrastive = True, truth_statement = torch.tensor(miniLM_model.encode('This is the truth')).to(device), deception_statement = torch.tensor(miniLM_model.encode('This is a lie')).to(device), margin = 0.05))

[0.5, 0.75, 0.5833333333333334, 0.3333333333333333, 0.5, 0.3333333333333333, 0.5, 0.25, 0.4166666666666667, 0.5]
0.4666666666666667


In [36]:
# Contrasting model accuracy for mpNet
print(get_model_accuracy('../../embeddings/transcript_features_6.pkl', train_batch_size = 4, num_epochs = 40, contrastive = True, truth_statement = torch.tensor(mpnet_model.encode('This is the truth')).to(device), deception_statement = torch.tensor(mpnet_model.encode('This is a lie')).to(device), margin = 0.05))

[0.4166666666666667, 0.4166666666666667, 0.6666666666666666, 0.25, 0.4166666666666667, 0.3333333333333333, 0.3333333333333333, 0.5, 0.4166666666666667, 0.75]
0.45


In [37]:
# Model accuracy for bert-large-uncased
print(get_model_accuracy('../../embeddings/transcript_features_7.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 1024))

[0.5, 0.5833333333333334, 0.5, 0.6666666666666666, 0.5, 0.4166666666666667, 0.5, 0.5, 0.6666666666666666, 0.5833333333333334]
0.5416666666666666


In [83]:
# Model accuracy for bert-base-uncased with pca
print(get_model_accuracy('../../embeddings/transcript_features_1.pkl', train_batch_size = 2, num_epochs = 20, embedding_size = 64, do_pca = True))

[0.5833333333333334, 0.75, 0.4166666666666667, 0.6666666666666666, 0.6666666666666666, 0.4166666666666667, 0.5833333333333334, 0.6666666666666666, 0.75, 0.5833333333333334]
0.6083333333333333


In [84]:
# Model accuracy for bert-base-cased-finetuned-emotion with pca
print(get_model_accuracy('../../embeddings/transcript_features_2.pkl', train_batch_size = 2, num_epochs = 20, embedding_size = 64, do_pca = True))

[0.5833333333333334, 0.4166666666666667, 0.5, 0.6666666666666666, 0.75, 0.4166666666666667, 0.5833333333333334, 0.5833333333333334, 0.4166666666666667, 0.5833333333333334]
0.55


In [85]:
# Model accuracy for transformersbook/distilbert-base-uncased-finetuned-emotion with pca
print(get_model_accuracy('../../embeddings/transcript_features_3.pkl', train_batch_size = 2, num_epochs = 20, embedding_size=64, do_pca = True))

[0.5833333333333334, 0.5, 0.5833333333333334, 0.6666666666666666, 0.5, 0.5833333333333334, 0.5, 0.5833333333333334, 0.5833333333333334, 0.5833333333333334]
0.5666666666666667


In [86]:
# Model accuracy for distil-bert-base-uncased with pca
print(get_model_accuracy('../../embeddings/transcript_features_4.pkl', train_batch_size = 2, num_epochs = 20, embedding_size=100, do_pca = True))

[0.5, 0.9166666666666666, 0.25, 0.4166666666666667, 0.5, 0.5, 0.75, 0.75, 0.5833333333333334, 0.8333333333333334]
0.5999999999999999


In [87]:
# Model accuracy for sentence-transformer/all-MiniLM-L6-v2 with pca
print(get_model_accuracy('../../embeddings/transcript_features_5.pkl', train_batch_size = 2, num_epochs = 20, embedding_size = 100, do_pca=True))

[0.25, 0.5, 0.5833333333333334, 0.6666666666666666, 0.5, 0.5, 0.5, 0.5, 0.75, 0.4166666666666667]
0.5166666666666667


In [88]:
# Model accuracy for sentence-transformer/all-mpnet-base-v2 with pca
print(get_model_accuracy('../../embeddings/transcript_features_6.pkl', train_batch_size = 2, num_epochs = 20, embedding_size=100, do_pca = True))

[0.5833333333333334, 0.5833333333333334, 0.5833333333333334, 0.8333333333333334, 0.6666666666666666, 0.75, 0.4166666666666667, 0.75, 0.5, 0.5]
0.6166666666666667


In [106]:
# Contrastive model accuracy for miniLM with pca
print(get_model_accuracy('../../embeddings/transcript_features_5.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 64, contrastive = True, truth_statement = torch.tensor(miniLM_model.encode('This is the truth')).to(device), deception_statement = torch.tensor(miniLM_model.encode('This is a lie')).to(device), margin = 0.05, do_pca=True))

[0.5, 0.6666666666666666, 0.5833333333333334, 0.4166666666666667, 0.4166666666666667, 0.4166666666666667, 0.5, 0.3333333333333333, 0.25, 0.25]
0.4333333333333333


In [107]:
# Contrasting model accuracy for mpNet with pca
print(get_model_accuracy('../../embeddings/transcript_features_6.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 64, contrastive = True, truth_statement = torch.tensor(mpnet_model.encode('This is the truth')).to(device), deception_statement = torch.tensor(mpnet_model.encode('This is a lie')).to(device), margin = 0.05, do_pca=True))

[0.5, 0.8333333333333334, 0.5, 0.25, 0.5833333333333334, 0.5, 0.5, 0.3333333333333333, 0.16666666666666666, 0.25]
0.4416666666666667


In [110]:
# Model accuracy for bert-large-uncased with pca
print(get_model_accuracy('../../embeddings/transcript_features_7.pkl', train_batch_size = 4, num_epochs = 40, embedding_size = 64, do_pca = True))

[0.5, 0.8333333333333334, 0.3333333333333333, 0.6666666666666666, 0.6666666666666666, 0.4166666666666667, 0.5, 0.5833333333333334, 0.3333333333333333, 0.5]
0.5333333333333333
