In [7]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, DistilBertModel, DistilBertTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import gc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Locate the transcript files of each class.
# os.listdir returns names in arbitrary order, so the listings are sorted to
# make every later pass over the dataset (and the insertion order of the saved
# embedding maps) reproducible from run to run.
DECEPTIVE_DIR = '../../Dataset2/Text/Lie/'
deceptive = sorted(os.listdir(DECEPTIVE_DIR))
TRUTHFUL_DIR = '../../Dataset2/Text/Truth/'
truthful = sorted(os.listdir(TRUTHFUL_DIR))
print('Deceptive size:', len(deceptive))
print('Truthful size:', len(truthful))

Deceptive size: 75
Truthful size: 74


In [35]:
# Word count of every transcript, as (word_count, filename) pairs, printed in
# ascending order to see how long the texts are before choosing chunk sizes.
A = []
for directory, filenames in ((DECEPTIVE_DIR, deceptive), (TRUTHFUL_DIR, truthful)):
    for file in filenames:
        # Read as UTF-8 explicitly, the same way the embedding functions read
        # these files, instead of relying on the platform default encoding.
        with open(directory + file, 'r', encoding='utf8') as f:
            A.append((len(f.read().split()), file))
print(sorted(A))

[(115, '33-47-982.txt'), (117, '09-18-253.txt'), (124, '22-05-764.txt'), (126, '57-43-236.txt'), (132, '18-42-263.txt'), (143, '38-26-565.txt'), (149, '05-50-814.txt'), (151, '04-30-467.txt'), (162, '15-13-468.txt'), (166, '31-08-51.txt'), (168, '11-17-124.txt'), (180, '17-10-481.txt'), (181, '14-45-849.txt'), (183, '08-33-112.txt'), (190, '27-35-915.txt'), (193, '00-34-288.txt'), (193, '06-18-285.txt'), (194, '02-20-877.txt'), (194, '12-03-237.txt'), (198, '11-22-778.txt'), (200, '35-42-851.txt'), (201, '00-13-953.txt'), (203, '31-59-194.txt'), (208, '09-02-798.txt'), (211, '18-39-581.txt'), (214, '13-27-771.txt'), (216, '37-36-401.txt'), (219, '12-05-243.txt'), (223, '36-53-693.txt'), (227, '50-29-214.txt'), (228, '15-00-532.txt'), (229, '18-48-414.txt'), (230, '06-57-279.txt'), (234, '21-03-94.txt'), (236, '48-39-430.txt'), (239, '29-48-550.txt'), (241, '12-23-235.txt'), (242, '06-44-152.txt'), (244, '27-19-148.txt'), (244, '57-52-839.txt'), (246, '17-15-163.txt'), (249, '09-14-629.

In [3]:
def prepare_text(text, tokenizer):
    """Tokenize ``text`` and build the tensors fed to the encoder.

    The text is wrapped in "[CLS] ... [SEP]" markers, split with
    ``tokenizer.tokenize`` and mapped to ids with
    ``tokenizer.convert_tokens_to_ids``. No truncation is applied here.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A 3-tuple ``(tokens, tokens_tensor, segments_tensors)``:
        the list of tokens, a tensor of their ids with a leading batch
        dimension of 1, and a same-length tensor filled with ones.
    """
    tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # One "1" per token; get_bert_embeddings passes this as the model's
    # second positional argument.
    ones = [1] * len(token_ids)
    return tokens, torch.tensor([token_ids]), torch.tensor([ones])

In [4]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """Return one embedding vector for the first token of the first sequence.

    The model is called without gradient tracking as
    ``model(tokens_tensor, segments_tensor)``.

    * If ``model`` is exactly a ``DistilBertModel``, the result is
      ``outputs[0][0, 0, :]``.
    * Otherwise ``outputs[2]`` is indexed as a sequence of per-layer states
      and the last four layers are averaged at position ``[0, 0, :]``.

    With inputs built by ``prepare_text`` position 0 is the "[CLS]" token.

    NOTE(review): both tensors are passed positionally; in Hugging Face
    models the second positional parameter is presumably ``attention_mask``
    rather than token type ids - confirm against the installed version.
    NOTE(review): the non-DistilBert branch presumably needs the model to be
    loaded with ``output_hidden_states=True`` so that ``outputs[2]`` exists.
    """
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensor)
        if type(model) is DistilBertModel:
            return outputs[0][0, 0, :]
        layers = outputs[2]
        # Accumulate from the last layer backwards (-1, -2, -3, -4).
        summed = layers[-1][0, 0, :]
        for layer_index in (-2, -3, -4):
            summed = summed + layers[layer_index][0, 0, :]
        return summed / 4

In [5]:
def _overlapping_word_chunks(words, window, stride):
    """Split ``words`` into space-joined chunks of at most ``window`` words.

    Consecutive chunks start ``stride`` words apart, so they overlap by
    ``window - stride`` words. Chunking stops as soon as a chunk reaches the
    end of the text, so no trailing chunk is produced that is entirely
    contained in the previous one. An empty ``words`` list yields ``[]``.
    """
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(' '.join(words[start:start + window]))
        if start + window >= len(words):
            break
        start += stride
    return chunks


def _embed_transcripts(directory, filenames, bert, tokenizer, window, stride):
    """Return ``{file stem: mean chunk embedding}`` for the files in ``directory``."""
    embedding_map = {}
    for file in filenames:
        path = os.path.join(directory, file)
        with open(path, encoding='utf8') as f:
            words = f.read().split()
        subtexts = _overlapping_word_chunks(words, window, stride)
        if not subtexts:
            # torch.stack([]) would fail with a message that does not name the file.
            raise ValueError('Transcript contains no text: ' + path)
        embeddings = []
        for subtext in subtexts:
            _, tokens_tensor, segments_tensor = prepare_text(subtext, tokenizer)
            embeddings.append(get_bert_embeddings(tokens_tensor, segments_tensor, bert))
        # Key is the file name up to its first dot; value is the chunk average.
        embedding_map[file.split('.')[0]] = torch.mean(torch.stack(embeddings), dim=0)
    return embedding_map


def save_embeddings(bert, tokenizer, output_file, window=350, stride=280):
    """Embed every truthful and deceptive transcript and pickle the result.

    Each transcript is split on whitespace into overlapping chunks of
    ``window`` words whose starts are ``stride`` words apart (the defaults
    give a 20% overlap). Every chunk is embedded with ``prepare_text`` and
    ``get_bert_embeddings``, and the chunk embeddings are averaged.

    NOTE(review): chunk sizes are counted in words, not tokenizer tokens, and
    ``prepare_text`` does not truncate - confirm that ``window`` keeps the
    tokenized chunks within the model's input limit.

    Args:
        bert: Model passed through to ``get_bert_embeddings``.
        tokenizer: Tokenizer passed through to ``prepare_text``.
        output_file: Path of the pickle file to write.
        window: Maximum number of words per chunk.
        stride: Number of words between the starts of consecutive chunks.

    Raises:
        ValueError: If ``window`` or ``stride`` is not positive, or if a
            transcript contains no words.
    """
    if window <= 0 or stride <= 0:
        raise ValueError('window and stride must be positive')
    # Truthful files first, then deceptive, so a stem present in both
    # directories ends up with the deceptive embedding.
    embedding_map = {}
    embedding_map.update(_embed_transcripts(TRUTHFUL_DIR, truthful, bert, tokenizer, window, stride))
    embedding_map.update(_embed_transcripts(DECEPTIVE_DIR, deceptive, bert, tokenizer, window, stride))
    with open(output_file, 'wb') as f:
        pickle.dump(embedding_map, f)

In [47]:
# Feature set 1: stock bert-base-uncased.
# Hidden states are requested because get_bert_embeddings averages the last
# four entries of outputs[2] for non-DistilBert models.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
bert.eval()  # evaluation mode
save_embeddings(bert, tokenizer, '../../embeddings2/transcript_features_1.pkl')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Feature set 2: BERT fine-tuned for emotion classification.
# Only the `.bert` submodule of the sequence-classification model is kept.
tokenizer = AutoTokenizer.from_pretrained("ncduy/bert-base-cased-finetuned-emotion")
bert_emotion = AutoModelForSequenceClassification.from_pretrained(
    "ncduy/bert-base-cased-finetuned-emotion",
    output_hidden_states=True,
).bert
bert_emotion.eval()  # evaluation mode
save_embeddings(bert_emotion, tokenizer, '../../embeddings2/transcript_features_2.pkl')

In [9]:
# Feature set 3: DistilBERT fine-tuned for emotion classification.
gc.collect()  # run a garbage-collection pass before loading the next model
tokenizer = AutoTokenizer.from_pretrained("transformersbook/distilbert-base-uncased-finetuned-emotion")
# NOTE(review): hidden states are requested here, but the DistilBertModel
# branch of get_bert_embeddings only reads outputs[0].
distil_bert_emotion = AutoModel.from_pretrained(
    "transformersbook/distilbert-base-uncased-finetuned-emotion",
    output_hidden_states=True,
)
distil_bert_emotion.eval()  # evaluation mode
save_embeddings(distil_bert_emotion, tokenizer, '../../embeddings2/transcript_features_3.pkl')

Some weights of the model checkpoint at transformersbook/distilbert-base-uncased-finetuned-emotion were not used when initializing DistilBertModel: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Feature set 4: stock distilbert-base-uncased.
gc.collect()  # run a garbage-collection pass before loading the next model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distil_bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
distil_bert_model.eval()  # evaluation mode
save_embeddings(distil_bert_model, tokenizer, '../../embeddings2/transcript_features_4.pkl')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def _sliding_word_windows(words, max_length):
    """Split ``words`` into space-joined windows of at most ``max_length`` words.

    Window starts are ``int(max_length * 0.8)`` words apart (a 20% overlap),
    but never less than 1 so the loop always advances. Splitting stops as
    soon as a window reaches the end of the text, so no trailing window is
    produced that is entirely contained in the previous one.
    """
    stride = max(1, int(max_length * 0.8))
    windows = []
    start = 0
    while start < len(words):
        windows.append(' '.join(words[start:start + max_length]))
        if start + max_length >= len(words):
            break
        start += stride
    return windows


def _encode_transcripts(directory, filenames, model, max_length):
    """Return ``{file stem: mean window embedding}`` for the files in ``directory``."""
    embedding_map = {}
    for file in filenames:
        path = os.path.join(directory, file)
        with open(path, encoding='utf8') as f:
            words = f.read().split()
        subtexts = _sliding_word_windows(words, max_length)
        if not subtexts:
            # np.mean of an empty list would silently store NaN for this file.
            raise ValueError('Transcript contains no text: ' + path)
        embeddings = [model.encode(subtext) for subtext in subtexts]
        # Key is the file name up to its first dot; value is the window average.
        embedding_map[file.split('.')[0]] = np.mean(embeddings, axis=0)
    return embedding_map


def save_sentence_transformer_embeddings(model, output_file, max_length):
    """Embed every truthful and deceptive transcript with ``model.encode``.

    Each transcript is split on whitespace into overlapping windows of
    ``max_length`` words (20% overlap), each window is passed to
    ``model.encode``, and the window embeddings are averaged with ``np.mean``.
    The resulting ``{file stem: embedding}`` map is pickled to ``output_file``.

    Args:
        model: Object exposing ``encode(text)``.
        output_file: Path of the pickle file to write.
        max_length: Maximum number of words per window (words, not tokens).

    Raises:
        ValueError: If ``max_length`` is smaller than 1, or if a transcript
            contains no words.
    """
    if max_length < 1:
        raise ValueError('max_length must be at least 1')
    # Truthful files first, then deceptive, so a stem present in both
    # directories ends up with the deceptive embedding.
    embedding_map = {}
    embedding_map.update(_encode_transcripts(TRUTHFUL_DIR, truthful, model, max_length))
    embedding_map.update(_encode_transcripts(DECEPTIVE_DIR, deceptive, model, max_length))
    with open(output_file, 'wb') as f:
        pickle.dump(embedding_map, f)

In [13]:
# Feature set 5: all-MiniLM-L6-v2 sentence embeddings, 160-word windows.
# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
miniLM_model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
save_sentence_transformer_embeddings(miniLM_model, '../../embeddings2/transcript_features_5.pkl', 160)

In [14]:
# Feature set 6: all-mpnet-base-v2 sentence embeddings, 70-word windows.
# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
mpnet_model = SentenceTransformer(
    'all-mpnet-base-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
save_sentence_transformer_embeddings(mpnet_model, '../../embeddings2/transcript_features_6.pkl', 70)

In [11]:
# Feature set 7: bert-large-uncased.
# NOTE(review): this rebinds the `bert` and `tokenizer` names used by the
# bert-base cell, so the objects they hold depend on cell execution order.
gc.collect()  # run a garbage-collection pass before loading the next model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert = BertModel.from_pretrained('bert-large-uncased', output_hidden_states=True)
bert.eval()  # evaluation mode
save_embeddings(bert, tokenizer, '../../embeddings2/transcript_features_7.pkl')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
