In [6]:
import pickle
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [13]:
# Locations of the raw transcript files, one directory per class.
DECEPTIVE_DIR = '../../data/Transcription/Deceptive/'
TRUTHFUL_DIR = '../../data/Transcription/Truthful/'

# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# makes every later iteration over these lists reproducible across runs.
deceptive = sorted(os.listdir(DECEPTIVE_DIR))
truthful = sorted(os.listdir(TRUTHFUL_DIR))

# Quick sanity check on class balance.
print('Deceptive size:', len(deceptive))
print('Truthful size:', len(truthful))

Deceptive size: 61
Truthful size: 60


In [37]:
# Single source of truth for the checkpoint so the model and the tokenizer
# cannot drift apart.
bert_checkpoint = 'bert-base-uncased'

# output_hidden_states=True is requested because get_bert_embeddings reads the
# hidden states from the model output. eval() returns the module itself, so it
# can be chained; it puts the model in inference mode (dropout disabled).
model = BertModel.from_pretrained(bert_checkpoint, output_hidden_states=True).eval()
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
def prepare_text(text, tokenizer, max_length=512):
    """Tokenize ``text`` and build the tensors fed to the BERT model.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized, mapped to
    vocabulary ids, and paired with an all-ones tensor of the same length.

    Args:
        text: Raw input string.
        tokenizer: Object exposing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
        max_length: Maximum number of tokens (markers included) to keep.
            Longer inputs are truncated, always preserving the final token
            (``[SEP]``). Defaults to 512, the position limit of
            ``bert-base-uncased``. Pass ``None`` to disable truncation.

    Returns:
        A tuple ``(tokenized_text, tokens_tensor, segments_tensors)`` where
        ``tokenized_text`` is the list of token strings and both tensors have
        shape ``(1, len(tokenized_text))``.

    Raises:
        ValueError: If ``max_length`` is smaller than 2 (there would be no
            room for both special markers).
    """
    if max_length is not None and max_length < 2:
        raise ValueError("max_length must be at least 2 to hold [CLS] and [SEP]")

    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Without this guard an over-long transcript fails inside the model, which
    # has a fixed number of position embeddings. Keep the head of the text and
    # re-attach the closing marker.
    if max_length is not None and len(tokenized_text) > max_length:
        tokenized_text = tokenized_text[:max_length - 1] + tokenized_text[-1:]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # One id per token, all set to 1 (single-sentence input).
    segments_ids = [1] * len(indexed_tokens)

    # Add a leading batch dimension of size 1.
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensors

In [56]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """Mean-pool the second-to-last hidden layer into a single vector.

    Args:
        tokens_tensor: Token ids with a leading batch dimension of 1, as
            produced by ``prepare_text``.
        segments_tensor: All-ones tensor of the same shape, as produced by
            ``prepare_text``.
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments whose output exposes the per-layer hidden states at
            index 2 (e.g. ``BertModel`` loaded with
            ``output_hidden_states=True``).

    Returns:
        A 1-D tensor: the average over tokens of the second-to-last layer's
        hidden states for the first (only) item in the batch.
    """
    with torch.no_grad():
        # The original positional call ``model(tokens_tensor, segments_tensor)``
        # bound the second argument to ``attention_mask`` (that is the second
        # positional parameter of transformers' BertModel.forward), not to
        # ``token_type_ids``. Naming the arguments makes that binding explicit
        # and keeps the numerical output identical.
        # NOTE(review): if segment ids of 1 were really intended, pass
        # ``token_type_ids=segments_tensor`` instead - this changes the
        # resulting embeddings.
        outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensor)
        hidden_states = outputs[2]
    # [-2] -> second-to-last layer, [0] -> first item of the batch.
    token_embeddings = hidden_states[-2][0]
    # Average across the token axis to get one fixed-size vector per text.
    return torch.mean(token_embeddings, dim=0)

In [59]:
# Maps file stem (text before the first '.') -> mean-pooled BERT embedding.
embedding_map = {}

# Both classes go through the same pipeline; only the source directory differs.
for directory, filenames in ((TRUTHFUL_DIR, truthful), (DECEPTIVE_DIR, deceptive)):
    for file in filenames:
        # Read and close the file before running the (slow) forward pass.
        with open(directory + file, encoding='utf8') as f:
            text = f.read()
        tokenized_text, tokens_tensor, segments_tensor = prepare_text(text, tokenizer)
        embeddings = get_bert_embeddings(tokens_tensor, segments_tensor, model)
        # NOTE(review): a truthful and a deceptive file sharing the same stem
        # would silently overwrite each other here - confirm stems are unique
        # across both directories.
        embedding_map[file.split('.')[0]] = embeddings

EMBEDDINGS_PATH = '../../embeddings/transcript_features_1.pkl'
# Create the output directory if needed so a fresh checkout does not fail
# after all embeddings have already been computed.
os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)
with open(EMBEDDINGS_PATH, 'wb') as f:
    pickle.dump(embedding_map, f)