In [2]:
import pandas as pd
import pickle
import os
from tqdm import tqdm
from transformers import BertTokenizer
import torch

# --- Tokenizer ---------------------------------------------------------------
# NOTE(review): the recorded output of this cell warns that the checkpoint's
# tokenizer class is 'DistilBertTokenizer' while it is loaded here through
# 'BertTokenizer' -- confirm the resulting tokenization is the intended one.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'medicalai/ClinicalBERT',
    do_lower_case=True,
)

# --- Test split --------------------------------------------------------------
# The first CSV column is used as the DataFrame index.
df_test = pd.read_csv(
    "~/med264/Dataset1/day1_30mortality_test.csv",
    index_col=0,
)

# One row per test sentence, so the row count is the sentence count.
print('Number of testing sentences: {:,}\n'.format(len(df_test)))

# Pull the raw text and its label out as parallel arrays.
sentences_test = df_test["TEXT"].values
labels_test = df_test["Label"].values

# Tokenize every test sentence into token ids. The result is cached on disk as
# a pickle so the tokenization loop only has to run once.
file_path_test = os.path.expanduser('~/med264/Dataset1/input_ids_test.pickle')

input_ids_test = None

if os.path.exists(file_path_test):
    # NOTE(security): pickle.load can execute arbitrary code -- only load a
    # cache file that this notebook wrote itself.
    with open(file_path_test, 'rb') as f:
        input_ids_test = pickle.load(f)
    if len(input_ids_test) == len(sentences_test):
        print('Loaded input_ids_test.')
    else:
        # A cache built from a different CSV would silently misalign inputs
        # and labels, so discard it and tokenize again.
        print('Cached input_ids_test has {:,} entries but the dataset has {:,}; '
              're-tokenizing.'.format(len(input_ids_test), len(sentences_test)))
        input_ids_test = None

if input_ids_test is None:
    # `encode` tokenizes the sentence, adds the special tokens and maps the
    # tokens to their ids.
    input_ids_test = [
        tokenizer.encode(sent, add_special_tokens=True)
        for sent in tqdm(sentences_test)
    ]
    with open(file_path_test, 'wb') as f:
        pickle.dump(input_ids_test, f)
    print('Saved input_ids_test.')

# `default=0` keeps this report from raising on an empty dataset.
print('Max test sentence length: ', max((len(sen) for sen in input_ids_test), default=0))

# Pad or truncate every tokenized sentence to one fixed length so the whole
# split fits in a single rectangular array.
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# Sentences longer than this are truncated (the recorded run reports a longest
# test sentence of 890 tokens); shorter ones are padded.
# NOTE(review): 512 is presumably chosen to match the model's maximum input
# length -- confirm against the model configuration.
MAX_LEN = 512

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad with the tokenizer's own pad id instead of a hard-coded 0 so the padding
# value always agrees with the pad token reported above.
# NOTE(review): truncating="post" cuts from the end, so any sequence longer
# than MAX_LEN loses its trailing tokens, including whatever special token
# `encode` appended last -- confirm this matches how the training split was
# prepared before changing it.
input_ids_test = pad_sequences(input_ids_test, maxlen=MAX_LEN, dtype="long",
                               value=tokenizer.pad_token_id,
                               truncating="post", padding="post")

print('\nDone.')

# Build the attention mask: 1 where a position holds a real token, 0 where it
# holds padding. `input_ids_test` is the padded 2-D array returned by
# `pad_sequences`, so a single vectorized comparison against the tokenizer's
# pad id replaces a per-token Python loop. The result is an int64 array with
# the same shape as `input_ids_test`.
attention_masks_test = (input_ids_test != tokenizer.pad_token_id).astype("int64")

test_inputs, test_labels, test_masks = input_ids_test, labels_test, attention_masks_test

# Convert all inputs and labels into torch tensors, the required datatype
# for our model.
test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_masks)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Loading BERT tokenizer...
Number of testing sentences: 14,935

Loaded input_ids_test.
Max test sentence length:  890

Padding/truncating all sentences to 512 values...

Padding token: "[PAD]", ID: 0

Done.


In [3]:
# Persist the three test tensors (token ids, labels, attention masks) so they
# can be reloaded later without re-running the preprocessing.
for _tensor, _target in (
    (test_inputs, '~/med264/Dataset1/test_inputs.pt'),
    (test_labels, '~/med264/Dataset1/test_labels.pt'),
    (test_masks, '~/med264/Dataset1/test_masks.pt'),
):
    torch.save(_tensor, os.path.expanduser(_target))