### BERT for Dialog experiments
- We used Google Colaboratory for free GPU access.

In [1]:
# Root folder of the experiment; every other path in this notebook is built from it.
# NOTE(review): presumably a mounted Google Drive folder on Colab - confirm before running elsewhere.
HOME_PATH = '/content/gdrive/My Drive/txtLAB-2020/bert-dialog/'
import os
# Sanity check: show what is inside the experiment folder.
os.listdir(HOME_PATH)

['data', 'dialog-mys-data-booknlp-output', 'dialog-non-data-booknlp-output']

In [2]:
import csv
import numpy as np
import pandas as pd
from collections import Counter

# Folder that holds the Train/ and Test/ volume listings used by the filename loaders below.
DATA_PATH = HOME_PATH + 'data/'
# BookNLP output folders: one sub-folder per volume, each holding a '<volume>.tokens' file.
FIC_BOOKNLP_PATH = HOME_PATH + 'dialog-mys-data-booknlp-output/'  # fiction (mystery novels)
NON_BOOKNLP_PATH = HOME_PATH + 'dialog-non-data-booknlp-output/'  # non-fiction


def get_passage(df, start, N=500):
    """
    Slice one passage out of a BookNLP token table.

    df    : BookNLP output DataFrame; the 'tokenId', 'originalWord' and 'inQuotation' columns are read.
    start : tokenId of the first word of the passage.
    N     : number of consecutive tokenIds to cover (default 500).

    Returns a tuple (text, n_quoted): the selected words joined by single spaces, and the
    number of selected rows tagged 'I-QUOTE' in 'inQuotation' (a count, not a percentage).
    """
    wanted_ids = list(range(start, start + N))
    window = df.loc[df['tokenId'].isin(wanted_ids)]

    text = ' '.join(window['originalWord'].tolist())
    # Count the rows BookNLP marked as being inside a quotation.
    n_quoted = int((window['inQuotation'] == 'I-QUOTE').sum())
    return text, n_quoted
    
    
def sample_random_text(fname, two_passages=False):
    """
    Returns 500-word passage(s) from the middle of the given volume (used for non-fiction).

    Despite the name, nothing is drawn at random: the first passage always starts at the token
    located 30% of the way into the volume, and the optional second passage starts 500 tokens later.

    fname        : path to a BookNLP '.tokens' file (tab-separated, read without quote handling).
    two_passages : if True, return a list [text1, text2]; otherwise return a single text string.
    """
    N_WORDS = 500
    skip_fraction = 0.3

    tokens = pd.read_csv(fname, delimiter='\t', quoting=csv.QUOTE_NONE)  # no quotechar
    tokens = tokens.dropna(subset=['originalWord']).reset_index(drop=True)
    start_index = int(skip_fraction * tokens.shape[0])

    if not two_passages:
        passage, _ = get_passage(tokens, start_index)
        return passage

    # Two back-to-back passages: the second begins right after the first one's 500-token window.
    return [get_passage(tokens, offset)[0] for offset in (start_index, start_index + N_WORDS)]
    

def sample_texts(fname, dialog=True):
    """
    Cuts the middle of a novel into consecutive 500-word passages and picks two of them.

    The first and last 30% of the volume are skipped; the remaining tokens are split into
    non-overlapping 500-token windows (a trailing partial window is dropped).

    fname  : path to a BookNLP '.tokens' file (tab-separated, read without quote handling).
    dialog : selects what is returned.

    If dialog is False, returns a list with the first (at most two) passages that contain zero
    quoted words. The list may hold fewer than two texts; callers must check its length.

    If dialog is True, returns a tuple (dialog_texts, pct_quoted) for the two passages with the
    most quoted words, ordered from fewer to more quoted words. pct_quoted holds the fraction
    quoted_words/500 of each passage. Returns -1 if either of those two passages has no quoted
    words. When fewer than two windows exist, the returned lists are correspondingly shorter.

    A volume without a single quoted token is valid input: with dialog=False it yields
    zero-dialogue passages, with dialog=True it yields -1. No volume-wide 'I-QUOTE' count is
    looked up, because that lookup fails with KeyError on exactly such volumes.
    """
    N_WORDS = 500; pct = 0.3
    df = pd.read_csv(fname, delimiter='\t', quoting=csv.QUOTE_NONE) # no quotechar
    df.dropna(subset=['originalWord'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    total_words = df.shape[0]
    first = int(pct*total_words)
    last = int(total_words - pct*total_words)

    # One (quoted_word_count, text) entry per window, in reading order.
    # A window is only taken while it ends strictly before `last`.
    windows = []
    for start_index in range(first, last - N_WORDS, N_WORDS):
        text, quoted = get_passage(df, start_index)
        windows.append((quoted, text))

    if not dialog:
        # First two windows (in reading order) without any quoted word.
        return [text for quoted, text in windows if quoted == 0][:2]

    # Stable sort on the count only, so ties keep reading order; the last two are the top two.
    top_two = sorted(windows, key=lambda window: window[0])[-2:]
    if any(quoted == 0 for quoted, _ in top_two):
        return -1 # did not have quoted words

    dialog_texts = [text for _, text in top_two]
    pct_quoted = [quoted/N_WORDS for quoted, _ in top_two]
    return dialog_texts, pct_quoted

    
#### Train-Data ####

def load_train_fnames():
    """
    Lists the volumes available for training.

    Returns a tuple (fiction_fnames, non_fiction_fnames); each entry is the training directory
    path concatenated with a name returned by os.listdir for that directory.
    Prints how many entries each list holds.
    """
    fiction_dir = DATA_PATH + 'Train/NovelEnglish_Mystery/'
    non_fiction_dir = DATA_PATH + 'Train/NonNovel_English_Contemporary_Mixed/'

    fiction_fnames = []
    for entry in os.listdir(fiction_dir):
        fiction_fnames.append(fiction_dir + entry)

    non_fiction_fnames = []
    for entry in os.listdir(non_fiction_dir):
        non_fiction_fnames.append(non_fiction_dir + entry)

    print("Train Fiction fnames:", len(fiction_fnames), "| Train Non-Fiction fnames:", len(non_fiction_fnames))
    return fiction_fnames, non_fiction_fnames


def _as_dialog_pair(ret):
    """Return (texts, fractions) if `ret` carries two with-dialogue passages, else None."""
    if not isinstance(ret, tuple): # sample_texts(dialog=True) returns -1 when dialogue is missing
        return None
    texts, fractions = ret
    if len(texts) < 2 or len(fractions) < 2: # volume too short to yield two passages
        return None
    return texts, fractions


def load_train_data(dial, no_dial, return_ids=True, N_WORDS=500):
    """
    Returns X and Y for training (len=400). Also returns the IDs if return_ids is True.
    Training = 200 fiction passages / 200 non-fiction passages, 2 passages per volume.

    dial       : fraction of the 200 fiction passages that should contain dialogue.
    no_dial    : fraction of the 200 fiction passages that should contain no dialogue.
                 dial + no_dial must be 1 (checked with a small floating-point tolerance).
    return_ids : if True, the array of passage IDs is appended to the returned tuple.
    N_WORDS    : currently unused; passage length is fixed inside sample_texts/get_passage.

    Fiction volumes are visited in the order given by load_train_fnames(). For each volume the
    zero-dialogue quota is served first; if the volume has no two zero-dialogue passages and the
    with-dialogue quota is still open, its two most dialogue-heavy passages are taken instead.
    Volumes whose BookNLP '.tokens' file is missing, or that cannot supply two passages of the
    needed kind, are skipped with a printed message.

    The 200 non-fiction passages are two consecutive mid-volume passages per non-fiction volume.

    Returns (X, Y, pct_quoted_fic[, IDs]) as numpy arrays: texts, labels ("fic"/"non"), the
    quoted-word fraction of every with-dialogue fiction passage, and the IDs of the form
    '<kind><1|2>____<volume name>'.

    Raises AssertionError if the file counts are unexpected, if the fractions do not add up to 1,
    or if the quotas could not be met exactly.
    """
    fiction_fnames, non_fiction_fnames = load_train_fnames()
    assert len(fiction_fnames) > len(non_fiction_fnames) == 100

    # Tolerate floating-point noise: the fractions are typically built as percent/100.
    assert abs(dial + no_dial - 1) < 1e-9

    # Quotas as exact integers, so the integer counters below are never compared against a
    # float product that may not be exactly integral.
    target_with = int(round(dial * 200))
    target_without = int(round(no_dial * 200))

    X, Y, IDs = [], [], [] # corresponding list of texts, labels, and unique IDs

    with_dial, without_dial = 0, 0 # counters
    pct_quoted_fic = [] # keep track of how much "dialog" we have in our dialog data

    for fname in fiction_fnames:
        if with_dial == target_with and without_dial == target_without:
            break

        book = fname.split('/')[-1]
        fname = FIC_BOOKNLP_PATH + book + '/' + book
        tokens_path = fname + '.tokens'
        if not os.path.isfile(tokens_path):
            print(fname, "doesn't exist. Skip!")
            continue

        if without_dial < target_without: # look for passages without-dialog
            try:
                no_dialog = sample_texts(tokens_path, dialog=False)
            except Exception:
                # Any failure while sampling is treated as "no zero-dialogue passages here".
                no_dialog = []

            if len(no_dialog) == 2:
                X.extend(no_dialog)
                without_dial += 2
                IDs.append("ficNoDialog1____" + book)
                IDs.append("ficNoDialog2____" + book)
                Y.extend(["fic", "fic"])
                continue

            if with_dial >= target_with: # with-dialog quota already full: nothing to take
                continue

            # Could not find zero-dialog passages; try for with-dialogue instead.
            pair = _as_dialog_pair(sample_texts(tokens_path, dialog=True))
            if pair is None:
                print("Returned -1. Skip!")
                continue

        elif with_dial < target_with: # look for passages with-dialog
            pair = _as_dialog_pair(sample_texts(tokens_path, dialog=True))
            if pair is None:
                print(fname, "does not have quoted words.. Skip!")
                continue

        else:
            # Neither quota is open (only reachable once a quota was overshot): add nothing, so
            # that X, Y and IDs stay aligned.
            continue

        # Both passages are known to exist before anything is appended, so the lists stay in step.
        texts, fractions = pair
        X.append(texts[0])
        X.append(texts[1])
        pct_quoted_fic.append(fractions[0])
        pct_quoted_fic.append(fractions[1])
        with_dial += 2
        IDs.append("ficWithDialog1____" + book)
        IDs.append("ficWithDialog2____" + book)
        Y.extend(["fic", "fic"])

    for fname in non_fiction_fnames: # two consecutive mid-volume passages per volume
        IDs.append("non1____" + fname.split('/')[-1])
        IDs.append("non2____" + fname.split('/')[-1])
        fname = NON_BOOKNLP_PATH + fname.split('/')[-1] + '/' + fname.split('/')[-1] + '.tokens'
        X.extend(sample_random_text(fname, two_passages=True))
        Y.extend(["non", "non"])

    assert with_dial == target_with == len(pct_quoted_fic)
    assert without_dial == target_without
    assert len(X) == len(Y) == len(IDs) == 400

    if return_ids:
        return np.array(X), np.array(Y), np.array(pct_quoted_fic), np.array(IDs)
    else:
        return np.array(X), np.array(Y), np.array(pct_quoted_fic)
    
    
#### Test-Data ####

def load_test_fnames():
    """
    Lists the volumes available for testing.

    Returns a tuple (fiction_fnames, non_fiction_fnames); each entry is the test directory path
    concatenated with a name returned by os.listdir for that directory.
    Prints how many entries each list holds.
    """
    test_path = DATA_PATH + 'Test/'
    fiction_dir = test_path + 'NovelEnglish_Mystery/'
    non_fiction_dir = test_path + 'NonNovel_English_Contemporary_Mixed/'

    fiction_fnames = []
    for entry in os.listdir(fiction_dir):
        fiction_fnames.append(fiction_dir + entry)

    non_fiction_fnames = []
    for entry in os.listdir(non_fiction_dir):
        non_fiction_fnames.append(non_fiction_dir + entry)

    print("Test Fiction fnames:", len(fiction_fnames), "| Test Non-Fiction fnames:", len(non_fiction_fnames))

    return fiction_fnames, non_fiction_fnames


def load_test_data():
    """
    Returns X and Y for the test set, the quoted-word fractions, and a corresponding list of IDs.
    100 non-fiction passages + 50 fiction passages with-dialog + 50 fiction passages without-dialog.

    Each passage is 500 contiguous words from one volume; one passage is used per volume.

    Fiction volumes are visited in the order given by load_test_fnames(). While fewer than 50
    zero-dialogue passages have been collected, the first zero-dialogue passage of a volume is
    taken; if the volume has none (or the zero-dialogue quota is full), the volume's most
    dialogue-heavy passage is taken instead, as long as fewer than 50 with-dialogue passages have
    been collected. Volumes that cannot contribute (missing '.tokens' file, no usable passage) are
    skipped.

    Returns (X, Y, pct_quoted_fic, IDs) as numpy arrays: texts, labels ("fic"/"non"), the
    quoted-word fraction of each with-dialogue passage, and IDs of the form '<kind>____<volume name>'.

    Raises AssertionError if the file counts are unexpected or the 50/50/100 split is not met.
    """
    fiction_fnames, non_fiction_fnames = load_test_fnames()

    assert len(fiction_fnames) == len(non_fiction_fnames) == 100

    X, Y, IDs = [], [], [] # corresponding list of texts, labels, and unique IDs

    with_dial, without_dial = 0, 0
    pct_quoted_fic = []

    for fname in fiction_fnames:
        book = fname.split('/')[-1]
        fname = FIC_BOOKNLP_PATH + book + '/' + book
        tokens_path = fname + '.tokens'
        if not os.path.isfile(tokens_path): # same skip rule as the training loader
            print(fname, "doesn't exist. Skip!")
            continue

        passage, id_prefix = None, None

        if without_dial < 50:
            try:
                no_dialog = sample_texts(tokens_path, dialog=False)
            except Exception:
                # Any failure while sampling is treated as "no zero-dialogue passages here".
                no_dialog = []
            if no_dialog:
                passage = no_dialog[0]
                id_prefix = "ficNoDialog____"
                without_dial += 1

        if passage is None: # fall back to (or go straight for) a with-dialogue passage
            if with_dial >= 50: # quota full: taking more would break the 50/50 split
                continue
            ret = sample_texts(tokens_path, dialog=True)
            # -1 means no dialogue was found; a short result means the volume is too small.
            if not isinstance(ret, tuple) or len(ret[0]) < 2 or len(ret[1]) < 2:
                continue
            passage = ret[0][1] # index 1 is the passage with the most quoted words
            pct_quoted_fic.append(ret[1][1])
            id_prefix = "ficWithDialog____"
            with_dial += 1

        X.append(passage)
        IDs.append(id_prefix + book)
        Y.append("fic")
        if with_dial == without_dial == 50:
            break

    for fname in non_fiction_fnames: # one mid-volume passage per volume
        IDs.append("non____" + fname.split('/')[-1])
        fname = NON_BOOKNLP_PATH + fname.split('/')[-1] + '/' + fname.split('/')[-1] + '.tokens'
        X.append(sample_random_text(fname))
        Y.append("non")

    assert with_dial == without_dial == len(pct_quoted_fic) == 50
    assert len(X) == len(Y) == len(IDs) == 200
    return np.array(X), np.array(Y), np.array(pct_quoted_fic), np.array(IDs)

In [3]:
# Inspired by: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
import torch
import random, time, datetime
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

def format_time(elapsed):
    '''
    Formats a duration given in seconds as h:mm:ss.

    The value is rounded to whole seconds first; the string is whatever str() yields for the
    matching datetime.timedelta (so durations of a day or more get a "N day(s), " prefix).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def prepare_dataloader(texts, labels, IDs=None, batch_size=8, max_length=512):
    """
    Tokenizes the texts with the BERT tokenizer and wraps everything in a DataLoader.

    texts      : iterable of passages (strings) to encode.
    labels     : integer labels, one per text.
    IDs        : optional numeric identifiers, one per text (list, array or similar). When omitted,
                 None or empty, a training dataset (input_ids, attention_masks, labels) is built;
                 otherwise a test dataset (input_ids, attention_masks, labels, IDs) is built.
    batch_size : batch size of the returned DataLoader.
    max_length : length (in tokens) every text is padded or truncated to.

    For train_dataloader, labels are passed. For test_dataloader, both labels and IDs are passed.
    BERT tokenizer is used to
      (1) Tokenize the sentence.
      (2) Prepend the `[CLS]` token to the start.
      (3) Append the `[SEP]` token to the end.
      (4) Map tokens to their IDs.
      (5) Pad or truncate the sentence to `max_length`
      (6) Create attention masks for [PAD] tokens.
    Authors recommend a batch size of 16/32 for fine-tuning.

    Returns a DataLoader that draws batches through a RandomSampler (for test data too, which is
    why the IDs travel with each example).
    Raises AssertionError if a test dataset does not hold exactly 200 examples.
    """
    input_ids = []; attention_masks = []

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    for sent in texts:
        encoded_dict = tokenizer.encode_plus(sent, # sentence to encode
                                             add_special_tokens=True, # add '[CLS]' and '[SEP]'
                                             truncation=True,
                                             max_length=max_length, # honour the caller's limit
                                             padding='max_length', # replaces deprecated pad_to_max_length=True
                                             return_attention_mask=True, # construct attention masks
                                             return_tensors='pt') # return pytorch tensors

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask']) # simply differentiates padding from non-padding

    # Convert to tensors:
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Decide by presence/length rather than `IDs == []`, which is ambiguous for NumPy arrays.
    has_ids = IDs is not None and len(IDs) > 0

    if not has_ids: # for training data
        dataset = TensorDataset(input_ids, attention_masks, labels)
        print("Dataset has input_ids, attention_masks, labels | Length:", len(dataset))

    else: # for test data
        IDs = torch.tensor(IDs)
        print("Dataset has input_ids, attention_masks, labels, and IDs")
        dataset = TensorDataset(input_ids, attention_masks, labels, IDs)
        assert len(dataset) == 200 # sanity check for this experiment's fixed test-set size

    data_loader = DataLoader(dataset,
                             sampler=RandomSampler(dataset),
                             batch_size=batch_size)

    print("Input IDs:", input_ids.shape)
    print("Dataset size:", len(dataset))
    return data_loader


def train(data_loader, epochs=3):
    """
    Given the data_loader, it fine-tunes BERT ('bert-base-uncased', 2 labels) for the task.
    The BERT authors recommend between 2 and 4 training epochs.

    data_loader : DataLoader whose batches hold (input_ids, attention_mask, labels) in the first
                  three positions.
    epochs      : number of passes over data_loader.

    The model is placed on the module-level `device`, the same one the batches are moved to, so
    training also runs when no GPU is available. Prints the average loss and duration per epoch.

    Returns fine-tuned BERT model.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device) # follow the selected device instead of forcing CUDA
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    total_steps = len(data_loader) * epochs # total number of training steps is [number of batches] x [number of epochs]
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    total_t0 = time.time() # keep track of time

    for epoch_i in range(0, epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
        t0 = time.time()
        total_train_loss = 0 # reset the total loss for this epoch
        model.train() # put the model into training mode

        for batch in data_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad() # clears any previously calculated gradients before performing a backward pass

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # With labels supplied the loss is the first output. Indexing (as predict() does)
            # works whether the model returns a plain tuple or a ModelOutput object.
            loss = outputs[0]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip the norm of the gradients to 1.0 to help prevent the "exploding gradients" problem
            optimizer.step() # update parameters and take a step using the computed gradient
            scheduler.step() # update the learning rate

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(data_loader)
        training_time = format_time(time.time() - t0)

        print("\tAverage training loss: {0:.2f}".format(avg_train_loss))
        print("\tTraining epcoh took: {:}".format(training_time))
    print("\n\nTraining complete\nTotal training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model


def predict(model, data_loader):
    """
    Runs the fine-tuned model over data_loader and collects the results.

    Every batch must hold exactly four tensors: input_ids, attention_mask, labels and IDs.

    Returns (flat_predictions, prob_fiction, flat_true_labels, flat_IDs), all numpy arrays in the
    order the loader produced the examples:
      - flat_predictions: index (0 or 1) of the larger of the two logits,
      - prob_fiction: softmax probability of label 1 (fiction),
      - flat_true_labels / flat_IDs: the labels and IDs that came with the batches.
    """
    model.eval() # put model in evaluation mode

    logit_chunks, label_chunks, id_chunks = [], [], []

    for batch in data_loader:
        b_input_ids, b_input_mask, b_labels, b_IDs = tuple(t.to(device) for t in batch)

        with torch.no_grad(): # inference only, no gradients needed
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels, the first output holds the logits.
        logit_chunks.append(outputs[0].detach().cpu().numpy())
        label_chunks.append(b_labels.to('cpu').numpy())
        id_chunks.append(b_IDs.to('cpu').numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)

    # Convert logits to probabilities; column order is [0, 1] and 1 is fiction.
    probs = torch.nn.functional.softmax(torch.from_numpy(all_logits), dim=-1)
    prob_fiction = probs[:,1].numpy()

    flat_predictions = np.argmax(all_logits, axis=1).flatten() # pick the one with the highest value
    flat_true_labels = np.concatenate(label_chunks, axis=0)
    flat_IDs = np.concatenate(id_chunks, axis=0)

    return flat_predictions, prob_fiction, flat_true_labels, flat_IDs

# Select the compute device once; train() and predict() move their tensors to it.
if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device: use it and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# with the same value.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
import numpy as np
from sklearn.metrics import f1_score #, precision_score, recall_score, accuracy_score, average_precision_score

def run_bert():
    """
    Runs the whole BERT experiment on the module-level data:
    1) Builds the training loader from X_train / labels_train.
    2) Fine-tunes the BERT model.
    3) Builds the test loader from X_test / labels_test / testIDs_idx and predicts on it.
    4) Prints predictions, true labels, IDs and the F1 score (positive label 1), and hands
       the predictions to write_predictions().

    Nothing is returned.
    """
    # DataLoader for training:
    train_loader = prepare_dataloader(texts=X_train, labels=labels_train)

    print("Beginning training now..")
    # Train/fine-tune:
    fine_tuned_model = train(train_loader)

    # Predict on test set:
    test_loader = prepare_dataloader(texts=X_test, labels=labels_test, IDs=testIDs_idx)
    predictions, prob_fiction, true_labels, IDs_idx = predict(fine_tuned_model, test_loader)

    report = "Predictions: {}\n\nLabels:{}\n\nIDs_idx:{}".format(predictions, true_labels, IDs_idx)
    print(report)
    f1 = f1_score(true_labels, predictions, pos_label=1)
    print("\n\n\n\nF1=", f1)

    write_predictions(IDs_idx, prob_fiction, predictions)



def write_predictions(IDs_idx, prob_fiction, predictions):
    """
    Saves the test-set predictions as a tab-separated file at the module-level preds_path.

    IDs_idx      : positions into the module-level test_IDs list, one per example.
    prob_fiction : probability of the fiction label, one per example.
    predictions  : predicted label (0 or 1), one per example.

    One row per example: ID, probability, and 'fic' when the probability is >= 0.5, else 'non'.
    After each row is written, an assertion checks that the written label agrees with the
    predicted label.
    """
    # Save predictions:
    print("Write predictions to:", preds_path)

    with open(preds_path, 'w') as out_file:
        out_file.write('fname\tprobability_fiction\tprediction\n')
        for row in zip(IDs_idx, prob_fiction, predictions):
            index, prob, pred = row
            ID = test_IDs[int(index)] # indexes may arrive as floats

            is_fiction = prob >= 0.5
            label_suffix = '\tfic\n' if is_fiction else '\tnon\n'
            out_file.write(ID+'\t'+str(prob)+label_suffix)
            assert pred == (1 if is_fiction else 0)


def labels_str_to_int(Y):
    """
    Converts string labels to integers (fiction 'fic': 1 | non-fiction 'non': 0).

    Any other label is reported with a printed "Error:" line and left out of the result,
    so the returned list can be shorter than the input.
    """
    known_codes = (('fic', 1), ('non', 0))

    labels = []
    for raw_label in Y:
        code = next((value for name, value in known_codes if raw_label == name), None)
        if code is None:
            print("Error:", raw_label)
            continue
        labels.append(code)
    return labels

In [15]:
# Experiment driver: builds the train/test sets for one dialogue setting and runs BERT on them.
# The names defined here (preds_path, X_train, labels_train, X_test, labels_test, test_IDs,
# testIDs_idx) are read as globals by run_bert() and write_predictions().

# Percentage of the fiction training passages that should contain dialogue.
DIALOG = 0

nondial = 100 - DIALOG # remaining percentage: fiction training passages without dialogue
preds_path = HOME_PATH + 'BERT_predictions_dial_'+str(DIALOG)+'.tsv'
print("Write predictions to:", preds_path)


print("Running BERT for dialog: {}% and non-dialog: {}%".format(DIALOG, nondial))
# Train data: load_train_data takes fractions, hence the division by 100.
X_train, Y_train, pct_quoted_fic, train_IDs = load_train_data(dial=DIALOG/100, no_dial=nondial/100)
t = [i.split('____')[0] for i in train_IDs] # ID prefix = passage kind, e.g. 'ficNoDialog1'
print("X_train: {} | Y_train: {} | Y Distribution: {} | Dialog Dist: {}".format(len(X_train), len(Y_train), Counter(Y_train), Counter(t)))
assert len(X_train) == len(Y_train) == 400

X_train = X_train.tolist(); Y_train = Y_train.tolist() # convert to list
labels_train = labels_str_to_int(Y_train) # convert labels to integers

# Test data:
X_test, Y_test, pct_quoted_fic_test, test_IDs = load_test_data()
t = [i.split('____')[0] for i in test_IDs] # ID prefix = passage kind, e.g. 'ficWithDialog'
print("Test Set ---- X: {} | Y: {} | Distribution: {} | Dialog dist in test: {} | Test IDs: {}, preview: {}".format(len(X_test), len(Y_test), Counter(Y_test), Counter(t), len(test_IDs), test_IDs[:3]))
assert len(X_test) == len(Y_test) == 200

X_test = X_test.tolist(); Y_test = Y_test.tolist(); test_IDs = test_IDs.tolist() # convert to list
labels_test = labels_str_to_int(Y_test) # convert labels to integers
# endpoint=False gives len(test_IDs) evenly spaced floats starting at 0 with step 1 (0., 1., 2., ...).
testIDs_idx = np.linspace(0, len(test_IDs), len(test_IDs), False) # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
print("testIDs indexes:", len(testIDs_idx))

# Fine-tune, predict on the test set, and write the predictions to preds_path.
run_bert()

Write predictions to: /content/gdrive/My Drive/txtLAB-2020/bert-dialog/BERT_predictions_dial_0.tsv
Running BERT for dialog: 0% and non-dialog: 100%
Train Fiction fnames: 135 | Train Non-Fiction fnames: 100
X_train: 400 | Y_train: 400 | Y Distribution: Counter({'fic': 200, 'non': 200}) | Dialog Dist: Counter({'ficNoDialog1': 100, 'ficNoDialog2': 100, 'non1': 100, 'non2': 100})
Test Fiction fnames: 100 | Test Non-Fiction fnames: 100
Test Set ---- X: 200 | Y: 200 | Distribution: Counter({'fic': 100, 'non': 100}) | Dialog dist in test: Counter({'non': 100, 'ficNoDialog': 50, 'ficWithDialog': 50}) | Test IDs: 200, preview: ['ficNoDialog____2012_Shapiro,BA_TheArtForger_MY.txt'
 'ficNoDialog____2011_Aaronovitch,Ben_MidnightRiot_MY.txt'
 'ficNoDialog____2013_Crouch,Blake_Wayward_MY.txt']
testIDs indexes: 200




Dataset has input_ids, attention_masks, labels | Length: 400
Input IDs: torch.Size([400, 512])
Dataset size: 400
Beginning training now..


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

	Average training loss: 0.39
	Training epcoh took: 0:00:47
	Average training loss: 0.11
	Training epcoh took: 0:00:44
	Average training loss: 0.03
	Training epcoh took: 0:00:45


Training complete
Total training took 0:02:15 (h:mm:ss)




Dataset has input_ids, attention_masks, labels, and IDs
Input IDs: torch.Size([200, 512])
Dataset size: 200
Predictions: [0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 1 0 1
 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0]

Labels:[1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0
 0 1 1 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1
 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 1
 1 1 0 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0
 0 0 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0
 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0]

IDs_idx:[ 65.  80.  16.  50.  72.  53. 174. 127.  47. 14