### BERT for Genre experiments (with and without augmentation)
- We used Google Colaboratory for free GPU access.

In [None]:
# Root folder of the project; every other path in this notebook is built from it.
# NOTE(review): looks like a Google Drive folder mounted in Colab - confirm the drive is mounted before running.
HOME_PATH = '/content/gdrive/My Drive/txtLAB-2020/bert-run/'
import os
# List the folder contents as a quick check that the path is reachable.
os.listdir(HOME_PATH)

['train_fnames_scenario_dict_exp_1.pickle',
 'data',
 'stanford-ner-4.0.0',
 'train_fnames_scenario_dict_exp_2.pickle',
 'train_fnames_scenario_dict_exp_3.pickle',
 'train_fnames_scenario_dict_exp_4.pickle',
 'train_fnames_scenario_dict_exp_5.pickle',
 'train_fnames_scenario_dict_exp_6.pickle',
 'words-500',
 'words-1000',
 'words-2000',
 'words-5000',
 'words-10000']

In [None]:
import os
import numpy as np
import random; random.seed(41)
from nltk.tokenize import word_tokenize
import pickle

# Folder holding the txt volumes; train fnames are re-rooted here and the test set is read from below it.
DATA_PATH = HOME_PATH + 'data/'


def get_passage(fname, N, two_passages=False):
    """
    Returns a (continuous) passage of at most N words from the given txt/fname.
    If 'two_passages' is True, it returns two consecutive passages (list) instead of one.

    The first 20% of the tokens is always skipped. The passages are NOT clipped at the
    start of the last 20%: when the text is too short they simply run into it (and, in
    the two-passage case, a warning is printed). Each passage is never longer than N
    words, but it can be shorter (or empty) when the text runs out.

    Parameters:
        fname: path of the txt file to read.
        N: maximum number of words per passage.
        two_passages: if True, return [passage1, passage2]; otherwise a single string.

    Returns:
        A string of space-joined tokens, or a list of two such strings.
    """
    with open(fname, 'r') as f:
        text = f.read()

    all_words = word_tokenize(text)
    start = int(0.2*len(all_words))
    end = int(len(all_words) - 0.2*len(all_words))

    if not two_passages:
        return ' '.join(all_words[start:start+N])

    if start+N+N > end:
        print("Not enough words.. using all the ones avaialable. Total words: {} | Start: {} | End: {}".format(len(all_words), start, end))

    # Slicing clamps at the end of the list, so both passages hold at most N words
    # each; previously the second passage of a short text was unbounded
    # (all_words[start+N:]) and could be much longer than N.
    words1 = all_words[start:start+N]
    words2 = all_words[start+N:start+N+N]
    return [' '.join(words1), ' '.join(words2)]


######## Train Set ########
def load_train_data(scenario, return_ids=False):
    """
    Builds the training set for the given scenario.

    Every non-fiction volume contributes two passages. Fiction volumes contribute one
    passage each, except that when fewer than 200 fiction fnames are available (and the
    scenario is not 'A') the first (200 - count) fiction volumes contribute two passages.

    Returns (X, Y) as numpy arrays of texts and labels ("fic"/"non"); if 'return_ids' is
    True, a third array of unique IDs (file stem, with '__1'/'__2' for paired passages)
    is returned as well.
    """
    # The pickled fnames carry the path of the machine they were created on;
    # it is swapped for DATA_PATH before reading.
    local_prefix = "/Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for-Literary-Data/data/"

    fiction_fnames = TRAIN_FNAMES[scenario]['fiction_fnames']
    non_fiction_fnames = TRAIN_FNAMES[scenario]['non_fiction_fnames']
    n_fiction = len(fiction_fnames)

    if n_fiction == 200 or scenario == 'A': # scenario A has 201 (67+67+67) fnames
        print("We have exactly {} fiction fnames.".format(n_fiction))
        N_two_fic = 0
    else:
        N_two_fic = 200 - n_fiction
        print("We have {} fiction fnames".format(n_fiction))

    assert len(non_fiction_fnames) == 100

    print("Intersection between fic and nonfic fnames:", set(fiction_fnames).intersection(set(non_fiction_fnames)))

    X, Y, IDs = [], [], [] # texts, labels, unique IDs (kept aligned)

    def add_pair(fname, label):
        # Two passages from one volume; IDs get a '__1' / '__2' suffix.
        path = fname.replace(local_prefix, DATA_PATH)
        X.extend(get_passage(path, N_WORDS, two_passages=True))
        Y.extend([label, label])
        stem = path.split('/')[-1][:-4]
        IDs.extend([stem+'__1', stem+'__2'])

    def add_single(fname, label):
        # One passage from one volume; the ID is the file stem.
        path = fname.replace(local_prefix, DATA_PATH)
        X.append(get_passage(path, N_WORDS))
        Y.append(label)
        IDs.append(path.split('/')[-1][:-4])

    if N_two_fic != 0:
        print("Getting 2 passages from {} files".format(N_two_fic))
        for fname in fiction_fnames[:N_two_fic]:
            print("Get 2 passages from:", fname)
            add_pair(fname, "fic")

    print("X: {} | Y: {}".format(len(X), len(Y)))

    for fname in fiction_fnames[N_two_fic:]:
        add_single(fname, "fic")

    for fname in non_fiction_fnames: # two "passages" per txt
        add_pair(fname, "non")

    if not return_ids:
        return np.array(X), np.array(Y)
    return np.array(X), np.array(Y), np.array(IDs)


######## Test Set ########
def load_test_fnames():
    """
    Returns (fiction_fnames, non_fiction_fnames): the file paths used as test-data.

    Fiction paths come from the Mystery, SciFi and Romance test folders and are shuffled;
    non-fiction paths come from the NonNovel test folder.
    Expected size for all cases: 198 docs (99 "Non" & 99 fiction: 33 "Mys" + 33 "Rom" + 33 "SciFi").
    """
    test_path = DATA_PATH + '/Test-Set/'

    def paths_in(subdir):
        # Full paths of every entry inside one test-set sub-folder.
        folder = test_path + subdir
        return [folder + fname for fname in os.listdir(folder)]

    mys = paths_in('Mystery_TestSet/')
    rom = paths_in('Romance_TestSet/')
    sci = paths_in('SciFi_TestSet/')

    fiction_fnames = mys + sci + rom
    random.shuffle(fiction_fnames)

    non_fiction_fnames = paths_in('NonNovel_TestSet/')
    print("Test Fiction fnames:", len(fiction_fnames), "| Test Non-Fiction fnames:", len(non_fiction_fnames))
    return fiction_fnames, non_fiction_fnames


def load_test_data():
    """
    Builds the test set: one passage of N_WORDS words per test file.

    Returns (X, Y, IDs) as numpy arrays: texts, labels ("fic"/"non") and the
    corresponding file names (fiction first, then non-fiction).
    """
    fiction_fnames, non_fiction_fnames = load_test_fnames()

    X, Y, IDs = [], [], [] # texts, labels, unique IDs (kept aligned)

    for label, fnames in (("fic", fiction_fnames), ("non", non_fiction_fnames)):
        for fname in fnames:
            IDs.append(fname.split('/')[-1]) # file name (with extension) is the ID
            X.append(get_passage(fname, N_WORDS))
            Y.append(label)

    return np.array(X), np.array(Y), np.array(IDs)

In [None]:
# ######## Train Set with Augmentation ########
# def load_train_data_with_EDA(scenario, N_aug=16):
#     """
#     Returns X and Y for training, given the scenario. Data is augmented 16 folds using one of the four EDA techniques at random.
#     """
#     print("Generate {} new instances per instance using EDA".format(N_aug))
#     X, Y = load_train_data(scenario)

#     augmented_X = X.tolist()
#     augmented_Y = Y.tolist()

#     operations = [synonym_replacement, random_insertion, random_swap, random_deletion]
#     for instance, label in zip(X, Y):
#         print("X so far:", len(augmented_X), "| Y so far:", len(augmented_Y))#, "| Y:", augmented_Y)
#         for _ in range(N_aug):
#             operation = random.choice(operations)
#             new_text = operation(instance)
#             augmented_X.append(new_text)
#             augmented_Y.append(label)

#     return np.array(augmented_X), np.array(augmented_Y)


# # Easy Data Augmentation (EDA) techniques from https://www.aclweb.org/anthology/D19-1670.pdf
# # As recommended in Table 3, we use alpha=0.05 (for RD, p=alpha); n = 25 | generate 16 instances per training instance
# # Note that in order to generate a new instance, randomly choose and perform one of the four EDA operations

# import nlpaug.augmenter.word as naw
# import nlpaug.model.word_dict as nmw
# import random; random.seed(41)
# import nltk
# from nltk.corpus import stopwords

# english_stopwords = stopwords.words('english')
# ALPHA = 0.05
# N = int(ALPHA*500)
# print("EDA Parameters: N = {} | alpha = {}".format(N, ALPHA))

# # SR:
# def synonym_replacement(text, n=N):
#     """
#     Randomly choose n words from the sentence that are not stop words. Replace each of these words with one of its
#     synonyms chosen at random.
#     """
#     aug = naw.SynonymAug(aug_src='wordnet', aug_min=n, aug_max=n, stopwords=english_stopwords)
#     augmented_text = aug.augment(text)
#     return augmented_text

# # ------------------- #

# # RS:
# def random_swap(text):
#     """
#     Performs random swap N times.
#     """
#     for i in range(N):
#         text = random_swap_helper(text)
#         # print("After run {}, text is {}".format(i+1, text))
#     return text

# def random_swap_helper(text):
#     """
#     Randomly choose two words in the sentence and swap their positions.
#     """
#     aug = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1)
#     augmented_text = aug.augment(text)
#     return augmented_text

# # ------------------- #

# # RD:
# def random_deletion(text, p=ALPHA):
#     """
#     Randomly remove each word in the sentence with probability p=0.05
#     """
#     aug = naw.RandomWordAug(action='delete', aug_p=p)
#     augmented_text = aug.augment(text)
#     return augmented_text

# # ------------------- #

# # RI:
# def random_insertion(text):
#     """
#     Performs random insertion N times.
#     """
#     for i in range(N):
#         text = random_insertion_helper(text)
#         # print("After run {}, text is: {}".format(i+1, text))
#     return text

# def random_insertion_helper(text):
#     """
#     Find a random synonym of a random word in the sentence that is not a stop word.
#     Insert that synonym into a random position in the sentence.
#     """
#     original_words = nltk.word_tokenize(text)

#     # pick a random word and get its synonyms:
#     candidate_syns, candidate_word = get_random_words_synonyms(original_words)

#     # pick a random synonym:
#     final_synonym = random.choice(candidate_syns)

#     # insert at a random position:
#     rand_index = random.randint(0, len(original_words)-1)
#     original_words.insert(rand_index, final_synonym)

#     # print("Original word:", candidate_word)
#     # print("Final synonym:", final_synonym)

#     return ' '.join(original_words)

# def get_random_words_synonyms(original_words):
#     """
#     Helper for RI: picks a random word in 'original_words' which is not a stopword. Returns a list of its synonyms (and the word).
#     """
#     model = nmw.WordNet(lang='eng', is_synonym=True)
#     filtered_words = [w for w in original_words if w not in english_stopwords] # remove stopwords
#     while True:
#         candidate_word = random.choice(filtered_words)
#         # print("Candidate:", candidate_word)
#         candidate_syns = model.predict(candidate_word)
#         # print("Before:", candidate_syns)
#         if candidate_word in candidate_syns: # remove all occurrences of candidate_word in candidate_syns
#             candidate_syns = list(filter(lambda a: a != candidate_word, candidate_syns))
#             # print("After:", candidate_syns)
#         if candidate_syns: # return, if not empty
#             return candidate_syns, candidate_word

In [None]:
# # Our Custom Data Augmentation (CDA) technique involves (1) Back-Translation, (2) Crossover, (3) Substituting & Deleting Proper Names | generate 16 instances per training instance

# from nltk.tag import StanfordNERTagger
# from nltk.tokenize import word_tokenize
# import random; random.seed(41)


# # Using the first-names "all" version & surnames "us" version from https://github.com/smashew/NameDatabases
# with open(HOME_PATH+'data/names/first_names_all.txt', 'r', errors='ignore', encoding='utf8') as r:
#     FIRST_NAMES = set(r.read().strip().split('\n'))
    
# with open(HOME_PATH+'data/names/surnames_us.txt', 'r', errors='ignore', encoding='utf8') as r:
#     LAST_NAMES = set(r.read().strip().split('\n'))
    
# print("We have unique {} first names and {} last names".format(len(FIRST_NAMES), len(LAST_NAMES)))


# TAGGER = StanfordNERTagger(HOME_PATH+'stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz',
#                            HOME_PATH+'stanford-ner-4.0.0/stanford-ner-4.0.0.jar',
#                            encoding='utf-8')


# def proper_name_present(text):
#     """
#     Returns -1 if proper name is not present in the given text.
#     If present, returns the NER tagged text.
#     """    
#     classified_text = TAGGER.tag(word_tokenize(text)) # NER Tagging

#     for word, tag in classified_text:
#         if tag == 'PERSON':
#             return classified_text
        
#     # if no tag in the loop is 'PERSON'
#     return -1


# def proper_names(classified_text, action):
#     """
#     Given the NER-classified text, we can perform two actions: 'delete' or 'substitute'.
#     - deletes all proper names
#     - substitutes all proper names with random names from https://github.com/smashew/NameDatabases
#     """
#     augmented_text = ""
    
#     for i, tup in enumerate(classified_text):
#         word, tag = tup
#         surname = False; first_name = False
#         if tag == 'PERSON': # for substitute (need to figure out first/surname)
#             if action == 'delete':
#                 continue
            
#             elif action == 'substitute':
#                 if classified_text[i-1][1] == 'PERSON':
#                     surname = True
#                 else:
#                     first_name = True

#                 if first_name: # randomly substitute one
#                     augmented_text += " " + random.sample(FIRST_NAMES, 1)[0]
#                 elif surname:
#                     augmented_text += " " + random.sample(LAST_NAMES, 1)[0]

#         else:
#             augmented_text += " " + word
    
#     return augmented_text


# def back_translation(fname):
#     """
#     Given the fname, this function returns 4 back-translated passages (French, German, Korean, Spanish).
    
#     See Back-Translation notebook for the translation details.
#     """
#     path = HOME_PATH+'data/back-translated/'
#     languages = ['fr', 'ko', 'de', 'es'] # French, Korean, German, Spanish
    
#     texts = []
#     for lang in languages:
#         with open(path+fname+'__lang_'+lang+'.txt', 'r') as f:
#             t = f.read()
#         texts.append(t)
#     return texts


# def crossover(text1, text2):
#     """
#     Returns a new text instance after performing crossover (index: half of text1).
#     First half of text1 + second half of text 2
#     """
#     text1 = text1.split(' ')
#     text2 = text2.split(' ')
    
#     i = int(len(text1)/2)
            
#     text1_part1 = text1[:i]
#     text1_part2 = text1[i:]
    
#     text2_part1 = text2[:i]
#     text2_part2 = text2[i:]
    
#     new_text = text1_part1 + text2_part2
    
#     return ' '.join(new_text)


# def perform_crossover(main_text, second_texts):
#     """
#     Crossovers the given main_text with each of the second_texts.
#     Return len(second_texts) augmented instances
#     """
#     assert len(second_texts) <= 16
#     X = []
    
#     for second in second_texts:
#         X.append(crossover(main_text, second))
        
#     return X


# def load_train_data_with_CDA(scenario):
#     """
#     Returns X and Y for training, given the scenario. Data is augmented 16 folds using our CDA technique.
#     | Back translation (4) | Crossover (10 or 12) | Proper Name substitution & deletion (2 or 0) |
#     """
#     print("Generate 16 new instances per instance using CDA")
#     X, Y, IDs = load_train_data(scenario, return_ids=True)

#     augmented_X = X.tolist()
#     augmented_Y = Y.tolist()
    
#     for instance, label, ID in zip(X, Y, IDs):
#         print("X so far:", len(augmented_X), "| Y so far:", len(augmented_Y))#, "| Y:", augmented_Y)
        
#         # BackTranslation: 4 back-translated augmented instances:
#         fname = ID + '__' + label
#         translated = back_translation(fname)
#         augmented_X.extend(translated)
        
#         # Check for Proper Names
#         ner_tagged = proper_name_present(instance)
#         if ner_tagged == -1:
#             print("No proper names present in", ID)
#             N_crossover = 12
        
#         else: # if Proper Names exist, get 2 augmented instances (substitute all/delete all)
#             pn_deleted = proper_names(ner_tagged, action='delete')
#             pn_substituted = proper_names(ner_tagged, action='substitute')
#             augmented_X.append(pn_deleted)
#             augmented_X.append(pn_substituted)
#             N_crossover = 10
        
#         # Get the rest (10 or 12) crossover augmented instances:
#         random_X = random.sample(list(X), N_crossover)
#         crossed = perform_crossover(instance, random_X)
#         augmented_X.extend(crossed)
        
#         # Add labels:
#         augmented_Y.extend([label]*16)
        
#     return np.array(augmented_X), np.array(augmented_Y)

In [None]:
# Inspired from: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
import torch
import random, time, datetime
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

def format_time(elapsed):
    '''
    Formats a duration given in seconds as h:mm:ss (rounded to the nearest second).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def prepare_dataloader(texts, labels, IDs=None, batch_size=8, max_length=512):
    """
    Takes as input: texts, labels, and corresponding IDs (in case of test-data)
    This function returns a DataLoader object (batches are drawn with a RandomSampler).

    For train_dataloader, labels are passed. For test_dataloader, both labels and IDs are passed.
    BERT tokenizer is used to
      (1) Tokenize the sentence.
      (2) Prepend the `[CLS]` token to the start.
      (3) Append the `[SEP]` token to the end.
      (4) Map tokens to their IDs.
      (5) Pad or truncate the sentence to `max_length`
      (6) Create attention masks for [PAD] tokens.
    Authors recommend a batch size of 16/32 for fine-tuning.

    Parameters:
        texts: iterable of strings to encode.
        labels: integer labels, one per text.
        IDs: optional numeric IDs, one per text (list or numpy array). None or an
             empty sequence means "training data" (no IDs stored in the dataset).
        batch_size: number of instances per batch.
        max_length: length every encoded text is padded/truncated to.

    Returns:
        DataLoader over (input_ids, attention_masks, labels) or, when IDs are given,
        (input_ids, attention_masks, labels, IDs).
    """
    input_ids = []; attention_masks = []

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    for sent in texts:
        encoded_dict = tokenizer.encode_plus(sent, # sentence to encode
                                             add_special_tokens=True, # add '[CLS]' and '[SEP]'
                                             truncation=True,
                                             max_length=max_length, # honour the parameter (was hard-coded to 512)
                                             padding='max_length', # replaces the deprecated pad_to_max_length=True
                                             return_attention_mask=True, # construct attention masks
                                             return_tensors='pt') # return pytorch tensors

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask']) # simply differentiates padding from non-padding

    # Convert to tensors:
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # `IDs == []` is unreliable when IDs is a numpy array (the comparison warns or
    # raises depending on the numpy version), so test for "no IDs" explicitly.
    has_ids = IDs is not None and len(IDs) > 0

    if not has_ids: # for training data
        dataset = TensorDataset(input_ids, attention_masks, labels)
        print("Dataset has input_ids, attention_masks, labels | Length:", len(dataset))

    else: # for test data
        IDs = torch.tensor(IDs)
        print("Dataset has input_ids, attention_masks, labels, and IDs")
        dataset = TensorDataset(input_ids, attention_masks, labels, IDs)
        assert len(dataset) == 198 # sanity check: the test set of these experiments has 198 docs

    data_loader = DataLoader(dataset,
                             sampler=RandomSampler(dataset),
                             batch_size=batch_size)

    print("Input IDs:", input_ids.shape)
    print("Dataset size:", len(dataset))
    return data_loader


def train(data_loader, epochs=3):
    """
    Given the data_loader, it fine-tunes BERT ('bert-base-uncased', 2 labels) for the specific task.
    The BERT authors recommend between 2 and 4 training epochs.

    Parameters:
        data_loader: DataLoader whose batches are (input_ids, attention_masks, labels, ...).
        epochs: number of passes over the data.

    The model and every batch are moved to the module-level `device`, so training also
    works on the CPU fallback. Uses AdamW (lr=2e-5, eps=1e-8) with a linear schedule
    without warmup, and clips the gradient norm to 1.0.

    Returns fine-tuned BERT model.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device) # same device as the batches (model.cuda() failed when no GPU was available)
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    total_steps = len(data_loader) * epochs # total number of training steps is [number of batches] x [number of epochs]
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    total_t0 = time.time() # keep track of time

    for epoch_i in range(0, epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
        t0 = time.time()
        total_train_loss = 0 # reset the total loss for this epoch
        model.train() # put the model into training mode

        for batch in data_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad() # clears any previously calculated gradients before performing a backward pass

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # Index instead of tuple-unpacking: works whether the model returns a plain
            # tuple or a ModelOutput object (unpacking the latter yields its keys).
            loss = outputs[0]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip the norm of the gradients to 1.0 to help prevent the "exploding gradients" problem
            optimizer.step() # update parameters and take a step using the computed gradient
            scheduler.step() # update the learning rate

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(data_loader)
        training_time = format_time(time.time() - t0)

        print("\tAverage training loss: {0:.2f}".format(avg_train_loss))
        print("\tTraining epcoh took: {:}".format(training_time))
    print("\n\nTraining complete\nTotal training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model


def predict(model, data_loader):
    """
    Runs the fine-tuned model over every batch of data_loader (no gradients) and returns
    (flat_predictions, prob_fiction, flat_true_labels, flat_IDs) as numpy arrays, all in
    the order the batches were drawn.

    The model outputs one logit per label (column 0: "non", column 1: "fic"). The
    prediction is the column with the highest logit; prob_fiction is the softmax
    probability of column 1.
    """
    model.eval() # evaluation mode

    logit_chunks, label_chunks, id_chunks = [], [], []

    for batch in data_loader:
        b_input_ids, b_input_mask, b_labels, b_IDs = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logit_chunks.append(outputs[0].detach().cpu().numpy())
        label_chunks.append(b_labels.to('cpu').numpy())
        id_chunks.append(b_IDs.to('cpu').numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)

    # logits -> probabilities; order is [0,1] and 1 is fiction
    probabilities = torch.nn.functional.softmax(torch.from_numpy(all_logits), dim=-1)
    prob_fiction = probabilities[:,1].numpy()

    flat_predictions = np.argmax(all_logits, axis=1).flatten() # label with the highest value

    flat_true_labels = np.concatenate(label_chunks, axis=0)
    flat_IDs = np.concatenate(id_chunks, axis=0)

    return flat_predictions, prob_fiction, flat_true_labels, flat_IDs

# Pick the device used for the model and every batch: the GPU when one is visible, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Seed every random number generator in use (python, numpy, torch CPU and GPU) with the same value.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import numpy as np
from sklearn.metrics import f1_score #, precision_score, recall_score, accuracy_score, average_precision_score

def run_bert():
    """
    Runs the whole BERT experiment on the module-level train/test data:
    1) Prepares the train data loader and fine-tunes the BERT model.
    2) Prepares the test data loader and predicts on the test set.
    3) Prints predictions, labels, IDs and the F1 score (positive label: 1).
    4) Writes the predictions to file via write_predictions.

    Returns nothing.
    """
    # Train/fine-tune:
    train_loader = prepare_dataloader(texts=X_train, labels=labels_train)

    print("Beginning training now..")
    fine_tuned_model = train(train_loader)

    # Predict on test set:
    test_loader = prepare_dataloader(texts=X_test, labels=labels_test, IDs=testIDs_idx)
    preds, probs, gold, ids_idx = predict(fine_tuned_model, test_loader)

    print("Predictions: {}\n\nLabels:{}\n\nIDs_idx:{}".format(preds, gold, ids_idx))
    print("\n\n\n\nF1=", f1_score(gold, preds, pos_label=1))

    write_predictions(ids_idx, probs, preds)



def write_predictions(IDs_idx, prob_fiction, predictions):
    """
    Writes one tab-separated row per test instance to the module-level preds_path:
    fname, probability of fiction, and the label ('fic' if probability >= 0.5, else 'non').

    IDs_idx holds indexes into the module-level test_IDs list. Each row is checked
    (assert) against the corresponding argmax prediction after it has been written.
    """
    print("Write predictions to:", preds_path)

    with open(preds_path, 'w') as f:
        f.write('fname\tprobability_fiction\tlabel\n')

        for index, prob, pred in zip(IDs_idx, prob_fiction, predictions):
            ID = test_IDs[int(index)] # map the numeric index back to the file name

            label, expected_pred = ('fic', 1) if prob >= 0.5 else ('non', 0)
            f.write(ID+'\t'+str(prob)+'\t'+label+'\n')
            assert pred == expected_pred


def labels_str_to_int(Y):
    """
    Given the input labels, it converts them to integers (fiction: 1 | non-fiction: 0)

    Parameters:
        Y: iterable of labels, each either 'fic' or 'non'.

    Returns:
        List of ints, same length and order as Y.

    Raises:
        ValueError: if a label is neither 'fic' nor 'non'. Such labels used to be
        printed and dropped, which silently shifted every following label out of
        alignment with its text.
    """
    labels = []
    for l in Y:
        if l == 'fic':
            labels.append(1)
        elif l == 'non':
            labels.append(0)
        else:
            raise ValueError("Unknown label: {!r} (expected 'fic' or 'non')".format(l))
    return labels

In [None]:
# ---- Experiment configuration (read as globals by the functions defined in the cells above) ----
N_WORDS = 10000 # passage length (in words) requested from get_passage
experiment = 'train_fnames_scenario_dict_exp_6.pickle' # pickle holding the train fnames of every scenario
SCENARIO = 'D' # key into TRAIN_FNAMES selecting which set of train fnames to use

# Output file: <HOME_PATH>words-<N_WORDS>/exp_<k>_BERT_preds_for_Case_<SCENARIO>.tsv
preds_path = HOME_PATH + 'words-' + str(N_WORDS) + '/' + experiment.split('.')[0].split('dict_')[1]+'_BERT_preds_for_Case_'+SCENARIO+'.tsv'
print("Write predictions to:", preds_path)

# NOTE(review): pickle.load executes arbitrary code from the file - only load pickles you created yourself.
with open(HOME_PATH+experiment, 'rb') as f: # contains training novels for each scenario
    TRAIN_FNAMES = pickle.load(f)


# Load training data:
# if CDA: # with CDA
#     X_train, Y_train = load_train_data_with_CDA(SCENARIO)

# else: # without any Data Augmentation
X_train, Y_train = load_train_data(SCENARIO)
X_train = X_train.tolist(); Y_train = Y_train.tolist() # convert to list
labels_train = labels_str_to_int(Y_train) # convert labels to integers

# Test data:
X_test, Y_test, test_IDs = load_test_data()
X_test = X_test.tolist(); Y_test = Y_test.tolist(); test_IDs = test_IDs.tolist() # convert to list

# Sanity check on the expected number of training instances:
if SCENARIO == 'A':
    assert len(X_train) == len(Y_train) == 401
else:
    assert len(X_train) == len(Y_train) == 400

assert len(X_test) == len(Y_test) == 198

labels_test = labels_str_to_int(Y_test) # convert labels to integers
# np.linspace(0, n, n, False) yields the floats 0.0, 1.0, ..., n-1 (endpoint excluded), i.e. one index per test ID.
testIDs_idx = np.linspace(0, len(test_IDs), len(test_IDs), False) # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
print("testIDs indexes:", len(testIDs_idx))

# Fine-tune, predict on the test set and write the predictions to preds_path.
run_bert()

Write predictions to: /content/gdrive/My Drive/txtLAB-2020/bert-run/words-10000/exp_6_BERT_preds_for_Case_D.tsv
We have 178 fiction fnames
Intersection between fic and nonfic fnames: set()
Getting 2 passages from 22 files
Get 2 passages from: /Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for-Literary-Data/data/NovelEnglish_Romance/2015_Lewis,RJ_Leah_ROM.txt
Get 2 passages from: /Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for-Literary-Data/data/NovelEnglish_Romance/2014_Raine,Alice_TheDarknessWithinHim_ROM.txt
Get 2 passages from: /Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for-Literary-Data/data/NovelEnglish_Romance/2015_Wood,Jessica_PromisetoMarry_ROM.txt
Not enough words.. using all the ones avaialable. Total words: 30496 | Start: 6099 | End: 24396
Get 2 passages from: /Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for-Literary-Data/data/NovelEnglish_Romance/2015_Scott,Ginger_WildReckless_ROM.txt
Get 2 passages from: /Users/sunyambagga/Desktop/txtLAB-2/Augmentation-for



Dataset has input_ids, attention_masks, labels | Length: 400
Input IDs: torch.Size([400, 512])
Dataset size: 400
Beginning training now..


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

	Average training loss: 0.27
	Training epcoh took: 0:00:40
	Average training loss: 0.06
	Training epcoh took: 0:00:41
	Average training loss: 0.01
	Training epcoh took: 0:00:43


Training complete
Total training took 0:02:05 (h:mm:ss)




Dataset has input_ids, attention_masks, labels, and IDs
Input IDs: torch.Size([198, 512])
Dataset size: 198
Predictions: [1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0
 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1
 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0
 0 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0
 1 0 0 0 0 1 1 1 1 1 0 1 1]

Labels:[1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1
 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 0 1 0
 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1
 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1
 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1
 1 0 0 1 0 1 1 1 1 1 0 1 1]

IDs_idx:[  3.  43.  68. 108. 140. 110. 162.  76. 148. 166. 188. 