### BERT for Gender experiments
- We used Google Colaboratory for free GPU access.

In [5]:
import os

# Root folder of the experiment on the mounted Google Drive; every other path
# in this notebook is built by appending to it (note the trailing slash).
HOME_PATH = '/content/gdrive/My Drive/txtLAB-2020/bert-gender/'

# List the folder contents as a quick check that the path is reachable.
os.listdir(HOME_PATH)

['NovelEnglish_Contemporary_Meta.csv',
 'data',
 'BERT_predictions_Male_0.tsv',
 'BERT_predictions_Male_10.tsv',
 'BERT_predictions_Male_20.tsv',
 'BERT_predictions_Male_30.tsv',
 'BERT_predictions_Male_40.tsv',
 'BERT_predictions_Male_50.tsv']

In [6]:
# Dataset Loader for both train and test set
import numpy as np
import pandas as pd
import random; random.seed(41)
from nltk.tokenize import word_tokenize


# Folder that the train/test file loaders below read from.
DATA_PATH = HOME_PATH + 'data/'

# HOME_PATH already ends with '/', so the file name is appended directly
# (the previous '/…' prefix produced a doubled separator in the path).
df = pd.read_csv(HOME_PATH + 'NovelEnglish_Contemporary_Meta.csv')

# Keep only the rows whose author gender is recorded as 'F' or 'M'.
df = df.loc[df['Author_Gender'].isin(['F','M'])]

# Lookup table: metadata ID -> author gender ('F' or 'M').
# The loaders index it with the bare file name of each fiction text.
GENDER_DICT = dict(zip(df.ID, df.Author_Gender))
print("{} files in Gender Dictionary.".format(len(GENDER_DICT)))


def get_passage(fname, two_passages=False, three_passages=False, N=500):
    """
    Read the text file `fname`, tokenize it with `word_tokenize`, and return
    consecutive N-token passages taken after skipping the first 30% of tokens.

    - two_passages=True:   returns a list of two passages (takes precedence
                           over `three_passages`).
    - three_passages=True: returns a list of three passages and prints the
                           token count of each.
    - otherwise:           returns a single passage as a string.

    Each passage is the space-joined tokens. Passages are back-to-back; no
    check is made that the file is long enough to fill them.
    """
    skip_fraction = 0.3
    with open(fname, 'r') as handle:
        tokens = word_tokenize(handle.read())

    first = int(skip_fraction * len(tokens))

    def chunk(k):
        # k-th consecutive window of N tokens after the skipped prefix
        offset = first + k * N
        return tokens[offset:offset + N]

    if two_passages:
        return [' '.join(chunk(k)) for k in range(2)]

    if three_passages:
        chunks = [chunk(k) for k in range(3)]
        print("Words1: {} | Words2: {} | Words3: {}".format(*(len(c) for c in chunks)))
        return [' '.join(c) for c in chunks]

    return ' '.join(chunk(0))


######## Train Set ########
def load_train_fnames():
    """
    List the training files.

    Returns a pair (fiction_fnames, non_fiction_fnames); each entry is the
    directory path concatenated with a name returned by os.listdir.
    Also prints how many entries were found in each directory.
    """
    fiction_dir = DATA_PATH+'Train/NovelEnglish_Mystery/'
    non_fiction_dir = DATA_PATH+'Train/NonNovel_English_Contemporary_Mixed/'

    fiction_fnames = [fiction_dir + entry for entry in os.listdir(fiction_dir)]
    non_fiction_fnames = [non_fiction_dir + entry for entry in os.listdir(non_fiction_dir)]

    print("Train Fiction fnames:", len(fiction_fnames), "| Train Non-Fiction fnames:", len(non_fiction_fnames))
    return fiction_fnames, non_fiction_fnames


def load_train_data(male_pct, return_ids=False):
    """
    Build the training set: fiction passages split by author gender plus
    two passages from every non-fiction file.

    Parameters
    ----------
    male_pct : float between 0 and 1
        Share of the 200 fiction passages that should come from male authors;
        the remainder (200 - male_pct*200) comes from female authors.
    return_ids : bool
        When True, a third array with one ID string per passage is returned.

    Returns
    -------
    (X, Y) or (X, Y, IDs) as numpy arrays. Y holds "fic"/"non". Fiction IDs
    look like '<gender>_fic_<k>____<file name>'; non-fiction IDs look like
    'non<k>____<file name>'.

    Fiction files normally contribute two passages each. When a gender does
    not have enough files to reach its target with two passages per file,
    the first `target - 2*n_files` files of that gender contribute three.
    """
    fiction_fnames, non_fiction_fnames = load_train_fnames()

    MALE_FIC = male_pct*200
    FEMALE_FIC = 200 - MALE_FIC

    print("Target for Male Fiction: {} | Target for Female Fiction: {}".format(MALE_FIC, FEMALE_FIC))

    X = [] # list of training texts
    Y = [] # corresponding list of training labels
    IDs = [] # corresponding list of unique IDs

    # Partition the fiction files by the author's gender.
    male_fic_fnames, female_fic_fnames = [], []
    for fname in fiction_fnames:
        txt = fname.split('/')[-1]
        if GENDER_DICT[txt] == 'M':
            male_fic_fnames.append(fname)
        elif GENDER_DICT[txt] == 'F':
            female_fic_fnames.append(fname)
        else:
            print("Not possible!")

    # Number of files per gender that must yield three passages instead of two.
    N_three_fic_male = int(max(0, MALE_FIC-len(male_fic_fnames)*2))
    N_three_fic_female = int(max(0, FEMALE_FIC-len(female_fic_fnames)*2))

    male_counter, female_counter = 0, 0

    print("\nWe have {} male-fiction files and {} female-fiction files".format(len(male_fic_fnames), len(female_fic_fnames)))
    print("\n\nFor MALE: we need 2 passages from <= {} and 3 passages from {} files.".format(len(male_fic_fnames)-N_three_fic_male, N_three_fic_male))
    print("For FEMALE: we need 2 passages from <= {} and 3 passages from {} files.\n".format(len(female_fic_fnames)-N_three_fic_female, N_three_fic_female))

    def add_fiction(fname, expected_gender, n_passages):
        """Append n_passages (2 or 3) passages of `fname` to X/Y/IDs; return n_passages."""
        # Derive the file name from *this* fname. Previously the IDs reused the
        # `txt` left over from the partition loop above, so every fiction ID
        # carried the name of the last file listed.
        txt = fname.split('/')[-1]
        g = GENDER_DICT[txt]
        assert g == expected_gender
        if n_passages == 3:
            passages = get_passage(fname, three_passages=True)
        else:
            passages = get_passage(fname, two_passages=True)
        X.extend(passages)
        for k in range(1, n_passages + 1):
            Y.append("fic")
            IDs.append(g+'_fic_{}____'.format(k)+txt)
        return n_passages

    if N_three_fic_male != 0:
        print("Get 3 passages from {} files: male".format(N_three_fic_male))
        for fname in male_fic_fnames[:N_three_fic_male]:
            male_counter += add_fiction(fname, 'M', 3)

    if N_three_fic_female != 0:
        print("Get 3 passages from {} files: female".format(N_three_fic_female))
        for fname in female_fic_fnames[:N_three_fic_female]:
            female_counter += add_fiction(fname, 'F', 3)

    for fname in male_fic_fnames[N_three_fic_male:]:
        # '>=' rather than '==': the counter advances in steps of two and the
        # target is a float, so an exact match is not guaranteed.
        if male_counter >= MALE_FIC:
            print("Reached male target. Break", male_counter)
            break
        male_counter += add_fiction(fname, 'M', 2)

    for fname in female_fic_fnames[N_three_fic_female:]:
        if female_counter >= FEMALE_FIC:
            print("Reached female target. Break", female_counter)
            break
        female_counter += add_fiction(fname, 'F', 2)

    for fname in non_fiction_fnames: # need two passages per txt
        X.extend(get_passage(fname, two_passages=True))
        Y.append("non")
        Y.append("non")
        IDs.append('non1____'+fname.split('/')[-1])
        IDs.append('non2____'+fname.split('/')[-1])

    if return_ids:
        return np.array(X), np.array(Y), np.array(IDs)
    else:
        return np.array(X), np.array(Y)


######## Test Set ########
def load_test_fnames():
    """
    List the test files.

    Returns a pair (fiction_fnames, non_fiction_fnames); each entry is the
    directory path concatenated with a name returned by os.listdir.
    Also prints how many entries were found in each directory.

    Intended test design (per the experiment notes): 200 docs, i.e. 100
    non-fiction and 100 fiction passages (two passages from each of 25 'M'
    and 25 'F' fiction files, one passage per non-fiction file).
    """
    test_path = DATA_PATH + 'Test/'
    fiction_dir = test_path+'NovelEnglish_Mystery/'
    non_fiction_dir = test_path+'NonNovel_English_Contemporary_Mixed/'

    fiction_fnames = [fiction_dir + entry for entry in os.listdir(fiction_dir)]
    non_fiction_fnames = [non_fiction_dir + entry for entry in os.listdir(non_fiction_dir)]

    print("Test Fiction fnames:", len(fiction_fnames), "| Test Non-Fiction fnames:", len(non_fiction_fnames))
    return fiction_fnames, non_fiction_fnames


def load_test_data():
    """
    Build the test set and return (X, Y, IDs) as numpy arrays.

    Every fiction file contributes two passages labelled "fic" with IDs
    '<gender>_fic_1____<file name>' and '<gender>_fic_2____<file name>';
    every non-fiction file contributes one passage labelled "non" with ID
    'non____<file name>'.
    """
    fiction_fnames, non_fiction_fnames = load_test_fnames()

    texts, labels, ids = [], [], []

    for path in fiction_fnames:
        base = path.rsplit('/', 1)[-1]
        gender = GENDER_DICT[base]
        texts += get_passage(path, two_passages=True)
        labels += ["fic", "fic"]
        ids += [gender+'_fic_1____'+base, gender+'_fic_2____'+base]

    for path in non_fiction_fnames:
        texts.append(get_passage(path))
        labels.append("non")
        ids.append('non____'+path.split('/')[-1])

    return np.array(texts), np.array(labels), np.array(ids)

2064 files in Gender Dictionary.


In [7]:
# Inspired from: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
import torch
import random, time, datetime
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

def format_time(elapsed):
    """
    Render a duration given in seconds as the string form of a
    datetime.timedelta (h:mm:ss), after rounding to whole seconds.
    """
    whole_seconds = int(round(elapsed))
    return "{}".format(datetime.timedelta(seconds=whole_seconds))

def prepare_dataloader(texts, labels, IDs=None, batch_size=8, max_length=512):
    """
    Tokenize `texts` with the 'bert-base-uncased' tokenizer and wrap the
    result in a randomly-sampled DataLoader.

    Parameters
    ----------
    texts : iterable of str
        Passages to encode.
    labels : sequence of int
        One integer label per text.
    IDs : sequence of numbers, optional
        Numeric identifiers, one per text. Pass them for test data; leave as
        None (or empty) for training data.
    batch_size : int
        Batch size of the returned DataLoader (the BERT authors recommend
        16/32 for fine-tuning).
    max_length : int
        Every text is padded or truncated to this many tokens.

    Returns
    -------
    DataLoader over (input_ids, attention_mask, labels) for training data, or
    (input_ids, attention_mask, labels, IDs) when IDs are given.

    The tokenizer
      (1) tokenizes the sentence,
      (2) prepends the `[CLS]` token,
      (3) appends the `[SEP]` token,
      (4) maps tokens to their IDs,
      (5) pads or truncates to `max_length`,
      (6) builds the attention mask that marks [PAD] tokens.
    """
    input_ids = []; attention_masks = []

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    for sent in texts:
        encoded_dict = tokenizer.encode_plus(sent, # sentence to encode
                                             add_special_tokens=True, # add '[CLS]' and '[SEP]'
                                             truncation=True,
                                             max_length=max_length, # honour the parameter (was hard-coded to 512)
                                             padding='max_length', # replaces the deprecated pad_to_max_length=True
                                             return_attention_mask=True, # construct attention masks
                                             return_tensors='pt') # return pytorch tensors

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask']) # simply differentiates padding from non-padding

    # Convert to tensors:
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # `IDs == []` is unreliable when IDs is a numpy array, so test for
    # "missing or empty" explicitly instead.
    if IDs is None or len(IDs) == 0: # for training data
        dataset = TensorDataset(input_ids, attention_masks, labels)
        print("Dataset has input_ids, attention_masks, labels | Length:", len(dataset))

    else: # for test data
        IDs = torch.tensor(IDs)
        print("Dataset has input_ids, attention_masks, labels, and IDs")
        dataset = TensorDataset(input_ids, attention_masks, labels, IDs)
        assert len(dataset) == 200 # sanity check: the test set of this experiment has 200 passages

    data_loader = DataLoader(dataset,
                             sampler=RandomSampler(dataset),
                             batch_size=batch_size)

    print("Input IDs:", input_ids.shape)
    print("Dataset size:", len(dataset))
    return data_loader


def train(data_loader, epochs=3):
    """
    Fine-tune 'bert-base-uncased' for two-class sequence classification.

    Parameters
    ----------
    data_loader : DataLoader
        Yields batches whose first three items are input ids, attention
        mask and labels (as produced by `prepare_dataloader` for training).
    epochs : int
        Number of passes over the data (the BERT authors recommend 2-4).

    Returns
    -------
    The fine-tuned BertForSequenceClassification model, left on `device`.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    # Put the model on the same device the batches are moved to below;
    # a hard-coded .cuda() fails when the notebook fell back to the CPU.
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    total_steps = len(data_loader) * epochs # total number of training steps is [number of batches] x [number of epochs]
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    total_t0 = time.time() # keep track of time

    for epoch_i in range(0, epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
        t0 = time.time()
        total_train_loss = 0 # reset the total loss for this epoch
        model.train() # put the model into training mode

        for batch in data_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad() # clears any previously calculated gradients before performing a backward pass

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # Index instead of tuple-unpacking: works whether the model returns
            # a plain tuple or a ModelOutput, and matches how predict() reads it.
            loss = outputs[0]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip the norm of the gradients to 1.0 to help prevent the "exploding gradients" problem
            optimizer.step() # update parameters and take a step using the computed gradient
            scheduler.step() # update the learning rate

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(data_loader)
        training_time = format_time(time.time() - t0)

        print("\tAverage training loss: {0:.2f}".format(avg_train_loss))
        print("\tTraining epcoh took: {:}".format(training_time))
    print("\n\nTraining complete\nTotal training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model


def predict(model, data_loader):
    """
    Run `model` over every batch of `data_loader` without gradient tracking.

    Each batch must hold four tensors: input ids, attention mask, labels, IDs.

    Returns a 4-tuple of numpy arrays, all in the order the batches were seen:
      - predicted class per example (argmax over the two logits),
      - probability of class 1 (fiction), i.e. softmax over the logits,
      - the true labels,
      - the IDs that travelled with each example.
    """
    model.eval() # evaluation mode

    logit_chunks, label_chunks, id_chunks = [], [], []

    for batch in data_loader:
        b_input_ids, b_input_mask, b_labels, b_IDs = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # first element of the model output holds the logits
        logit_chunks.append(outputs[0].detach().cpu().numpy())
        label_chunks.append(b_labels.to('cpu').numpy())
        id_chunks.append(b_IDs.to('cpu').numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)

    # logits -> probabilities; column 1 is the fiction class
    probs = torch.nn.functional.softmax(torch.from_numpy(all_logits), dim=-1)
    prob_fiction = probs[:,1].numpy()

    flat_predictions = np.argmax(all_logits, axis=1).flatten()
    flat_true_labels = np.concatenate(label_chunks, axis=0)
    flat_IDs = np.concatenate(id_chunks, axis=0)

    return flat_predictions, prob_fiction, flat_true_labels, flat_IDs

# Choose the compute device: fall back to the CPU when CUDA is unavailable,
# otherwise use the GPU and report what was found.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Seed every random number generator used here with the same value.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [8]:
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score #, precision_score, recall_score, accuracy_score, average_precision_score

def run_bert():
    """
    End-to-end run on the module-level train/test variables:
    1) build the training DataLoader and fine-tune BERT on it,
    2) build the test DataLoader and predict on it,
    3) print predictions, labels, ID indexes and the F1 score (positive
       class = 1), then write the predictions to disk.
    """
    train_loader = prepare_dataloader(texts=X_train, labels=labels_train)

    print("Beginning training now..")
    fine_tuned = train(train_loader)

    test_loader = prepare_dataloader(texts=X_test, labels=labels_test, IDs=testIDs_idx)
    preds, fiction_probs, gold, idx = predict(fine_tuned, test_loader)

    report = "Predictions: {}\n\nLabels:{}\n\nIDs_idx:{}".format(preds, gold, idx)
    print(report)

    score = f1_score(gold, preds, pos_label=1)
    print("\n\n\n\nF1=", score)

    write_predictions(idx, fiction_probs, preds)



def write_predictions(IDs_idx, prob_fiction, predictions, path=None, id_lookup=None):
    """
    Write one TSV row per test example: ID, probability of fiction, label.

    Parameters
    ----------
    IDs_idx : iterable of numbers
        Position of each example in `id_lookup` (converted with int()).
    prob_fiction : iterable of float
        Probability of the fiction class for each example.
    predictions : iterable of int
        Predicted class per example; 1 is written as 'fic', anything else
        as 'non'.
    path : str, optional
        Output file. Defaults to the module-level `preds_path`.
    id_lookup : sequence of str, optional
        ID strings indexed by `IDs_idx`. Defaults to the module-level
        `test_IDs`.

    The label is taken from `predictions` rather than re-derived from
    `prob >= 0.5`: when the two probabilities tie at 0.5 the two rules can
    disagree, which used to raise an AssertionError part-way through the
    file and leave it truncated.
    """
    if path is None:
        path = preds_path
    if id_lookup is None:
        id_lookup = test_IDs

    # Save predictions:
    print("Write predictions to:", path)

    with open(path, 'w') as f:
        f.write('fname\tprobability_fiction\tprediction\n')
        for index, prob, pred in zip(IDs_idx, prob_fiction, predictions):
            ID = id_lookup[int(index)]
            label = 'fic' if pred == 1 else 'non'
            f.write(ID+'\t'+str(prob)+'\t'+label+'\n')


def labels_str_to_int(Y):
    """
    Convert string labels to integers: 'fic' -> 1, 'non' -> 0.

    Any other value is reported with a printed "Error:" line and left out
    of the result, so the output can be shorter than the input.
    """
    encoded = []
    for label in Y:
        if label not in ('fic', 'non'):
            print("Error:", label)
            continue
        encoded.append(1 if label == 'fic' else 0)
    return encoded

In [13]:
# Experiment driver: pick the male/female split of the fiction training data,
# load train and test sets, and run fine-tuning + prediction.

# Percentage of fiction training passages written by male authors
# (passed to load_train_data as male_pct = MALE/100).
MALE = 100

female = 100 - MALE
# Output file for this scenario; read as a global by write_predictions().
preds_path = HOME_PATH + 'BERT_predictions_Male_'+str(MALE)+'.tsv'
print("Write predictions to:", preds_path)


print("Running BERT for Male: {}% and Female: {}%".format(MALE, female))
X_train, Y_train, train_IDs = load_train_data(male_pct=int(MALE)/100, return_ids=True)
# First five characters of each ID (e.g. 'M_fic', 'F_fic', 'non1_') -- used only to print a breakdown.
t = [i[:5] for i in train_IDs]
print("X_train: {} | Y_train: {} | Y Distribution: {} | Gender Dist: {}".format(len(X_train), len(Y_train), Counter(Y_train), Counter(t)))
# Sanity check: the training set is expected to hold exactly 400 passages.
assert len(X_train) == len(Y_train) == 400

X_train = X_train.tolist(); Y_train = Y_train.tolist() # convert to list
labels_train = labels_str_to_int(Y_train) # convert labels to integers

# Test data:
X_test, Y_test, test_IDs = load_test_data()
# Same five-character ID prefix trick as above, for the printed breakdown.
t = [i[:5] for i in test_IDs]
print("Test Set ---- X: {} | Y: {} | Distribution: {} | Gender dist in test: {} | Test IDs: {}, preview: {}".format(len(X_test), len(Y_test), Counter(Y_test), Counter(t), len(test_IDs), test_IDs[:3]))
# Sanity check: the test set is expected to hold exactly 200 passages.
assert len(X_test) == len(Y_test) == 200

X_test = X_test.tolist(); Y_test = Y_test.tolist(); test_IDs = test_IDs.tolist() # convert to list
labels_test = labels_str_to_int(Y_test) # convert labels to integers
# With endpoint=False this yields the floats 0.0, 1.0, ..., len(test_IDs)-1;
# write_predictions() converts each back with int() before indexing test_IDs.
testIDs_idx = np.linspace(0, len(test_IDs), len(test_IDs), False) # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
print("testIDs indexes:", len(testIDs_idx))

# Reads X_train, labels_train, X_test, labels_test and testIDs_idx as globals.
run_bert()

Write predictions to: /content/gdrive/My Drive/txtLAB-2020/bert-gender/BERT_predictions_Male_100.tsv
Running BERT for Male: 100% and Female: 0%
Train Fiction fnames: 180 | Train Non-Fiction fnames: 100
Target for Male Fiction: 200.0 | Target for Female Fiction: 0.0

We have 85 male-fiction files and 95 female-fiction files


For MALE: we need 2 passages from <= 55 and 3 passages from 30 files.
For FEMALE: we need 2 passages from <= 95 and 3 passages from 0 files.

Get 3 passages from 30 files: male
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Words2: 500 | Words3: 500
Words1: 500 | Wo



Dataset has input_ids, attention_masks, labels | Length: 400
Input IDs: torch.Size([400, 512])
Dataset size: 400
Beginning training now..


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

	Average training loss: 0.50
	Training epcoh took: 0:00:23
	Average training loss: 0.17
	Training epcoh took: 0:00:23
	Average training loss: 0.10
	Training epcoh took: 0:00:22


Training complete
Total training took 0:01:08 (h:mm:ss)




Dataset has input_ids, attention_masks, labels, and IDs
Input IDs: torch.Size([200, 512])
Dataset size: 200
Predictions: [1 0 1 0 1 1 0 1 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 1 0 1 1
 0 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0
 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1
 1 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1
 1 0 0 1 0 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 1
 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1]

Labels:[1 0 1 0 1 1 0 1 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 1 0 1 1
 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0
 0 1 1 0 0 0 1 0 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1
 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1
 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0 1 0 1 0 0 1
 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0]

IDs_idx:[ 65. 139.  84. 152.  89.  50. 177.  10. 146.   