# From: BERT Fine-Tuning Tutorial with PyTorch
By Chris McCormick and Nick Ryan

*Revised on March 20, 2020 - Switched to `tokenizer.encode_plus` and added validation loss. See [Revision History](https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=IKzLS9ohzGVu) at the end for details.*




In [1]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# The expected name for the first GPU is '/device:GPU:0'; anything else is
# treated as "no GPU" and aborts the cell.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device. 

In [2]:
import torch

# Choose the torch device once; later cells load data onto it.
if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [3]:
# Install the Hugging Face `transformers` package; later cells import
# BertTokenizer, BertForSequenceClassification and AdamW from it.
!pip install transformers



## 2.1. Download & Extract

In [4]:
# Mount Google Drive at /content/gdrive; the dataset read in the next cells
# lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
import pandas as pd

# Load the labelled article workbook from the mounted shared drive.
raw_df = pd.read_excel('/content/gdrive/Shared drives/The Chinese economy project/Factiva data/BERT_Code/English Complete Dataset.xlsx')

# Keep rows with Relevance <= 1 and an article-level sentiment code <= 3.
keep_rows = (raw_df.Relevance <= 1) & (raw_df['Article sentiment'] <= 3)
raw_df = raw_df[keep_rows]

# Preview the first few remaining rows.
raw_df.head()


Unnamed: 0,Year,Sample No.,Relevance,Headline sentiment,First para. number,First para. sentiment,Last para. number,Last para. sentiment,Article sentiment,UniqueID,Headline,First Paragraph,Last Paragraph,Content
1,2000,2,1,1,1.0,3,9.0,3,3,2,Clinton vows accord on Chinese WTO entry,"DAVOS, Switzerland, Jan 29 (AFP) - US Presiden...","""China and Russia both are still going through...",Clinton vows accord on Chinese WTO entry\nDAVO...
3,2000,4,1,3,1.0,3,8.0,1,3,4,Communist China urges middle class to employ m...,"BEIJING, Nov 24 (AFP) - China's Communist gove...",The government is set to boost the household m...,Communist China urges middle class to employ m...
4,2000,5,1,1,1.0,1,5.0,1,1,5,Chinese exports seen surging 23 percent in 2000,"BEIJING, Dec 8 (AFP) - China's annual export v...",The MOFTEC report based its prediction on a gr...,Chinese exports seen surging 23 percent in 200...
5,2000,6,1,3,1.0,3,13.0,4,3,6,"China, Nepal discuss tourism, steer clear of r...","BEIJING, Aug 25 (AFP) - China and Nepal in for...","Some 500,000 mostly western tourists visited N...","China, Nepal discuss tourism, steer clear of r..."
9,2000,10,1,3,1.0,3,20.0,3,3,10,Chinese premier meets EU trade chief as WTO ta...,"BEIJING, May 19 (AFP) - The European Union's t...",Other important issues are insurance licences ...,Chinese premier meets EU trade chief as WTO ta...


In [6]:
# Keep only articles whose headline, first-paragraph, last-paragraph and
# article-level sentiment codes are all <= 3.
#
# `raw_df` (built in the previous cell from the same workbook) is already
# restricted to Relevance <= 1 and 'Article sentiment' <= 3, so the three
# remaining filters are applied to it directly instead of reading the Excel
# file a second time. The resulting rows and index labels are the same as
# filtering a fresh copy of the workbook with all five conditions.
import pandas as pd

second_df = raw_df[
    (raw_df['Headline sentiment'] <= 3)
    & (raw_df['First para. sentiment'] <= 3)
    & (raw_df['Last para. sentiment'] <= 3)
]

# Preview the first few remaining rows.
second_df.head()



Unnamed: 0,Year,Sample No.,Relevance,Headline sentiment,First para. number,First para. sentiment,Last para. number,Last para. sentiment,Article sentiment,UniqueID,Headline,First Paragraph,Last Paragraph,Content
1,2000,2,1,1,1.0,3,9.0,3,3,2,Clinton vows accord on Chinese WTO entry,"DAVOS, Switzerland, Jan 29 (AFP) - US Presiden...","""China and Russia both are still going through...",Clinton vows accord on Chinese WTO entry\nDAVO...
3,2000,4,1,3,1.0,3,8.0,1,3,4,Communist China urges middle class to employ m...,"BEIJING, Nov 24 (AFP) - China's Communist gove...",The government is set to boost the household m...,Communist China urges middle class to employ m...
4,2000,5,1,1,1.0,1,5.0,1,1,5,Chinese exports seen surging 23 percent in 2000,"BEIJING, Dec 8 (AFP) - China's annual export v...",The MOFTEC report based its prediction on a gr...,Chinese exports seen surging 23 percent in 200...
9,2000,10,1,3,1.0,3,20.0,3,3,10,Chinese premier meets EU trade chief as WTO ta...,"BEIJING, May 19 (AFP) - The European Union's t...",Other important issues are insurance licences ...,Chinese premier meets EU trade chief as WTO ta...
10,2000,11,1,1,1.0,1,19.0,3,3,11,Landmark WTO deal to strengthen China's push t...,"SHANGHAI, May 19 (AFP) - The Sino-EU trade dea...",China has yet to seal bilateral trade deals wi...,Landmark WTO deal to strengthen China's push t...


In [7]:
len(raw_df)

1705

In [9]:
# Hold out a random subset of articles for testing (the original note calls
# this "majority method testing"); the remaining rows go on to training.
import numpy as np
np.random.seed(0)  # fixed seed so the draw below is repeatable

# Only rows of second_df with no missing values are eligible to be held out.
second_df = second_df.dropna()

# Hold-out size: one fifth of the rows in raw_df (integer division).
remove_n = len(raw_df) // 5

# Draw remove_n distinct row labels from second_df's index.
drop_indices = np.random.choice(second_df.index, remove_n, replace=False)

# Held-out rows, looked up in raw_df by those labels.
test_dataframe = raw_df.filter(items = drop_indices, axis= 0)

# Everything else in raw_df.
new_raw_df = raw_df.drop(drop_indices)

In [11]:
# Augment Data Method: the headline, first paragraph, last paragraph and full
# article of every row in new_raw_df each become a separate labelled example.
df1 = pd.DataFrame({
    'Sentiment': new_raw_df['Headline sentiment'],
    'Content': new_raw_df['Headline'],
})
df2 = pd.DataFrame({
    'Sentiment': new_raw_df['First para. sentiment'],
    'Content': new_raw_df['First Paragraph'],
})
df3 = pd.DataFrame({
    'Sentiment': new_raw_df['Last para. sentiment'],
    'Content': new_raw_df['Last Paragraph'],
})
df4 = pd.DataFrame({
    'Sentiment': new_raw_df['Article sentiment'],
    'Content': new_raw_df['Content'],
})

# Stack the four views under a fresh 0..n-1 index and discard any example
# that is missing its label or its text.
df = pd.concat([df1, df2, df3, df4], ignore_index=True).dropna()

# Keep only rows whose sentiment code lies in (0, 3].
# (Alternative, positive vs negative only: 0 < Sentiment <= 2.)
df = df[(df.Sentiment > 0) & (df.Sentiment <= 3)]


In [12]:
len(df)

4595

In [13]:
# Downsampling: randomly discard rows labelled 3 to shrink that class.

# method 1 https://stackoverflow.com/questions/28556942/pandas-remove-rows-at-random-without-shuffling-dataset
import numpy as np
np.random.seed(0)  # fixed seed so the draw below is repeatable

# Number of Sentiment == 3 rows to discard.
# NOTE(review): hard-coded; presumably tuned to this dataset so the three
# classes end up roughly equal in size -- confirm whenever the data changes.
remove_n = 1715

# Pick remove_n distinct index labels among the Sentiment == 3 rows.
new_drop_indices = np.random.choice(df[df['Sentiment'] == 3].index, remove_n, replace=False)

# Reassigns df, so this cell is not idempotent: run it once per fresh df.
df = df.drop(new_drop_indices)


In [14]:
# Recode the sentiment codes to class ids in a single pass:
#   2 -> 0,  1 -> 2,  3 -> 1
# (the same net mapping as applying the three replacements one after another
# in that order).
label_recoding = {2: 0, 1: 2, 3: 1}
df['Sentiment'] = df['Sentiment'].replace(label_recoding)

# Number of examples per class after recoding.
df.groupby('Sentiment').count()


Unnamed: 0_level_0,Content
Sentiment,Unnamed: 1_level_1
0,949
1,950
2,981


In [15]:
df.head()

Unnamed: 0,Sentiment,Content
0,2,Clinton vows accord on Chinese WTO entry
1,1,"China, Nepal discuss tourism, steer clear of r..."
3,2,Landmark WTO deal to strengthen China's push t...
5,1,China calls for flexibility in grinding WTO ta...
7,2,Clinton signs landmark China trade bill


In [17]:
from sklearn.model_selection import train_test_split

# Pull the texts and their class ids out of the frame as NumPy arrays.
allsentences = df['Content'].values
alllabels = df['Sentiment'].values

# Shuffled 90/10 split, stratified on the labels, with a fixed seed.
sentences, test_sentences, labels, test_labels = train_test_split(
    allsentences,
    alllabels,
    test_size=0.1,
    random_state=10,
    shuffle=True,
    stratify=alllabels,
)


In [18]:
# Append the 10% split back onto the 90% split, so `sentences` / `labels`
# again hold every example (in the shuffled order produced by the split).
# NOTE(review): after this, test_sentences / test_labels are part of the data
# that is tokenized and split into training/validation below, so they no
# longer form an unseen test set -- confirm this is intended.
sentences = np.concatenate([sentences, test_sentences])
labels = np.concatenate([labels, test_labels])

In [21]:
len(labels)

2880

In [22]:
df['Content'].values

array(['Clinton vows accord on Chinese WTO entry',
       'China, Nepal discuss tourism, steer clear of religion',
       "Landmark WTO deal to strengthen China's push towards reform by Rachel",
       ...,
       'Chinese tourism blow as PM slams racism\nAustralian Chamber of Tourism chairman John Hart says the Chin\xadese tourism market, which contributes $711m a week on average to the economy, has effectively “come to a dead stop” after the corona\xadvirus outbreak prompted the government to shut its borders.\nMr Hart said the loss of the Chin\xadese market was a double blow after a horror bushfire season that had impacted on international tourism, as imagines of the inferno were broadcast across the world.It comes after the Chief Medical Officer raised concerns on Tuesday about xenophobia and racial profiling in Australia as a result\xad of the outbreak, and Scott Morrison and Anthony Albanese both urged Australians to “stand up and speak out” against racism after restaurants and s

# 3. Tokenization & Input Formatting

In this section, we'll transform our dataset into the format that BERT can be trained on.

## 3.1. BERT Tokenizer


To feed our text to BERT, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary.

The tokenization must be performed by the tokenizer included with BERT--the below cell will download this for us. We'll be using the "uncased" version here.


In [23]:
# Install the `sentencepiece` package.
# NOTE(review): none of the visible cells import it directly -- confirm it is
# still required.
!pip install sentencepiece



In [24]:
from transformers import BertTokenizer

# Load the tokenizer that ships with the uncased base BERT checkpoint;
# input text is lower-cased before it is split into tokens.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


Let's apply the tokenizer to one sentence just to see the output.


In [26]:
# Use the first sentence as a worked example and tokenize it once.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

# Show the raw text, its tokens, and the vocabulary id of each token.
print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  Economic slowdown and anti-graft campaign deter purchases of imported cars and jewellery
Tokenized:  ['economic', 'slow', '##down', 'and', 'anti', '-', 'graf', '##t', 'campaign', 'deter', 'purchases', 'of', 'imported', 'cars', 'and', 'jewellery']
Token IDs:  [3171, 4030, 7698, 1998, 3424, 1011, 22160, 2102, 3049, 28283, 17402, 1997, 10964, 3765, 1998, 21545]


When we actually convert all of our sentences, we'll use the `tokenizer.encode` function to handle both steps, rather than calling `tokenize` and `convert_tokens_to_ids` separately. 

Before we can do that, though, we need to talk about some of BERT's formatting requirements.

## 3.2. Required Formatting

The above code left out a few required formatting steps that we'll look at here.

*Side Note: The input format to BERT seems "over-specified" to me... We are required to give it a number of pieces of information which seem redundant, or like they could easily be inferred from the data without us explicitly providing it. But it is what it is, and I suspect it will make more sense once I have a deeper understanding of the BERT internals.*

We are required to:
1. Add special tokens to the start and end of each sentence.
2. Pad & truncate all sentences to a single constant length.
3. Explicitly differentiate real tokens from padding tokens with the "attention mask".



### Special Tokens



**`[SEP]`**

At the end of every sentence, we need to append the special `[SEP]` token. 

This token is an artifact of two-sentence tasks, where BERT is given two separate sentences and asked to determine something (e.g., can the answer to the question in sentence A be found in sentence B?). 

I am not certain yet why the token is still required when we have only single-sentence input, but it is!


**`[CLS]`**

For classification tasks, we must prepend the special `[CLS]` token to the beginning of every sentence.

This token has special significance. BERT consists of 12 Transformer layers. Each transformer takes in a list of token embeddings, and produces the same number of embeddings on the output (but with the feature values changed, of course!).

![Illustration of CLS token purpose](https://drive.google.com/uc?export=view&id=1ck4mvGkznVJfW3hv6GUqcdGepVTOx7HE)

On the output of the final (12th) transformer, *only the first embedding (corresponding to the [CLS] token) is used by the classifier*.

>  "The first token of every sequence is always a special classification token (`[CLS]`). The final hidden state
corresponding to this token is used as the aggregate sequence representation for classification
tasks." (from the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

You might think to try some pooling strategy over the final embeddings, but this isn't necessary. Because BERT is trained to only use this [CLS] token for classification, we know that the model has been motivated to encode everything it needs for the classification step into that single 768-value embedding vector. It's already done the pooling for us!



### Sentence Length & Attention Mask



The sentences in our dataset obviously have varying lengths, so how does BERT handle this?

BERT has two constraints:
1. All sentences must be padded or truncated to a single, fixed length.
2. The maximum sentence length is 512 tokens.

Padding is done with a special `[PAD]` token, which is at index 0 in the BERT vocabulary. The below illustration demonstrates padding out to a "MAX_LEN" of 8 tokens.

<img src="https://drive.google.com/uc?export=view&id=1cb5xeqLu_5vPOgs3eRnail2Y00Fl2pCo" width="600">

The "Attention Mask" is simply an array of 1s and 0s indicating which tokens are padding and which aren't (seems kind of redundant, doesn't it?!). This mask tells the "Self-Attention" mechanism in BERT not to incorporate these PAD tokens into its interpretation of the sentence.

The maximum length does impact training and evaluation speed, however. 
For example, with a Tesla K80:

`MAX_LEN = 128  -->  Training epochs take ~5:28 each`

`MAX_LEN = 64   -->  Training epochs take ~2:57 each`







## 3.3. Tokenize Dataset

The transformers library provides a helpful `encode` function which will handle most of the parsing and data prep steps for us.

Before we are ready to encode our text, though, we need to decide on a **maximum sentence length** for padding / truncating to.

The below cell will perform one tokenization pass of the dataset in order to measure the maximum sentence length.

In [27]:
# Longest encoded length seen so far (special tokens included).
max_len = 0

for sent in sentences:
    # Encode with `[CLS]` / `[SEP]` added, without any padding or truncation.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Remember this length if it beats the current maximum.
    if len(input_ids) > max_len:
        max_len = len(input_ids)

print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (840 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  3342


The longest sentence above (3,342 tokens) is far beyond BERT's 512-token limit, so I'll set the maximum length to 512 and let the tokenizer truncate anything longer.


Now we're ready to perform the real tokenization.

The `tokenizer.encode_plus` function combines multiple steps for us:

1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length.
5. Create the attention masks which explicitly differentiate real tokens from `[PAD]` tokens.

The first four features are in `tokenizer.encode`, but I'm using `tokenizer.encode_plus` to get the fifth item (attention masks). Documentation is [here](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus).


In [28]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    #
    # Truncation is requested explicitly (`truncation=True`) so the tokenizer
    # no longer has to guess and warn, and padding uses the `padding`
    # argument instead of the deprecated `pad_to_max_length` flag. The
    # encoded output is the same as before: every row is exactly 512 ids long.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,          # Fixed length of every row.
                        padding = 'max_length',    # Pad short sentences to 512.
                        truncation = True,         # Cut long sentences to 512.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors (one row per sentence).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  Economic slowdown and anti-graft campaign deter purchases of imported cars and jewellery
Token IDs: tensor([  101,  3171,  4030,  7698,  1998,  3424,  1011, 22160,  2102,  3049,
        28283, 17402,  1997, 10964,  3765,  1998, 21545,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

## 3.4. Training & Validation Split


Divide up our training set to use 90% for training and 10% for validation.

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, random_split

# Bundle ids, masks and labels so that one index fetches all three.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Sizes of a plain 90-10 partition of the dataset. The stratified split
# below is what actually selects the samples.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Stratified, shuffled 90-10 split over row positions (not over the data
# itself), with a fixed seed.
train_idx, valid_idx = train_test_split(
    np.arange(len(labels)),
    test_size=0.1,
    shuffle=True,
    stratify=labels,
    random_state=10,
)

# Report how many samples landed in each split.
print('{:>5,} training samples'.format(len(train_idx)))
print('{:>5,} validation samples'.format(len(valid_idx)))

2,592 training samples
  288 validation samples


We'll also create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [31]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, SubsetRandomSampler

# Batch size used by both loaders. For fine-tuning BERT the authors recommend
# 16 or 32; this notebook uses 8.
batch_size = 8

# Each sampler draws, in random order, only from its own set of row positions
# in the shared `dataset`.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Both loaders wrap the same dataset; the sampler decides which rows they see.
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(dataset, sampler=valid_sampler, batch_size=batch_size)

# 4. Train Our Classification Model

Now that our input data is properly formatted, it's time to fine tune the BERT model. 

## 4.1. BertForSequenceClassification

For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until the entire model, end-to-end, is well-suited for our task. 

Thankfully, the huggingface pytorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accommodate their specific NLP task.  

Here is the current list of classes provided for fine-tuning:
* BertModel
* BertForPreTraining
* BertForMaskedLM
* BertForNextSentencePrediction
* **BertForSequenceClassification** - The one we'll use.
* BertForTokenClassification
* BertForQuestionAnswering

The documentation for these can be found [here](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html).



We'll be using [BertForSequenceClassification](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#bertforsequenceclassification). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. As we feed input data, the entire pre-trained BERT model and the additional untrained classification layer are trained on our specific task. 


OK, let's load BERT! There are a few different pre-trained BERT models available. "bert-base-uncased" means the version that has only lowercase letters ("uncased") and is the smaller version of the two ("base" vs "large").

The documentation for `from_pretrained` can be found [here](https://huggingface.co/transformers/v2.2.0/main_classes/model.html#transformers.PreTrainedModel.from_pretrained), with the additional parameters defined [here](https://huggingface.co/transformers/v2.2.0/main_classes/configuration.html#transformers.PretrainedConfig).

In [32]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # Three output labels: the sentiment classes 0, 1 and 2.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model onto the device selected earlier (`device` is the GPU when
# one is available, otherwise the CPU). Unlike `model.cuda()`, this does not
# fail on a machine without CUDA and keeps the model on the same device the
# rest of the notebook uses. `to` returns the model, so the cell still
# displays its architecture.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## 4.2. Optimizer & Learning Rate Scheduler

Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend choosing from the following values (from Appendix A.3 of the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)):

>- **Batch size:** 16, 32  
- **Learning rate (Adam):** 5e-5, 3e-5, 2e-5  
- **Number of epochs:** 2, 3, 4 

We chose:
* Batch size: 8 (set when creating our DataLoaders)
* Learning rate: 2e-5
* Epochs: 5 (we'll see that this is probably too many...)

The epsilon parameter `eps = 1e-8` is "a very small number to prevent any division by zero in the implementation" (from [here](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/)).

You can find the creation of the AdamW optimizer in `run_glue.py` [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L109).

In [35]:
# Note: this AdamW is the class imported from the huggingface `transformers`
# library (as opposed to the pytorch one).
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # args.learning_rate - default is 5e-5, this notebook uses 2e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8
)


In [36]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data. The BERT authors recommend between
# 2 and 4; this notebook runs 5.
epochs = 5

# One scheduler step per training batch, so the total number of steps is
# [number of batches] x [number of epochs] (not the number of samples).
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warmup steps (the default value in
# run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## 4.3. Training Loop

Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a training phase and a validation phase. 

> *Thank you to [Stas Bekman](https://ca.linkedin.com/in/stasbekman) for contributing the insights and code for using validation loss to detect over-fitting!*

**Training:**
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Clear out the gradients calculated in the previous pass. 
    - In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out.
- Forward pass (feed input data through the network)
- Backward pass (backpropagation)
- Tell the network to update parameters with optimizer.step()
- Track variables for monitoring progress

**Evaluation:**
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Forward pass (feed input data through the network)
- Compute loss on our validation data and track variables for monitoring progress

Pytorch hides all of the detailed calculations from us, but we've commented the code to point out which of the above steps are happening on each line. 

> *PyTorch also has some [beginner tutorials](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py) which you may also find helpful.*

Define a helper function for calculating accuracy.

In [37]:
import numpy as np
from sklearn.metrics import f1_score

def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    preds:  2-D array of per-class scores, one row per example; the predicted
            class is the index of the largest score in each row.
    labels: array of true class ids; it is flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    matches = (predicted == actual)
    return np.sum(matches) / len(actual)

Helper function for formatting elapsed times as `hh:mm:ss`


In [38]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns it as a string.

    The value is rounded to a whole number of seconds and rendered with
    `str(datetime.timedelta)`, e.g. 65.4 -> '0:01:05'.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


We're ready to kick off the training!

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 38
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Best validation accuracy seen so far. A checkpoint is written to the file
# 'best' every time an epoch improves on it.
best_accuracy = 0

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be misled--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time as a formatted string.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # In PyTorch, calling `model` will in turn call the model's `forward` 
        # function and pass down the arguments. The `forward` function is 
        # documented here: 
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        # The results are returned in a results object, documented here:
        # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.SequenceClassifierOutput
        # Only the loss (returned because we provided labels) is needed for
        # the training step.
        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables. Accuracy is tracked as (correct, seen) counts rather
    # than as a sum of per-batch accuracies, so that a smaller final batch is
    # not over-weighted in the epoch's accuracy.
    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this validation batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Count the correct predictions in this batch and the batch size.
        batch_predictions = np.argmax(logits, axis=1).flatten()
        total_eval_correct += np.sum(batch_predictions == label_ids.flatten())
        total_eval_examples += len(label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Keep the weights of the best epoch so far (by validation accuracy).
    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), 'best')
  
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


Let's view the summary of the training process.

In [None]:
import pandas as pd

# Display floats with two decimal places.
# The option is spelled out in full: the abbreviated name 'precision' can
# match more than one pandas option and is then rejected as ambiguous.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()

# 5. Performance On Test Set

Now we'll prepare the held-out test set just as we did with the training set and evaluate the fine-tuned model on it. The tutorial this notebook is adapted from scored its CoLA predictions with the [Matthews correlation coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html), where +1 is the best score and -1 is the worst; the cells below instead report accuracy and per-class precision, recall, and F1 for the sentiment labels.

### 5.1. Data Preparation



We'll need to apply all of the same steps that we did for the training data to prepare our test data set.

## 5.2. Evaluate on Test Set



With the test set prepared, we can apply our fine-tuned model to generate predictions on the test set.

In [None]:
#sentences = test_dataframe['Headline'].values
#labels = test_dataframe['Headline sentiment'].values

In [None]:
#test_dataframe = raw_df.filter(items = drop_indices, axis= 0)

In [None]:
# Display the test DataFrame.
# NOTE(review): `test_dataframe` is not assigned in the nearby cells (the
# line that would build it is commented out) -- confirm it is defined earlier.
test_dataframe

In [None]:
# Non-null counts of every column, grouped by the 'Article sentiment' value.
test_dataframe.groupby('Article sentiment').count()

In [None]:
columns = ['Headline sentiment', 'First para. sentiment', 'Last para. sentiment', 'Article sentiment']

# Recode the labels in every sentiment column. The pairs are applied one
# after another (2 -> 0, then 1 -> 2, then 3 -> 1), so their order matters.
recode_steps = ((2, 0), (1, 2), (3, 1))
for col in columns:
  recoded = test_dataframe[col]
  for old_value, new_value in recode_steps:
    recoded = recoded.replace(old_value, new_value)
  test_dataframe[col] = recoded
  #print(test_dataframe.groupby(col).count())

In [None]:
# Keep only the rows whose headline, first-paragraph and last-paragraph
# sentiment values are all at most 2.
within_range = (
    (test_dataframe['First para. sentiment'] <= 2)
    & (test_dataframe['Last para. sentiment'] <= 2)
    & (test_dataframe['Headline sentiment'] <= 2)
)
test_dataframe = test_dataframe[within_range]
test_dataframe

In [None]:
# Non-null counts per 'Article sentiment' value after the range filter.
test_dataframe.groupby('Article sentiment').count()

In [None]:
import numpy as np

# Rows with any missing value are discarded before sampling.
test_dataframe = test_dataframe.dropna()

# (article-sentiment value, number of rows to remove). The global NumPy RNG
# is re-seeded with 0 before each draw, so both selections are repeatable.
for sentiment_class, remove_n in ((1, 150), (2, 10)):
  np.random.seed(0)
  class_rows = test_dataframe[test_dataframe['Article sentiment'] == sentiment_class]
  drop_indices = np.random.choice(class_rows.index, remove_n, replace=False)
  test_dataframe = test_dataframe.drop(drop_indices)

In [None]:
# Non-null counts per 'Article sentiment' value after the random row removal.
test_dataframe.groupby('Article sentiment').count()

# Majority Method

In [None]:
import pandas as pd
import numpy as np

# Label column and text column for each article section; the two lists are
# paired by position.
columnsSent = ['Headline sentiment', 'First para. sentiment', 'Last para. sentiment', 'Article sentiment']
columnsContent = ['Headline', 'First Paragraph', 'Last Paragraph', 'Content']

# allPredictions[section][batch]   -> array of predicted class ids
# allProbabilities[section][batch] -> array of per-class softmax probabilities
allPredictions = []
allProbabilities = []

# Restore the best checkpoint once, before the loop: the same weights are used
# for every section. `map_location`/`to(device)` follow the notebook's `device`
# instead of assuming CUDA.
model.load_state_dict(torch.load('best', map_location=device))
model.to(device)
# Put model in evaluation mode
model.eval()

# Set the batch size.
batch_size = 8

for index in range(len(columnsSent)):
  sentences = test_dataframe[columnsContent[index]].values
  labels = test_dataframe[columnsSent[index]].values

  # Tokenize all of the sentences and map the tokens to their word IDs.
  input_ids = []
  attention_masks = []

  # For every sentence...
  for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,          # Pad & truncate all sentences.
                        padding = 'max_length',    # Replaces deprecated `pad_to_max_length`.
                        truncation = True,         # Request truncation explicitly.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)

  # Create the DataLoader.
  prediction_data = TensorDataset(input_ids, attention_masks, labels)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  # Prediction on test set
  print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

  # Tracking variables
  predictions , true_labels, probabilities = [], [], []

  # Predict
  total_correct = 0
  idx=0
  for batch in prediction_dataloader:
    idx=idx+1
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)
      # Class probabilities, computed once with torch.
      batch_probabilities = torch.softmax(result.logits, dim=1)

    # Move logits, probabilities and labels to CPU
    logits = result.logits.detach().cpu().numpy()
    batch_probabilities = batch_probabilities.cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions, probabilities and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    probabilities.append(batch_probabilities)

    # Count correct predictions so the accuracy below is per example and a
    # smaller final batch is not over-weighted.
    total_correct += np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten())

  allPredictions.append([np.argmax(prediction, axis=1).flatten() for prediction in predictions])
  allProbabilities.append(probabilities)

  print('    DONE.')
  print(total_correct/len(labels))

In [None]:
# from https://stackoverflow.com/questions/52885949/how-to-handle-no-unique-mode-found-2-equally-common-values-in-below-function
import collections

def get_all_modes(a):
    """Return every value of `a` that occurs most often, in first-seen order."""
    tally = collections.Counter(a)
    highest = max(tally.values())
    modes = []
    for value, occurrences in tally.items():
        if occurrences == highest:
            modes.append(value)
    return modes

In [None]:
# Combine the per-section predictions into one prediction per article by
# majority vote. When several classes tie for the most votes, the class with
# the larger probability summed over all sections wins; on an exact tie of
# those sums the later class in `modes` is kept.
finalPredictions = []
n_sections = len(allPredictions)

for batchNum in range(len(allPredictions[0])):
  for sampleNum in range(len(allPredictions[0][batchNum])):
    votes = [allPredictions[section][batchNum][sampleNum] for section in range(n_sections)]
    modes = get_all_modes(votes)

    winner = modes[0]
    if len(modes) > 1:
      winner_score = sum(allProbabilities[section][batchNum][sampleNum][winner]
                         for section in range(n_sections))
      for candidate in modes[1:]:
        candidate_score = sum(allProbabilities[section][batchNum][sampleNum][candidate]
                              for section in range(n_sections))
        if candidate_score >= winner_score:
          winner, winner_score = candidate, candidate_score

    finalPredictions.append(winner)


In [None]:
# Number of majority-vote predictions collected.
len(finalPredictions)

In [None]:
label_values = test_dataframe['Article sentiment'].values

# Compare array against array, and refuse mismatched lengths: comparing a
# list with an array of a different length does not compare element by
# element, which would silently report a wrong accuracy.
final_prediction_array = np.asarray(finalPredictions)
if final_prediction_array.shape != label_values.shape:
  raise ValueError('Got {} predictions for {} labels'.format(len(final_prediction_array), len(label_values)))

Total_accuracy = np.sum(final_prediction_array == label_values) / len(label_values)
print('Majority Method Accuracy:', Total_accuracy)

In [None]:
# Re-accumulate the per-batch accuracies of the stored `predictions`, printing
# the running sum after every batch and finally the sum divided by `idx`
# (`idx` is left over from the prediction cell).
total_eval_accuracy = 0
for batch_logits, batch_labels in zip(predictions, true_labels):
  total_eval_accuracy += flat_accuracy(batch_logits, batch_labels)
  print(total_eval_accuracy)
print(total_eval_accuracy/idx)

# Regular Accuracy

In [None]:
import pandas as pd

# Create sentence and label lists
sentences = test_dataframe['Content'].values
labels = test_dataframe['Article sentiment'].values

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,          # Pad & truncate all sentences.
                        padding = 'max_length',    # Replaces deprecated `pad_to_max_length`.
                        truncation = True,         # Request truncation explicitly.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 8

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
# Restore the best checkpoint and place the model on the notebook's `device`
# (rather than assuming CUDA is available).
model.load_state_dict(torch.load('best', map_location=device))
model.to(device)
# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
total_correct = 0
idx=0
for batch in prediction_dataloader:
  idx=idx+1
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

  # Count correct predictions so the reported accuracy is per example and a
  # smaller final batch is not over-weighted.
  total_correct += np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten())


print('    DONE.')
print('Regular accuracy:', total_correct/len(labels))

In [None]:
# Of the samples predicted as `target`, how many really are `target`
def precision(ground_truth, pred_labels, target):
  """Return the precision of `pred_labels` for the class `target`.

  ground_truth -- sequence of true labels.
  pred_labels  -- sequence of predicted labels, aligned with `ground_truth`.
  target       -- the class being scored.

  Returns 0.0 when `target` is never predicted (instead of dividing by zero).
  """
  total = 0
  score = 0
  for num in range(len(ground_truth)):
    if pred_labels[num] == target:
      total += 1
      if ground_truth[num] == target:
        score += 1
  if total == 0:
    return 0.0
  return score/total

In [None]:
# Of the samples that really are `target`, how many were predicted as `target`
def recall(ground_truth, pred_labels, target):
  """Return the recall of `pred_labels` for the class `target`.

  ground_truth -- sequence of true labels.
  pred_labels  -- sequence of predicted labels, aligned with `ground_truth`.
  target       -- the class being scored.

  Returns 0.0 when `target` never occurs in `ground_truth` (instead of
  dividing by zero).
  """
  total = 0
  score = 0
  for num in range(len(ground_truth)):
    if ground_truth[num] == target:
      total += 1
      if pred_labels[num] == target:
        score += 1
  if total == 0:
    return 0.0
  return score/total

In [None]:
# Harmonic mean of precision and recall for one class.
# NOTE: this definition replaces the `f1_score` imported from sklearn.metrics
# earlier in the notebook; after this cell the name refers to this function.
def f1_score(ground_truth, pred_labels, target):
  """Return the F1 score of `pred_labels` for the class `target`.

  Returns 0.0 when precision and recall are both zero (instead of dividing
  by zero).
  """
  prec = precision(ground_truth, pred_labels, target)
  rec = recall(ground_truth, pred_labels, target)
  if prec + rec == 0:
    return 0.0
  return 2 * prec * rec / (prec+rec)

In [None]:
# Precision of the majority-vote predictions for class 1.
precision(label_values, finalPredictions, 1)

In [None]:
# Recall of the majority-vote predictions for class 1.
recall(label_values, finalPredictions, 1)

In [None]:
# F1 score of the majority-vote predictions for class 1.
f1_score(label_values, finalPredictions, 1)

# Performance Metrics

In [None]:
from sklearn.metrics import classification_report

# Class ids 0, 1, 2 are reported as negative, neutral, positive.
target_names = ['negative', 'neutral', 'positive']

# Passing `labels` explicitly keeps the three names aligned with the three
# class ids even when one class is missing from both the labels and the
# predictions.
print(classification_report(label_values, finalPredictions, labels=[0, 1, 2], target_names=target_names))
              

In [None]:
# Flatten the per-batch logits and labels kept in `predictions` and
# `true_labels` into one predicted class id and one true label per example.
# (The names avoid `all`, which would shadow the Python builtin.)
flat_predictions = np.concatenate([np.argmax(batch_logits, axis=1).flatten() for batch_logits in predictions])
flat_true_labels = np.concatenate([np.asarray(batch_labels).flatten() for batch_labels in true_labels])

#f1_score(flat_true_labels, flat_predictions, average='micro')

from sklearn.metrics import classification_report

# Class ids 0, 1, 2 are reported as negative, neutral, positive.
target_names = ['negative', 'neutral', 'positive']

# Passing `labels` explicitly keeps the names aligned with the class ids even
# when one class is missing from both arrays.
print(classification_report(flat_true_labels, flat_predictions, labels=[0, 1, 2], target_names=target_names))
              

In [None]:
#model_save_name1 = 'SentimentML128LR2EPS1EPOCH10.zip'
#path = F'/content/gdrive/Shared drives/The Chinese economy project/Factiva data/BERT_Code/EnglishModels/{model_save_name1}'
#torch.save(model.state_dict(), path)

# Loughran McDonald Accuracy

In [None]:
import collections, numpy
# Number of test articles per 'Article sentiment' label.
collections.Counter(label_values)

In [None]:
import pandas as pd
import nltk
# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Read the Loughran-McDonald master dictionary and lower-case its 'Word'
# column.
lm_dictionary_path = '/content/gdrive/Shared drives/The Chinese economy project/Factiva data/BERT_Code/LoughranMcDonald_MasterDictionary_2020.csv'
sentimentTable = pd.read_csv(lm_dictionary_path)
sentimentTable['Word'] = sentimentTable['Word'].str.lower()

In [None]:
# Display the loaded dictionary table.
sentimentTable

In [None]:
# Sanity check: 'Negative' value of the dictionary row matching the third
# token of the first test article. The text is read straight from
# `test_dataframe`, because `test_sentences` is only assigned in a later cell
# and would be undefined here on a fresh top-to-bottom run.
first_article_tokens = nltk.word_tokenize(test_dataframe['Content'].values[0])
sentimentTable[sentimentTable['Word'] == first_article_tokens[2].lower()]['Negative'].values[0]

In [None]:
# Score every test article with the dictionary: +1 for each token whose
# dictionary row has Positive > 0, -1 for each token whose row has
# Negative > 0. A positive total predicts label 2, a negative total label 0
# and a zero total label 1. `results` holds 1 where that prediction equals
# the article's label and 0 otherwise.
results = []
test_sentences = test_dataframe['Content'].values
test_labels = test_dataframe['Article sentiment'].values

# Build the word lookups once instead of filtering the whole table for every
# token. Only the first row of a word is used if the word appears twice.
lexicon = sentimentTable.drop_duplicates(subset='Word', keep='first')
positive_words = set(lexicon.loc[lexicon['Positive'] > 0, 'Word'])
negative_words = set(lexicon.loc[lexicon['Negative'] > 0, 'Word'])

for sentIndex in range(len(test_sentences)):
  current = nltk.word_tokenize(test_sentences[sentIndex])
  total = 0
  for word in current:
    token = word.lower()
    if token in positive_words:
      total += 1
    if token in negative_words:
      total -= 1

  # An article with no tokens gets a score of 0 instead of dividing by zero.
  sentAverage = total/len(current) if len(current) > 0 else 0

  if sentAverage > 0 and test_labels[sentIndex] == 2:
    results += [1]
  elif sentAverage < 0 and test_labels[sentIndex] == 0:
    results += [1]
  elif sentAverage == 0 and test_labels[sentIndex] == 1:
    results += [1]
  else:
    results += [0]





In [None]:
# Share of test articles for which the dictionary-based prediction was right.
print("Accuracy of Loughran McDonald method:", sum(results)/len(results))

# Conclusion

This post demonstrates that with a pre-trained BERT model you can quickly and effectively create a high quality model with minimal effort and training time using the pytorch interface, regardless of the specific NLP task you are interested in.