In [26]:
import torch
import torch.nn as nn
import numpy
from pytorch_transformers import *
import torch.optim as optim
import torch.nn.functional as F

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups

## Classify 3 categories of news articles

In [27]:
# Newsgroups to classify. A four-class variant of this experiment would also
# include 'talk.politics.guns'.
categories = [
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the training split restricted to those groups. Shuffling uses a fixed
# random_state so the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

#### 1777 total documents across the three categories

In [28]:
# Report what was loaded: the category names first, then the number of
# documents and the number of backing files (one count per line).
print(twenty_train.target_names)
for collection in (twenty_train.data, twenty_train.filenames):
    print(len(collection))

['comp.graphics', 'sci.med', 'soc.religion.christian']
1777
1777


#### Sample document

In [29]:
# Show one raw document on a single line by replacing newlines with spaces.
sample_document = twenty_train.data[2]
sample_document.replace('\n', ' ')



## Format documents for BERT
Add [CLS] at start and [SEP] at end

In [30]:
# Number of documents to use; len(twenty_train.data) would select all of them.
num_docs = 100

# Put every selected document on one line and wrap it in the [CLS] / [SEP]
# markers.
docs = [
    "[CLS] " + twenty_train.data[idx].replace('\n', ' ') + " [SEP]"
    for idx in range(num_docs)
]

# Class labels for the same documents, in the same order as `docs`.
targets = [twenty_train.target[idx] for idx in range(num_docs)]

In [31]:
# Load the pre-trained tokenizer (vocabulary) for the checkpoint named below.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

## Pad / truncate inputs to BERT's input length of 512 tokens

In [32]:
max_sequence_length = 512

# Each document is a single segment, so one shared list of segment ids is
# reused for every row.
# NOTE(review): BERT's usual convention for single-segment input is token
# type 0 - confirm whether 1 is intended here.
segment_ids = [1] * max_sequence_length

inputs = []
for doc in docs:
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(doc))

    if len(indexed_tokens) > max_sequence_length:
        # Too long: keep the leading ids and re-attach the final id (the
        # closing [SEP]) so the row is exactly max_sequence_length long.
        indexed_tokens_padded = (
            indexed_tokens[:max_sequence_length - 1] + indexed_tokens[-1:]
        )
        input_mask = [1] * max_sequence_length
    else:
        # Short enough: right-pad with id 0 and mask the padding out.
        padding_length = max_sequence_length - len(indexed_tokens)
        indexed_tokens_padded = indexed_tokens + [0] * padding_length
        input_mask = [1] * len(indexed_tokens) + [0] * padding_length

    # Row layout: [token ids, attention mask, segment ids].
    inputs.append([indexed_tokens_padded, input_mask, segment_ids])

# Stack into a single (num_docs, 3, max_sequence_length) tensor.
inputs_tensor = torch.tensor(inputs)

Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1114 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (736 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

In [33]:
# Sanity check of the stacked input dimensions.
inputs_tensor.size()

torch.Size([100, 3, 512])

## Build Neural Network Architecture on Top of BERT

In [34]:
class NewsClassifier(nn.Module):
    """Feed-forward classification head applied to a fixed-size feature vector.

    Three hidden ``Linear -> ReLU -> Dropout`` stages are followed by a final
    linear layer that produces one raw score (logit) per class. No softmax is
    applied.

    Args:
        input_dim: Size of each input feature vector (default 768).
        hidden_dim: Width of every hidden layer (default 256).
        num_classes: Number of output scores (default 3).
        dropout: Dropout probability used after each hidden layer
            (default 0.4).
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_classes=3, dropout=0.4):
        super().__init__()
        # Layer names and creation order are kept stable so existing
        # state dicts and seeded initialisation remain compatible.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return unnormalised class scores for ``x``.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``num_classes``.
        """
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(hidden_layer(x)))
        return self.fc4(x)


In [35]:
# Pre-trained BERT encoder, switched to evaluation mode straight after loading.
bert_model = BertModel.from_pretrained('bert-base-uncased').eval()

# Classification head that will be trained on BERT's outputs.
news_classifier = NewsClassifier()

In [36]:
# Cross-entropy loss between the classifier's scores and the class labels.
criterion = nn.CrossEntropyLoss()

# Only the classifier head's parameters are handed to the optimizer.
learning_rate = 0.004
optimizer = optim.Adam(params=news_classifier.parameters(), lr=learning_rate)

## Training Loop for 20 Epochs

In [37]:
# Split the stacked inputs back into the three tensors BERT consumes.
# Row layout is [token ids, attention mask, segment ids].
example_tokens_tensor = inputs_tensor[:, 0, :]
example_input_mask = inputs_tensor[:, 1, :]
example_segment_ids = inputs_tensor[:, 2, :]

# The labels never change, so build the target tensor once.
targets_tensor = torch.tensor(targets, dtype=torch.long)

# BERT is not trained here: it runs in eval mode under no_grad and its
# parameters are not given to the optimizer. Its output is therefore the same
# for every epoch, so the forward pass is done once instead of per epoch.
bert_model.eval()
with torch.no_grad():
    encoded_layers = bert_model(
        example_tokens_tensor,
        token_type_ids=example_segment_ids,
        attention_mask=example_input_mask,
    )[0]

# 0th element because we only care about the [CLS] token
classifier_inputs = encoded_layers[:, 0, :]

# Make sure dropout is active even if the classifier was switched to eval
# mode before this cell is (re-)run.
news_classifier.train()

for epoch in range(20):  # full-batch pass over the dataset each epoch
    optimizer.zero_grad()

    outputs = news_classifier(classifier_inputs)
    loss = criterion(outputs, targets_tensor)

    loss.backward()
    optimizer.step()

    print("Loss", loss)

print('Finished Training')

Loss tensor(1.1066, grad_fn=<NllLossBackward>)
Loss tensor(1.1063, grad_fn=<NllLossBackward>)
Loss tensor(1.0790, grad_fn=<NllLossBackward>)
Loss tensor(1.0720, grad_fn=<NllLossBackward>)
Loss tensor(1.0284, grad_fn=<NllLossBackward>)
Loss tensor(0.8993, grad_fn=<NllLossBackward>)
Loss tensor(0.8857, grad_fn=<NllLossBackward>)
Loss tensor(0.7891, grad_fn=<NllLossBackward>)
Loss tensor(0.7438, grad_fn=<NllLossBackward>)
Loss tensor(0.5641, grad_fn=<NllLossBackward>)
Loss tensor(0.5295, grad_fn=<NllLossBackward>)
Loss tensor(0.4306, grad_fn=<NllLossBackward>)
Loss tensor(0.3056, grad_fn=<NllLossBackward>)
Loss tensor(0.2870, grad_fn=<NllLossBackward>)
Loss tensor(0.2342, grad_fn=<NllLossBackward>)
Loss tensor(0.2577, grad_fn=<NllLossBackward>)
Loss tensor(0.1698, grad_fn=<NllLossBackward>)
Loss tensor(0.1646, grad_fn=<NllLossBackward>)
Loss tensor(0.1321, grad_fn=<NllLossBackward>)
Loss tensor(0.1251, grad_fn=<NllLossBackward>)
Finished Training


## Function to classify new news articles

In [43]:
def predict_news_article(sentence, bert_model):
    """Classify a piece of text into one of the trained newsgroup categories.

    The text is wrapped in [CLS] / [SEP], tokenized with the notebook-level
    ``tokenizer``, padded or truncated to 512 ids, encoded by ``bert_model``,
    and the [CLS] position's encoding is scored by the notebook-level
    ``news_classifier``.

    Args:
        sentence: Raw text to classify; newlines are replaced by spaces.
        bert_model: Callable BERT encoder taking token ids plus
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            returning a sequence whose first element is the per-token
            encoding.

    Returns:
        The name from ``twenty_train.target_names`` with the highest score.
    """
    max_sequence_length = 512
    pad_token_id = 0

    marked = "[CLS] " + sentence.replace('\n', ' ') + " [SEP]"
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(marked))

    if len(indexed_tokens) > max_sequence_length:
        # Too long: keep the leading ids and re-attach the closing [SEP].
        indexed_tokens = indexed_tokens[:max_sequence_length - 1] + indexed_tokens[-1:]
    num_real_tokens = len(indexed_tokens)
    padding_length = max_sequence_length - num_real_tokens

    # Build each input directly as a (1, 512) tensor. The previous version
    # stacked [tokens, mask, segments] and then unpacked mask and segments in
    # swapped order, so BERT attended to padding at inference time.
    tokens_tensor = torch.tensor([indexed_tokens + [pad_token_id] * padding_length])
    input_mask = torch.tensor([[1] * num_real_tokens + [0] * padding_length])
    # only using one sentence; segment id 1 matches what training fed to BERT
    segment_ids = torch.tensor([[1] * max_sequence_length])

    # Dropout must be off for a deterministic prediction; remember the
    # classifier's mode so an interrupted or later training run is unaffected.
    was_training = news_classifier.training
    news_classifier.eval()
    try:
        with torch.no_grad():
            encoded_layers = bert_model(
                tokens_tensor,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
            )[0]
            # 0th position holds the [CLS] encoding used for classification.
            outputs = news_classifier(encoded_layers[:, 0, :])
    finally:
        news_classifier.train(was_training)

    return twenty_train.target_names[torch.argmax(outputs).item()]

#### Religious Article Example

"We started with Genesis 1 and God's intentions for his world. The story concluded, 'Then God said, 'Let us make people in our image. He made a man out of the dust of the earth and God breathed his spirit into the man. So Adam became a living being. Later God put Adam to sleep and took one of his ribs and made a wife, Eve, for him. God said, 'Rule over the animals … multiply and fill the earth.' Finally God looked at everything he had made and blessed it. He said, 'It is very good.' On the seventh day God rested from his work because he had completed the work of creation.'"

In [44]:
# Religious example: bind the passage to a name, then classify it.
religion_article = "We started with Genesis 1 and God's intentions for his world. The story concluded, 'Then God said, 'Let us make people in our image. He made a man out of the dust of the earth and God breathed his spirit into the man. So Adam became a living being. Later God put Adam to sleep and took one of his ribs and made a wife, Eve, for him. God said, 'Rule over the animals … multiply and fill the earth.' Finally God looked at everything he had made and blessed it. He said, 'It is very good.' On the seventh day God rested from his work because he had completed the work of creation.'"
predict_news_article(religion_article, bert_model=bert_model)

'soc.religion.christian'

#### Science Article Example

Regulators in the United States have already approved deep brain stimulation for the treatment of Parkinson's disease, epilepsy, essential tremor, and obsessive-compulsive disorder. The treatment involves implanting wires into the brain and a stimulator in the chest or abdomen. The stimulator sends small electrical pulses to the wires along a connection lead under the skin. Doctors sometimes refer to the stimulator as a pacemaker. The surgeons implant the wires into areas of the brain that are responsible for the symptoms of the particular condition. In the case of Parkinson's disease, for example, they implant them into the brain area that controls movement.

In [45]:
# Medical science example: bind the passage to a name, then classify it.
science_article = "Regulators in the United States have already approved deep brain stimulation for the treatment of Parkinson's disease, epilepsy, essential tremor, and obsessive-compulsive disorder. The treatment involves implanting wires into the brain and a stimulator in the chest or abdomen. The stimulator sends small electrical pulses to the wires along a connection lead under the skin. Doctors sometimes refer to the stimulator as a pacemaker. The surgeons implant the wires into areas of the brain that are responsible for the symptoms of the particular condition. In the case of Parkinson's disease, for example, they implant them into the brain area that controls movement."
predict_news_article(science_article, bert_model=bert_model)

'sci.med'

#### Computer Graphics Article Example

This paper presents a surface reconstruction algorithm that takes an unoriented point cloud as input and produces an interpolating surface in the form of triangulation. Based on region-growing and Delaunay approaches, this algorithm aims to address the difficulties of reconstruction from point data with imperfections. Starting with a seed triangle from the Delaunay tetrahedron result for input points, the surface is gradually formed by adding the linked Delaunay triangle from the current boundaries one by one. During surface growth, the topology errors and the quantity of the holes generated by adding inappropriate triangles can be reduced by changing the triangle selection criteria and adjusting the addition order of the triangles. We evaluated our method using a wide range of datasets, and this method compares well to popular classic and current algorithms with unoriented input points and triangulated surface output. In addition, to achieve results with a small number of holes on the generated surface, a detection and repair approach is proposed to turn the holes of various shapes into smooth surfaces.

In [46]:
# Computer graphics example: bind the abstract to a name, then classify it.
graphics_article = "This paper presents a surface reconstruction algorithm that takes an unoriented point cloud as input and produces an interpolating surface in the form of triangulation. Based on region-growing and Delaunay approaches, this algorithm aims to address the difficulties of reconstruction from point data with imperfections. Starting with a seed triangle from the Delaunay tetrahedron result for input points, the surface is gradually formed by adding the linked Delaunay triangle from the current boundaries one by one. During surface growth, the topology errors and the quantity of the holes generated by adding inappropriate triangles can be reduced by changing the triangle selection criteria and adjusting the addition order of the triangles. We evaluated our method using a wide range of datasets, and this method compares well to popular classic and current algorithms with unoriented input points and triangulated surface output. In addition, to achieve results with a small number of holes on the generated surface, a detection and repair approach is proposed to turn the holes of various shapes into smooth surfaces."
predict_news_article(graphics_article, bert_model=bert_model)

'comp.graphics'