In [1]:
!pip install transformers



In [2]:
# Mount Google Drive at /content/drive so the files referenced by the path
# constants in this notebook are reachable through the local filesystem.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folder of the project data on the mounted Google Drive. It is kept as a
# plain string with a trailing slash because other cells build file paths from
# it by string concatenation.
DATASET_FOLDER_PATH = '/content/drive/MyDrive/English COVID Tweet Classification/data/english/'

# Dataset files. Every path is derived from DATASET_FOLDER_PATH so that all of
# them point into the same folder (previously the root used "MyDrive" while the
# individual paths hard-coded "My Drive").
ROUGH_TRAIN_DATASET_PATH = DATASET_FOLDER_PATH + 'rough_train_dataset.csv'
ROUGH_VAL_DATASET_PATH = DATASET_FOLDER_PATH + 'rough_val_dataset.csv'
CLEAN_TRAIN_DATASET_PATH = DATASET_FOLDER_PATH + 'combined_clean_train_dataset.csv'
CLEAN_VAL_DATASET_PATH = DATASET_FOLDER_PATH + 'clean_val_dataset.csv'
CLEAN_TEST_DATASET_PATH = DATASET_FOLDER_PATH + 'clean_test_dataset.csv'

# Training hyper-parameters.
TRAIN_EPOCHS = 20
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 1e-5

# Model checkpoint and prediction outputs.
MODEL_FILE_PATH = DATASET_FOLDER_PATH + 'model_roberta-base.bin'
OUTPUT_ANSWER_FILE_PATH = DATASET_FOLDER_PATH + 'answer.txt'
ANSWERS_FOLDER_OUTPUT = DATASET_FOLDER_PATH + 'answers/'

In [4]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex
!pip install sentencepiece

FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo


In [5]:
import collections

import pandas as pd
import torch
from bertviz import model_view
from sklearn import metrics
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AlbertPreTrainedModel, AutoConfig, AutoModel, AutoTokenizer, RobertaModel, RobertaTokenizer

In [6]:
def call_html():
  """Display the HTML/JS snippet that configures RequireJS in the cell output.

  The snippet loads RequireJS from the notebook's static files and registers
  the ``base``, ``d3`` and ``jquery`` module paths. In this notebook it is
  called right before each attention view is rendered.
  """
  # Use the public IPython.display API explicitly instead of relying on the
  # `display` global injected by the kernel and on the internal
  # `IPython.core.display` module path.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [7]:
class RobertaForBinaryClassification(AlbertPreTrainedModel):
    """RoBERTa encoder with a dropout + linear head trained with BCE-with-logits.

    The module owns its tokenizer: in normal mode ``forward`` takes raw text and
    tokenizes it internally; in visualization mode it takes token ids directly
    and additionally returns the encoder's attention weights.

    NOTE(review): the base class is ``AlbertPreTrainedModel`` although the
    encoder is RoBERTa. It is left unchanged here so existing callers and
    checkpoints keep working - confirm whether a RoBERTa base was intended.
    """

    def __init__(self, model_config, num_labels, device):
        """Build the tokenizer, the pretrained encoder and the classification head.

        Args:
            model_config: configuration object; ``hidden_dropout_prob`` and
                ``hidden_size`` are read from it and ``num_labels`` is written
                to it.
            num_labels: output width of the linear head.
            device: device the pretrained encoder is moved to.
        """
        super().__init__(model_config)
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        # Padding reuses the end-of-sequence token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        model_config.num_labels = num_labels
        self.num_labels = num_labels
        # output_attentions=True makes the encoder return its attention maps,
        # which the visualization path of forward() hands back to the caller.
        self.model = RobertaModel.from_pretrained("roberta-base", output_attentions=True).to(device)
        self.dropout = torch.nn.Dropout(model_config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(model_config.hidden_size, num_labels)
        self.loss_fn = BCEWithLogitsLoss()
        # Initialise only the newly created head. Module.apply recurses into
        # every submodule, so applying the initialiser to `self` would also run
        # it over the pretrained encoder that was just loaded.
        self.classifier.apply(self._init_weights)

    def forward(self, text_batch, labels, device, is_visualization=False):
        """Run the encoder and the classification head.

        Args:
            text_batch: when ``is_visualization`` is False, text passed to the
                tokenizer (padded, truncated to at most 512 tokens); when True,
                a tensor of token ids that is fed to the encoder as-is.
            labels: optional target tensor, reshaped to ``(-1, num_labels)``
                for the loss (presumably float, as BCEWithLogitsLoss expects -
                confirm against the training code). ``None`` skips the loss.
            device: device the inputs and labels are moved to.
            is_visualization: selects the token-id input path and enables
                returning the attention weights.

        Returns:
            ``(loss, logits)`` when ``labels`` is given; otherwise
            ``(None, logits, attention)`` where ``attention`` is the encoder's
            last output element in visualization mode and ``None`` otherwise.
            Note the two shapes differ in length.
        """
        if labels is not None:
            labels = labels.to(device)

        attention = None
        if is_visualization:
            outputs = self.model(text_batch.to(device))
            attention = outputs[-1]
        else:
            encoded = self.tokenizer(text_batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
            # A single .to() moves every tensor of the encoding to the device.
            outputs = self.model(**encoded.to(device))

        # Hidden state at position 0 of every sequence feeds the head.
        first_token_state = outputs[0][:, 0]
        logits = self.classifier(self.dropout(first_token_state))

        if labels is None:
            return None, logits, attention

        loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
        return loss, logits

In [8]:
def show_head_view(device, model, tokenizer, sentence_a, sentence_b=None):
    """Render the bertviz model view of ``model``'s attention for one input.

    Args:
        device: device forwarded to ``model``.
        model: callable invoked as
            ``model(input_ids, device=..., labels=None, is_visualization=True)``;
            the last element of its result is used as the attention.
        tokenizer: tokenizer providing ``encode_plus``, ``encode`` and
            ``convert_ids_to_tokens``.
        sentence_a: first (or only) sentence.
        sentence_b: optional second sentence; when given, the view is told
            where sentence B starts.
    """
    inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']

    # Inference only: no autograd graph is needed for a visualization.
    with torch.no_grad():
        attention = model(input_ids, device=device, labels=None, is_visualization=True)[-1]

    input_id_list = input_ids[0].tolist()  # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)

    if sentence_b:
        # The model returns no token_type_ids, so sentence B is located by
        # length instead. This assumes the pair encoding ends with B's tokens
        # followed by exactly one closing special token (BERT/RoBERTa layout).
        sentence_b_len = len(tokenizer.encode(sentence_b, add_special_tokens=False))
        sentence_b_start = len(input_id_list) - sentence_b_len - 1
        model_view(attention, tokens, sentence_b_start=sentence_b_start)
    else:
        model_view(attention, tokens)

In [9]:
torch.backends.cudnn.enabled = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the configuration is needed here, so fetch it directly instead of
# instantiating a full pretrained model just to read its `.config`.
model_config = AutoConfig.from_pretrained("roberta-base")
model = RobertaForBinaryClassification(model_config, 1, device)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# runtime. NOTE: torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(DATASET_FOLDER_PATH + 'model_roberta-base.bin', map_location=device))
model = model.to(device)
# The notebook only runs inference from here on; disable dropout.
model.eval()

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [10]:
# Case: actual label = fake, predicted label = real
# (figure presumably saved as RobertaVisualizationFakeReal.png).
sentence_a = 'Gargling by salt water and inhaling hot water cures COVID19.'
call_html()
show_head_view(device, model, tokenizer, sentence_a, sentence_b=None)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Case: actual label = real, predicted label = fake
# (figure presumably saved as RobertaVisualizationRealFake.png).
sentence_a = 'For more information on symptoms of COVID-19 and what to watch for visit: _URL_.'
call_html()
show_head_view(device, model, tokenizer, sentence_a, sentence_b=None)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Case: actual label = fake, predicted label = fake
# (figure presumably saved as RobertaVisualizationFakeFake.png).
sentence_a = 'Bill Gates predicted coronavirus.'
call_html()
show_head_view(device, model, tokenizer, sentence_a, sentence_b=None)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# Case: actual label = real, predicted label = real
# (figure presumably saved as RobertaVisualizationRealReal.png).
sentence_a = 'States reported ~22k new cases in line with the slow drift downward. _URL_'
call_html()
show_head_view(device, model, tokenizer, sentence_a, sentence_b=None)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>