In [1]:
#import necessary libraries
import torch
from transformers import *
import pandas as pd
import re
import collections
import numpy as np
import json
import time
from tqdm.notebook import tqdm
import torch.nn as nn
import pathlib

#output all items, not just last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

#set device
if torch.cuda.is_available():
    device= "cuda"
else:
    device = "cpu"  
device

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


'cuda'

In [2]:
# define Articles dataset class for easy sampling, iteration, and weight creating
class Articles(torch.utils.data.Dataset):
    def __init__(self, json_file):
        super().__init__()
        with open(json_file, "r") as data_file:
            self.examples = json.loads(data_file.read())

    def __getitem__(self, idx):
        return self.examples[idx]

    def __len__(self):
        return len(self.examples)

    def tokenize(self):
        for idx, example in enumerate(self.examples):
            self.examples[idx]['text'] = re.findall('[\w]+', self.examples[idx]['text'].lower())

    def create_positive_sampler(self, target_publication):
        prob = np.zeros(len(self))
        for idx, example in enumerate(self.examples):
            if example['model_publication'] == target_publication:
                prob[idx] = 1
        return torch.utils.data.WeightedRandomSampler(weights=prob, num_samples=len(self), replacement=True)

    def create_negative_sampler(self, target_publication):
        prob = np.zeros(len(self))
        for idx, example in enumerate(self.examples):
            if example['model_publication'] != target_publication:
                prob[idx] = 1
        return torch.utils.data.WeightedRandomSampler(weights=prob, num_samples=len(self), replacement=True)

    def map_items(self, word_to_id, url_to_id, publication_to_id, filter=False, min_length=0):
        min_length_articles = []
        for idx, example in enumerate(self.examples):
            self.examples[idx]['text'] = [word_to_id.get(word, len(word_to_id)) for word in example['text']]
            self.examples[idx]['text'] = [word for word in example['text'] if word != len(word_to_id)]
            if filter:
                if len(self.examples[idx]['text']) > min_length:
                    min_length_articles.append(self.examples[idx])
            self.examples[idx]['url'] = url_to_id.get(example['url'], url_to_id.get("miscellaneous"))
            self.examples[idx]['model_publication'] = publication_to_id.get(example['model_publication'], publication_to_id.get("miscellaneous"))
        return min_length_articles


In [2]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [4]:
sentences = "The White House medical unit and Secret Service will evaluate attendees before they're admitted. They'll be required to test negative for the virus on the day of the event, complete a health questionnaire and pass a temperature screening. The sites will be cleaned and sanitized before each event. Trump Victory, the president’s joint fundraising committee, will cover the testing costs. The move comes as Trump expresses his desire for the country to reopen even as medical professionals raise concerns about the potential dangers of doing so. The pandemic death toll passed 100,000 this week. Trump is also dead set on holding the GOP’s August national convention in Charlotte, even as North Carolina officials are raising concerns about safety. The Charlotte area has seen an uptick in coronavirus cases in recent days. Trump has also been itching to resume his trademark rallies, his primary method of connecting with supporters and broadcasting his message. The president’s campaign team has used the events to glean data from attendees which is used to turn out his voters. The president has left the confines of the White House over the past few weeks to hold ostensibly official events in swing states like Arizona and Michigan, where polls have shown him trailing presumptive Democratic nominee Joe Biden. The events have sometimes had the feel of a rally, complete with walk-out music. Trump’s advisers want him to be seen as eager to reopen the country while Democrats push for stay-at-home orders that keep the economy shuttered. Earlier this month, the reelection effort released a one-minute advertisement titled “American Comeback” highlighting the president’s desire to reignite the economy. The spot concluded with Trump’s mantra that he will “make America great again.” The president’s previously busy fundraising schedule came to a halt in March. A planned March 12 fundraiser with GOP megadonor Sheldon Adelson was scrapped, as were a pair of events that month to be headlined by first lady Melania Trump. Even with in-person fundraising events slashed, the president has maintained robust fundraising totals thanks to a massive small-donor operation. Trump’s political machine narrowly outraised Biden in April and has a $187 million cash-on-hand lead over Biden and the Democratic Party. Biden has sworn off in-person fundraisers during the pandemic, instead doing online events from his Delaware home. Couples will need to donate $580,600 to attend the Dallas fundraiser. A single attendee to the New Jersey fundraiser will need to give $250,000. The money will go to Trump Victory, a joint fundraising committee of the Trump campaign, Republican National Committee, and state parties."
print(' Original: ', sentences)

 Original:  The White House medical unit and Secret Service will evaluate attendees before they're admitted. They'll be required to test negative for the virus on the day of the event, complete a health questionnaire and pass a temperature screening. The sites will be cleaned and sanitized before each event. Trump Victory, the president’s joint fundraising committee, will cover the testing costs. The move comes as Trump expresses his desire for the country to reopen even as medical professionals raise concerns about the potential dangers of doing so. The pandemic death toll passed 100,000 this week. Trump is also dead set on holding the GOP’s August national convention in Charlotte, even as North Carolina officials are raising concerns about safety. The Charlotte area has seen an uptick in coronavirus cases in recent days. Trump has also been itching to resume his trademark rallies, his primary method of connecting with supporters and broadcasting his message. The president’s campaign 

In [5]:
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences))

Tokenized:  ['the', 'white', 'house', 'medical', 'unit', 'and', 'secret', 'service', 'will', 'evaluate', 'attendees', 'before', 'they', "'", 're', 'admitted', '.', 'they', "'", 'll', 'be', 'required', 'to', 'test', 'negative', 'for', 'the', 'virus', 'on', 'the', 'day', 'of', 'the', 'event', ',', 'complete', 'a', 'health', 'question', '##naire', 'and', 'pass', 'a', 'temperature', 'screening', '.', 'the', 'sites', 'will', 'be', 'cleaned', 'and', 'san', '##iti', '##zed', 'before', 'each', 'event', '.', 'trump', 'victory', ',', 'the', 'president', '’', 's', 'joint', 'fundraising', 'committee', ',', 'will', 'cover', 'the', 'testing', 'costs', '.', 'the', 'move', 'comes', 'as', 'trump', 'expresses', 'his', 'desire', 'for', 'the', 'country', 'to', 're', '##open', 'even', 'as', 'medical', 'professionals', 'raise', 'concerns', 'about', 'the', 'potential', 'dangers', 'of', 'doing', 'so', '.', 'the', 'pan', '##de', '##mic', 'death', 'toll', 'passed', '100', ',', '000', 'this', 'week', '.', 'trump

In [None]:
re.findall('[\w]+', sentences.lower())

In [3]:
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  Hello. This is just a test sentence. Im not sure why I am still typing text into this area. Crime and humanity. Trying to stop.
Tokenized:  ['hello', '.', 'this', 'is', 'just', 'a', 'test', 'sentence', '.', 'im', 'not', 'sure', 'why', 'i', 'am', 'still', 'typing', 'text', 'into', 'this', 'area', '.', 'crime', 'and', 'humanity', '.', 'trying', 'to', 'stop', '.']
Token IDs:  [7592, 1012, 2023, 2003, 2074, 1037, 3231, 6251, 1012, 10047, 2025, 2469, 2339, 1045, 2572, 2145, 22868, 3793, 2046, 2023, 2181, 1012, 4126, 1998, 8438, 1012, 2667, 2000, 2644, 1012]


In [12]:
train_data = Articles("../data/final-data/debugdata/train_basic.json")
val_data = Articles("../data/final-data/debugdata/eval_basic.json")

In [13]:
for idx in range(len(train_data)):
    train_data.examples[idx]['text'] = tokenizer.tokenize(train_data.examples[idx]['text'])
    if len(train_data.examples[idx]['text']) > 512:
        train_data.examples[idx]['text'] = train_data.examples[idx]['text'][:512]
    train_data.examples[idx]['text'] = tokenizer.encode(
                        train_data.examples[idx]['text'],           
                        add_special_tokens = True, 
                        max_length = 512)
    train_data.examples[idx]['model_publication'] = 1 if train_data.examples[idx]['model_publication'] == 'target' else 0

In [10]:
#Create batches with positive samples in first half and negative examples in second half
class BatchSamplerWithNegativeSamples(torch.utils.data.Sampler):
    def __init__(self, pos_sampler, neg_sampler, batch_size, items):
        self._pos_sampler = pos_sampler
        self._neg_sampler = neg_sampler
        self._items = items
        assert batch_size % 2 == 0, 'Batch size must be divisible by two for negative samples.'
        self._batch_size = batch_size
        
    def __iter__(self):
        batch, neg_batch = [], []
        neg_sampler = iter(self._neg_sampler)
        for pos_idx in self._pos_sampler:
            batch.append(pos_idx)
            neg_idx = pos_idx
            # keep sampling until we get a true negative sample
            while self._items[neg_idx] == self._items[pos_idx]:
                try:
                    neg_idx = next(neg_sampler)
                except StopIteration:
                    neg_sampler = iter(self._neg_sampler)
                    neg_idx = next(neg_sampler)
            neg_batch.append(neg_idx)
            if len(batch) == self._batch_size // 2:
                batch.extend(neg_batch)
                yield batch
                batch, neg_batch = [], []
        return

    def __len__(self):
        return len(self._pos_sampler) // self._batch_size

In [11]:
#define function to return necessary data for dataloader to pass into model
def collate_fn(examples):
    words = []
    articles = []
    labels = []
    publications = []
    for example in examples:
        words.append(example['text'])
        articles.append(example['url'])
        labels.append(example['model_publication'])
        publications.append(example['publication'])
    num_words = [len(x) for x in words]
    words = np.concatenate(words, axis=0)
    word_attributes = torch.tensor(words, dtype=torch.long)
    articles = torch.tensor(articles, dtype=torch.long)
    num_words.insert(0,0)
    num_words.pop(-1)
    attribute_offsets = torch.tensor(np.cumsum(num_words), dtype=torch.long)
    publications = torch.tensor(publications, dtype=torch.long)
    real_labels = torch.tensor(labels, dtype=torch.long)
    return publications, articles, word_attributes, attribute_offsets, real_labels

In [15]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

100%|█████████████████████████████████████████████████████████████████████████████| 433/433 [00:00<00:00, 290581.38B/s]
100%|████████████████████████████████████████████████████████████████| 440473133/440473133 [01:37<00:00, 4499883.22B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element