In [1]:
#import necessary libraries
import torch
from transformers import *
import pandas as pd
import re
import collections
import numpy as np
import json
import time
from tqdm.notebook import tqdm
import torch.nn as nn
import pathlib

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


'cuda'

In [2]:
# Define the Articles dataset class for easy iteration, sampling, and sampler-weight creation
class Articles(torch.utils.data.Dataset):
    """In-memory dataset of article records loaded from a JSON file.

    Every example is a dict. The methods of this class read and write the
    keys ``'text'``, ``'url'`` and ``'model_publication'``.
    """

    def __init__(self, json_file):
        """Load the examples stored in ``json_file``.

        Args:
            json_file: Path of a JSON file; its decoded content becomes
                ``self.examples``.
        """
        super().__init__()
        with open(json_file, "r") as data_file:
            self.examples = json.load(data_file)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        return self.examples[idx]

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.examples)

    def tokenize(self):
        """Replace each example's ``'text'`` string by a list of lowercase tokens.

        The text is lowercased and split into runs of regex word characters;
        everything else (punctuation, whitespace) is discarded. Mutates
        ``self.examples`` in place.
        """
        for example in self.examples:
            # Raw string: keeps the backslash for the regex engine and avoids
            # the invalid-escape-sequence warning of a plain string literal.
            example['text'] = re.findall(r'[\w]+', example['text'].lower())

    def _create_sampler(self, keep_example):
        """Build a sampler that only draws examples accepted by ``keep_example``.

        Args:
            keep_example: Callable taking one example dict and returning a
                truthy value when that example may be drawn.

        Returns:
            A ``WeightedRandomSampler`` over all indices with weight 1 for
            accepted examples and 0 for the rest, drawing ``len(self)``
            samples with replacement.
        """
        prob = np.zeros(len(self))
        for idx, example in enumerate(self.examples):
            if keep_example(example):
                prob[idx] = 1
        return torch.utils.data.WeightedRandomSampler(weights=prob, num_samples=len(self), replacement=True)

    def create_positive_sampler(self, target_publication):
        """Return a sampler over examples whose ``'model_publication'`` equals ``target_publication``."""
        return self._create_sampler(
            lambda example: example['model_publication'] == target_publication
        )

    def create_negative_sampler(self, target_publication):
        """Return a sampler over examples whose ``'model_publication'`` differs from ``target_publication``."""
        return self._create_sampler(
            lambda example: example['model_publication'] != target_publication
        )

    def map_items(self, word_to_id, url_to_id, publication_to_id, filter=False, min_length=0):
        """Convert tokens, urls and publications of every example to ids, in place.

        Args:
            word_to_id: Mapping from token to id. Tokens missing from the
                mapping are dropped from ``'text'``.
            url_to_id: Mapping from url to id. Unknown urls get the id stored
                under the key ``"miscellaneous"`` (``None`` if that key is absent).
            publication_to_id: Mapping from publication to id, with the same
                ``"miscellaneous"`` fallback.
            filter: When true, collect the examples whose mapped text is
                longer than ``min_length``.
            min_length: Exclusive lower bound on the mapped text length used
                when ``filter`` is true.

        Returns:
            The list of collected examples (the same dict objects as in
            ``self.examples``); empty when ``filter`` is false.
        """
        fallback_url = url_to_id.get("miscellaneous")
        fallback_publication = publication_to_id.get("miscellaneous")
        min_length_articles = []
        for example in self.examples:
            # Membership test instead of a sentinel id: a sentinel equal to
            # len(word_to_id) would also discard a real token carrying that id.
            example['text'] = [word_to_id[word] for word in example['text'] if word in word_to_id]
            if filter and len(example['text']) > min_length:
                min_length_articles.append(example)
            example['url'] = url_to_id.get(example['url'], fallback_url)
            example['model_publication'] = publication_to_id.get(example['model_publication'], fallback_publication)
        return min_length_articles


In [4]:
from transformers import BertTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer with lowercasing enabled.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [6]:
# Sample text used by the tokenization experiments below.
sentences = (
    "The White House medical unit and Secret Service will evaluate attendees "
    "before they're admitted. They'll be required to test negative for the "
    "virus on the day of the event, complete a health questionnaire"
)
print("Original: ", sentences)

Original:  The White House medical unit and Secret Service will evaluate attendees before they're admitted. They'll be required to test negative for the virus on the day of the event, complete a health questionnaire


In [7]:
# Print the sample text split into the tokens produced by the BERT tokenizer.
print('Tokenized: ', tokenizer.tokenize(sentences))

Tokenized:  ['the', 'white', 'house', 'medical', 'unit', 'and', 'secret', 'service', 'will', 'evaluate', 'attendees', 'before', 'they', "'", 're', 'admitted', '.', 'they', "'", 'll', 'be', 'required', 'to', 'test', 'negative', 'for', 'the', 'virus', 'on', 'the', 'day', 'of', 'the', 'event', ',', 'complete', 'a', 'health', 'question', '##naire']


In [None]:
# Regex tokenization of the same text for comparison: lowercase, then runs of word characters.
# Raw string so the backslash in \w reaches the regex engine without an invalid-escape warning.
re.findall(r'[\w]+', sentences.lower())

In [3]:
# Print the sample text mapped to token ids.
# Reuses `sentences` instead of repeating the literal so this cell cannot drift from the sample text.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences)))

NameError: name 'tokenizer' is not defined

In [12]:
# Load the train and eval splits stored under the debugdata directory.
train_data = Articles("../data/final-data/debugdata/train_basic.json")
val_data = Articles("../data/final-data/debugdata/eval_basic.json")

In [13]:
# Encoded sequences are capped at MAX_SEQ_LEN ids. encode() adds the special tokens
# ([CLS] and [SEP] for a single BERT sequence), so only MAX_SEQ_LEN - NUM_SPECIAL_TOKENS
# word pieces fit; truncating to that count up front keeps encode() from having to
# cut the sequence a second time.
MAX_SEQ_LEN = 512
NUM_SPECIAL_TOKENS = 2

for example in train_data.examples:
    # Split the raw text into word pieces and keep those that fit the budget.
    tokens = tokenizer.tokenize(example['text'])[:MAX_SEQ_LEN - NUM_SPECIAL_TOKENS]
    # Replace the text by its id sequence, including the special tokens.
    example['text'] = tokenizer.encode(
        tokens,
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN)
    # Binary label: 1 for the 'target' publication, 0 for everything else.
    example['model_publication'] = 1 if example['model_publication'] == 'target' else 0

In [10]:
#Create batches with positive samples in first half and negative examples in second half
class BatchSamplerWithNegativeSamples(torch.utils.data.Sampler):
    """Batch sampler yielding index lists with positives first and negatives second.

    Each batch holds ``batch_size // 2`` indices drawn from ``pos_sampler``
    followed by the same number of indices drawn from ``neg_sampler``. For every
    positive index a negative index is drawn whose entry in ``items`` differs
    from the positive's entry.

    Args:
        pos_sampler: Sized iterable of positive indices.
        neg_sampler: Re-iterable source of candidate negative indices; it is
            restarted whenever it is exhausted.
        batch_size: Total number of indices per batch; must be even.
        items: Indexable collection compared with ``==`` to decide whether a
            candidate negative differs from the current positive.

    Note:
        If ``neg_sampler`` never produces an index whose item differs from the
        current positive's item, iteration does not terminate.
    """

    def __init__(self, pos_sampler, neg_sampler, batch_size, items):
        self._pos_sampler = pos_sampler
        self._neg_sampler = neg_sampler
        self._items = items
        assert batch_size % 2 == 0, 'Batch size must be divisible by two for negative samples.'
        self._batch_size = batch_size

    def __iter__(self):
        """Yield batches of indices; a trailing incomplete batch is dropped."""
        half_batch = self._batch_size // 2
        batch, neg_batch = [], []
        neg_sampler = iter(self._neg_sampler)
        for pos_idx in self._pos_sampler:
            batch.append(pos_idx)
            # Start from the positive itself so the loop below always draws at least once.
            neg_idx = pos_idx
            # keep sampling until we get a true negative sample
            while self._items[neg_idx] == self._items[pos_idx]:
                try:
                    neg_idx = next(neg_sampler)
                except StopIteration:
                    # Negative sampler exhausted: restart it and continue drawing.
                    neg_sampler = iter(self._neg_sampler)
                    neg_idx = next(neg_sampler)
            neg_batch.append(neg_idx)
            if len(batch) == half_batch:
                yield batch + neg_batch
                batch, neg_batch = [], []

    def __len__(self):
        """Return the number of batches ``__iter__`` yields.

        One batch is emitted per ``batch_size // 2`` positive indices, so the
        count is based on half the batch size, not the full batch size.
        """
        return len(self._pos_sampler) // (self._batch_size // 2)

In [11]:
#define function to return necessary data for dataloader to pass into model
def collate_fn(examples):
    """Collate a list of example dicts into the tensors passed to the model.

    Args:
        examples: Sequence of dicts providing the keys ``'text'`` (a sequence
            of ids), ``'url'``, ``'model_publication'`` and ``'publication'``.

    Returns:
        A tuple ``(publications, articles, word_attributes, attribute_offsets,
        real_labels)`` of ``torch.long`` tensors. ``word_attributes`` is the
        concatenation of all ``'text'`` sequences and ``attribute_offsets``
        holds the start position of each example inside it.
    """
    # Read the four fields of each example once, in a fixed order.
    rows = [
        (ex['text'], ex['url'], ex['model_publication'], ex['publication'])
        for ex in examples
    ]
    token_lists = [row[0] for row in rows]
    lengths = [len(tokens) for tokens in token_lists]

    # Flatten every text into one long id vector.
    word_attributes = torch.tensor(np.concatenate(token_lists, axis=0), dtype=torch.long)
    articles = torch.tensor([row[1] for row in rows], dtype=torch.long)

    # Offsets are the running sum of the preceding lengths: 0 for the first
    # example, and the last length is never needed.
    attribute_offsets = torch.tensor(np.cumsum([0] + lengths[:-1]), dtype=torch.long)

    publications = torch.tensor([row[3] for row in rows], dtype=torch.long)
    real_labels = torch.tensor([row[2] for row in rows], dtype=torch.long)
    return publications, articles, word_attributes, attribute_offsets, real_labels

In [2]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
#model = BertForSequenceClassification.from_pretrained("../../Data/Bert/Model")

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 1, # Number of output units of the classification head (one here).
                    # NOTE(review): 2 is the usual value for binary classification; per the
                    # transformers documentation num_labels == 1 selects a regression (MSE)
                    # loss when labels are passed to the model -- confirm 1 is intended.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Store a copy of the freshly loaded model on disk.
model.save_pretrained("../../Data/Bert/Model")

# Move the model to the device selected in the setup cell ("cuda" or "cpu").
# An unconditional .cuda() call fails on machines without a GPU.
model.to(device)

In [17]:
import numpy as np
# Copy the model's word-embedding weights to a NumPy array on the CPU ...
bert_emb = model.bert.embeddings.word_embeddings.weight.data.cpu().numpy()
# ... and write them to disk as tab-separated text.
np.savetxt("/users/rohan/news-classification/data/BERT/u-map/raw_embs.tsv", bert_emb, delimiter='\t')

In [1]:
# import necessary libraries
import pandas as pd
import re
import torch
import collections
import numpy as np
import json
import time
import torch.nn as nn
import os
import argparse
import arguments.train_arguments as arguments
from data_processing.articles import Articles
from models.models import InnerProduct
import data_processing.dictionaries as dictionary
import sampling.sampler_util as sampler_util
import training.eval_util as eval_util
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
from transformers import BertTokenizer


# Load the test and evaluation splits with the Articles class imported from
# data_processing.articles.
test_data = Articles("/users/rohan/news-classification/data/final-data/test.json")
eval_data = Articles("/users/rohan/news-classification/data/final-data/evaluation.json")
print("Data Loaded")

# initialize tokenizer from BERT library
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print("Tokenizer Initialized!")

# Tokenize both splits; the imported Articles.tokenize is called with the tokenizer
# as its argument.
test_data.tokenize(tokenizer)
print("Test Data Tokenized")
eval_data.tokenize(tokenizer)
print("All Data Tokenized!")

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


Data Loaded
Tokenizer Initialized!
0
5000
10000
Test Data Tokenized
0
5000
10000
All Data Tokenized!


In [2]:
# Load the three lookup tables returned by dictionary.load_dictionaries; they are
# bound here as the word, url and publication id dictionaries.
dictionary_dir = Path("/users/rohan/news-classification/data/BERT/dictionaries/")
final_word_ids,final_url_ids, final_publication_ids = dictionary.load_dictionaries(dictionary_dir)

In [None]:
# Run map_items on both splits with filtering disabled (filter=False).
# NOTE(review): the tokenizer is passed as the first argument while final_word_ids is
# not used in this cell -- confirm against data_processing.articles that this is intended.
for dataset in [test_data, eval_data]:
    dataset.map_items(tokenizer,
                      final_url_ids,
                      final_publication_ids,
                      filter=False)

In [None]:
# Spot-check: display the last example of the evaluation split.
eval_data[-1]

In [6]:
# Write the examples of both splits to the mapped-data directory as JSON.
with open("/users/rohan/news-classification/data/BERT/mapped-data/test.json", "w") as file:
        json.dump(test_data.examples, file)
with open("/users/rohan/news-classification/data/BERT/mapped-data/evaluation.json", "w") as file:
        json.dump(eval_data.examples, file)

In [3]:
from tokenizers import BertWordPieceTokenizer

# Build a BertWordPieceTokenizer from a local vocabulary file, with lowercasing enabled.
# NOTE(review): this rebinds the name `tokenizer`, which other cells bind to a
# transformers BertTokenizer.
tokenizer = BertWordPieceTokenizer(
    "/users/rohan/news-classification/data/BERT/bert-base-uncased.txt", lowercase=True
)

# List the attributes and methods the tokenizer object exposes.
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_parameters',
 '_tokenizer',
 'add_special_tokens',
 'add_tokens',
 'decode',
 'decode_batch',
 'enable_padding',
 'enable_truncation',
 'encode',
 'encode_batch',
 'encode_tokenized',
 'encode_tokenized_batch',
 'get_vocab',
 'get_vocab_size',
 'id_to_token',
 'no_padding',
 'no_truncation',
 'normalize',
 'num_special_tokens_to_add',
 'post_process',
 'save',
 'token_to_id',
 'train']

In [5]:
# Encode a sample sentence and display the resulting token ids.
tokenizer.encode("Hello, my dog is cute").ids

[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]

In [6]:
# Decode a list of token ids back into text (round-trip check of the encoding).
tokenizer.decode([101, 7592, 1010, 2026, 3899, 2003, 10140, 102])

'hello, my dog is cute'

In [5]:
import numpy as np

# Column vector holding 0, 1, 2 (three rows, one column).
x = np.arange(3).reshape(3, 1)

In [6]:
# Display the array.
x

array([[0],
       [1],
       [2]])

In [8]:
# Display the array's shape.
x.shape

(3, 1)

In [12]:
# Drop the length-1 axis with np.squeeze and convert the result to a list.
list(np.squeeze(x))

[0, 1, 2]