In [139]:
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

%matplotlib inline
sns.set(color_codes=True)

In [140]:
# Build a two-label BERT sequence classifier from the base uncased checkpoint;
# attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Overwrite the freshly initialised weights with the ones stored on disk.
# map_location keeps every tensor on the CPU regardless of where it was saved.
checkpoint_state = torch.load("./Models/BERT_ft_epoch10.model",
                              map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [141]:
# Read the tweet data from the CSV file next to the notebook into a DataFrame.
df = pd.read_csv('./data_april_1.csv')

In [142]:
# Keep only the 'tweet' column (still as a one-column DataFrame).
df = df.loc[:, ['tweet']]

In [143]:
# Draw a fixed-size random subset of tweets to score.
# - random_state makes the draw reproducible across kernel restarts.
# - min(...) avoids the ValueError pandas raises when asked for more rows
#   than the frame holds (sampling without replacement).
df = df.sample(n=min(500, len(df)), random_state=42)

In [144]:
# Pre-compiled patterns used by process_text.
# A hashtag / mention is matched either at the very start of the string or
# together with the single whitespace character that precedes it.
hashtags = re.compile(r"^#\S+|\s#\S+")
mentions = re.compile(r"^@\S+|\s@\S+")
urls = re.compile(r"https?://\S+")

def process_text(text):
    """Normalise a raw tweet string.

    Steps, in order:
      1. remove http/https URLs,
      2. replace every hashtag with the placeholder word ``hashtag``,
      3. replace every @-mention with the placeholder word ``entity``,
      4. strip surrounding whitespace and lower-case the result.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned, lower-cased tweet.
    """
    # Use the compiled `urls` pattern: it only matches real http(s):// links,
    # so ordinary words that merely start with "http" are left intact.
    text = urls.sub('', text)
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    return text.strip().lower()

In [145]:
# Run the text normaliser over every tweet, replacing the column in place.
df['tweet'] = df['tweet'].map(process_text)

In [146]:
# Cleaned tweets as a plain array.
text = df['tweet'].values

In [147]:
# WordPiece tokenizer for the base uncased checkpoint; input is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [148]:
# Tokenize all tweets in one call and return PyTorch tensors.
# - .tolist() hands the tokenizer a plain list of strings rather than a
#   numpy object array.
# - padding='longest' pads every sequence to the longest one in the batch.
# - truncation=True cuts any sequence that exceeds the tokenizer's maximum
#   input length, so an unusually long text cannot overflow the model's
#   512 position embeddings; shorter inputs are unaffected.
encoded_data = tokenizer.batch_encode_plus(df.tweet.values.tolist(),
                                            add_special_tokens = True,
                                            return_attention_mask = True,
                                            padding = 'longest',
                                            truncation = True,
                                            return_tensors = 'pt')

In [149]:
# Pull the token ids and attention masks out of the tokenizer output.
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# Pair each id row with its mask row so they are batched together.
dataset = TensorDataset(input_ids,
                        attention_masks)

# Single source of truth for the batch size: the DataLoader reads this
# variable rather than repeating the literal, so changing it takes effect.
batch_size = 32
dataloader = DataLoader(dataset,
                        batch_size = batch_size)

In [152]:
# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters onto the chosen device.
model.to(device)

print(device)

cpu


In [153]:
# Inference only: put the model in evaluation mode.
model.eval()

loss_val_total = 0
predictions = []

for batch in tqdm(dataloader):
    # Each batch is (input_ids, attention_mask); move both to the device.
    ids, mask = (tensor.to(device) for tensor in batch)

    # Forward pass without building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=ids, attention_mask=mask)

    # First element of the model output holds the logits; bring them back
    # to host memory as a numpy array and collect them per batch.
    logits = outputs[0].detach().cpu().numpy()
    predictions.append(logits)

# Stack the per-batch logits into one array and take the
# highest-scoring class index for every row.
predictions = np.concatenate(predictions, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()

100%|██████████| 16/16 [01:06<00:00,  4.16s/it]


In [155]:
# Attach the predicted class index to each sampled tweet (row order matches
# the order in which the tweets were encoded).
df = df.assign(pred_flat=pred_flat)