In [22]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
pip install highlight_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
import tensorflow as tf
import torch
import numpy as np
import pandas as pd
import time
import datetime
import random
import sentencepiece
from transformers import XLNetTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification,XLNetConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [26]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [27]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: A100-SXM4-40GB


In [28]:
def colorize(words, color_array):
    cmap=matplotlib.cm.Blues
    template = '{}">{}'
    colored_string = ''
    length = 0
    line_len = 50
    for word, color in zip(words, color_array):
        if (length + len(word)) // line_len  - length // line_len == 1:
            word += '\n'
        length += len(word)
        color *= 0.8
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')

    #display(HTML(colored_string))
    return colored_string

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
output_dir = "./drive/MyDrive/capstone/XLNet_Trained_Model/model_save/"
config = XLNetConfig.from_pretrained(output_dir, output_attentions=True)
model =  XLNetForSequenceClassification.from_pretrained(output_dir, config=config)
tokenizer = XLNetTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)
model.eval()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [31]:
data = pd.read_csv('drive/MyDrive/capstone/Test_Data.csv')
df = data.loc[:,['cleaned_text','peace']]
df.rename(columns = {'peace':'label','cleaned_text':'text'}, inplace = True)
df.text = df.text.astype(str)

In [32]:
def get_attention_score(text):
    encodings = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    with torch.no_grad():
      output = model(input_ids=input_ids, attention_mask=attention_mask)[2]

    att_sum = output[0].cpu().reshape(12, len(input_ids[0]), len(input_ids[0])).sum(axis=[0, 1]).numpy()
    # ignore [CLS] and [SEP] in bert tokens
    return att_sum[1: -1] / max(att_sum[1: -1], default = 0), input_ids.cpu()[0][1: -1]

In [33]:
from tqdm import tqdm
VOCAB_SIZE = tokenizer.vocab_size

def get_global_attr_score_sum(list_of_texts):
    attr_score_sum_vec = np.zeros(VOCAB_SIZE)
    word_freq_vec = np.zeros(VOCAB_SIZE)

    for text in tqdm(list_of_texts):
        attr_score, input_ids = get_attention_score(text)
        word_freq_vec[input_ids.numpy()] += 1
        attr_score_sum_vec[input_ids.numpy()] += attr_score

    return attr_score_sum_vec, word_freq_vec


def get_top_words(word_weight_vec, n):
    top_n_indices = np.argsort(word_weight_vec)[::-1][:n]
    top_weight_vec = word_weight_vec[top_n_indices]
    return tokenizer.convert_ids_to_tokens(top_n_indices), top_weight_vec


In [34]:
pos_data = df[df['label'] == 1].reset_index(drop=True)
neg_data = df[df['label'] == 0].reset_index(drop=True)

In [35]:
pos_attr_score_sum_vec, pos_word_freq_vec = get_global_attr_score_sum(pos_data['text'])
pos_attr_score_avg_vec = pos_attr_score_sum_vec / (pos_word_freq_vec + 200)
pos_top_words, pos_top_weights = get_top_words(pos_attr_score_avg_vec, 100)
pos_top_words, pos_top_weights

  0%|          | 0/61688 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 61688/61688 [20:28<00:00, 50.22it/s]


(['british',
  'european',
  '▁meanwhile',
  '▁accuse',
  '▁allege',
  '▁example',
  '▁spokesman',
  'canadian',
  '▁headline',
  '▁amid',
  '▁advertise',
  '▁ago',
  '▁involve',
  '▁announce',
  '▁injure',
  '▁pandemic',
  '▁include',
  '▁despite',
  '▁election',
  'chinese',
  '<sep>',
  '▁largest',
  'american',
  '▁statement',
  '▁investigate',
  '▁politician',
  '▁arrive',
  '▁celebrate',
  '▁country',
  '▁decade',
  '▁describe',
  '▁nation',
  '▁million',
  '▁warn',
  '▁determine',
  '▁allow',
  '▁vote',
  '▁parliament',
  '▁incident',
  '▁require',
  '▁admit',
  '▁allegedly',
  '▁scene',
  'african',
  '▁allegation',
  '▁arrest',
  '▁week',
  '▁investigation',
  '▁confirm',
  '▁resign',
  '▁interview',
  '▁french',
  '▁yesterday',
  '▁latest',
  '▁billion',
  '▁former',
  '▁biggest',
  '▁authority',
  '▁decide',
  '▁journalist',
  '▁ensure',
  '▁argue',
  '▁reveal',
  '▁favourite',
  '▁however',
  '▁disappoint',
  '▁agree',
  '▁dozen',
  '▁begin',
  '▁minister',
  '▁cite',
  '▁e

In [36]:
neg_attr_score_sum_vec, neg_word_freq_vec = get_global_attr_score_sum(neg_data['text'])

100%|██████████| 62025/62025 [20:38<00:00, 50.10it/s]


In [37]:
neg_attr_score_avg_vec = neg_attr_score_sum_vec / (neg_word_freq_vec + 200)
neg_top_words, neg_top_weights = get_top_words(neg_attr_score_avg_vec, 100)
neg_top_words, neg_top_weights

(['▁allege',
  '▁accuse',
  'according',
  'african',
  '▁meanwhile',
  '▁remark',
  '▁announce',
  '▁appoint',
  '▁spokesperson',
  '▁example',
  'european',
  '▁allegedly',
  '▁involve',
  '▁indicate',
  '▁cooperation',
  '▁spokesman',
  '▁ago',
  '▁include',
  '▁statement',
  '▁conclude',
  '▁election',
  '▁ceremony',
  '▁disclose',
  '▁allegation',
  '▁violation',
  '▁pandemic',
  '▁despite',
  '▁inform',
  '▁constitution',
  '▁respectively',
  '▁ensure',
  '<sep>',
  '▁facilitate',
  '▁country',
  '▁promote',
  '▁sanction',
  '▁chairman',
  'british',
  '▁reiterate',
  '▁journalist',
  '▁defeat',
  '▁arrest',
  '▁nation',
  '▁require',
  '▁regard',
  '▁addition',
  '▁describe',
  '▁achieve',
  '▁commence',
  '▁implement',
  '▁injure',
  '▁implementation',
  '▁declare',
  '▁politician',
  '▁arrive',
  '▁celebrate',
  '▁behalf',
  '▁instance',
  '▁interview',
  '▁institution',
  '▁million',
  '▁yesterday',
  '▁allow',
  '▁comprise',
  '▁secretary',
  '▁quote',
  '▁tourism',
  '▁dete

In [38]:
sentences = df['text'].values
labels = df['label'].values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [39]:

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

    DONE.


In [40]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [41]:
from sklearn.metrics import confusion_matrix

(tn, fp), (fn, tp) = confusion_matrix(flat_predictions, flat_true_labels)
print('tn:', tn, 'fp:', fp, 'fn:', fn, 'tp:', tp)
print(confusion_matrix(flat_predictions, flat_true_labels))

tn: 56016 fp: 3827 fn: 6009 tp: 57861
[[56016  3827]
 [ 6009 57861]]
