In [1]:
pip install transformers pytorch-pretrained-bert

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 28.5MB/s eta 0:00:01[K     |█▏                              | 20kB 5.9MB/s eta 0:00:01[K     |█▊                              | 30kB 8.3MB/s eta 0:00:01[K     |██▎                             | 40kB 10.6MB/s eta 0:00:01[K     |███                             | 51kB 6.9MB/s eta 0:00:01[K     |███▌                            | 61kB 8.1MB/s eta 0:00:01[K     |████                            | 71kB 9.2MB/s eta 0:00:01[K     |████▋                           | 81kB 10.2MB/s eta 0:00:01[K     |█████▎                          | 92kB 8.1MB/s eta 0:00:01[K     |█████▉                          | 102kB 8.8MB/s eta 0:00:01[K     |██████▍                         | 112kB 8.8MB/s eta 0:00:01[K     |███████                         | 122kB 8.

In [2]:
import torch 

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [3]:
from transformers import BertForSequenceClassification, AdamW
from pytorch_pretrained_bert import BertTokenizer

# Reload the saved model and vocabulary
print('Reloading the saved model and vocabulary...')

TRAINED_LGBT_BERT = './drive/My Drive/Colab Notebooks/Bert models/Bert_lgbt_croatian/'

model = BertForSequenceClassification.from_pretrained(TRAINED_LGBT_BERT)
tokenizer = BertTokenizer.from_pretrained(TRAINED_LGBT_BERT, do_lower_case=False)

# Tell the model to run on the GPU
model.cuda()

optimizer = AdamW(model.parameters(), 
                lr=2e-5, # learning rate, default = 5e-5
                eps=1e-8 # adam_epsilon, default = 1e-8
                )

print('Done...')

Reloading the saved model and vocabulary...
Done...


In [0]:
import numpy as np
import time
import datetime

# Helper function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Helper function for formatting elapsed times
def format_time(elapsed):
  '''
  Takes a time in seconds and returns a string in format hh:mm:ss
  '''
  elapsed_rounded = int(round(elapsed))
  return str(datetime.timedelta(seconds=elapsed_rounded))

In [5]:
import pandas as pd
import gc
import math
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import StepLR

MAX_LENGTH = 499
BATCH_SIZE = 8

# Load the slovenian data
print('Loading the slovenian data...')

CSV_FILE_PATH_SLO = './drive/My Drive/Colab Notebooks/diploma_data/lgbt_homofobija_final.csv'
df_slo = pd.read_csv(CSV_FILE_PATH_SLO, sep=',', header=None, names=['comment', 'label'])

comments = df_slo.comment.values
labels = df_slo.label.values

# Tokenize all of the comments and map the tokens to their word IDs
input_ids=[]
input_labels=[]

for i,comment in enumerate(comments):

  if isinstance(comment, float):
    continue

  tokenized_comment = tokenizer.tokenize(comment)
  tokenized_comment.insert(0, '[CLS]')
  tokenized_comment.append('[SEP]')

  if len(tokenized_comment) > 512:
    continue
  
  comment_ids = tokenizer.convert_tokens_to_ids(tokenized_comment)
  input_ids.append(comment_ids)
  input_labels.append(labels[i])

# Delete the comments and labels so I free memory
del comments
del labels
gc.collect()

print('Size of slovene dataset: {:,}'.format(len(input_ids)))

print('Padding/truncating all the sentences to %d values...' % MAX_LENGTH)
input_ids = pad_sequences(input_ids, maxlen=MAX_LENGTH, dtype='long', value=0, truncating='post', padding='post')

attention_masks= []
for cmnt in input_ids:
  att_mask = [int(token_id > 0) for token_id in cmnt]
  attention_masks.append(att_mask)

# size of training set is 80%, size of validation set is 20%
# random_state makes sure that the splitting is always the same
# split the validation set to actual validation set and testing set which is 10% of original size
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, input_labels, random_state=420, test_size=0.2) 
validation_inputs, test_inputs, validation_labels, test_labels = train_test_split(validation_inputs, validation_labels, random_state=420, test_size=0.5) # 0.5 * 0.2 = 0.1 -> 10% size of testing set

train_masks, validation_masks, _, validation_masks_labels = train_test_split(attention_masks, input_labels, random_state=420, test_size=0.2)
validation_masks, test_masks, _, _ = train_test_split(validation_masks, validation_masks_labels, random_state=420, test_size=0.5)

train_data = TensorDataset(torch.tensor(train_inputs), torch.tensor(train_masks), torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_masks), torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

test_data = TensorDataset(torch.tensor(test_inputs), torch.tensor(test_masks), torch.tensor(test_labels))
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

# Create the learning rate scheduler
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

Using TensorFlow backend.


Loading the slovenian data...
Size of slovene dataset: 4,454
Padding/truncating all the sentences to 499 values...


In [6]:
###################
#     Testing     #
###################
# Measure the performance on the slovenian testing set.

testing_accuracy_values = []

print('Testing on slovenian data without additional training...')

start_time = time.time()

# Put the model in evaluation mode - the dropout layers behave differently during evaluation.
model.eval()

# Tracking variables
test_accuracy = 0
num_of_batches = 0

print('Number of testing comments: {:,}'.format(len(test_inputs)))
# Evaluate data for one epoch.
for batch in test_dataloader:

  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from out dataloader
  batch_input_ids, batch_attention_mask, batch_labels = batch

  # Telling the model not to compute or store gradients, saves memory and speeds up validation
  with torch.no_grad():
      # Forward pass
      outputs = model(batch_input_ids, token_type_ids=None,
                      attention_mask=batch_attention_mask)
  
  # Get the "logits" output by the model, "logits" are the output values prior to applying an activation function
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = batch_labels.to('cpu').numpy()

  # Calculate the accuracy for this batch of test sentences
  acc = flat_accuracy(logits, label_ids)

  # Accumulate the total accuracy
  test_accuracy += acc

  # Track the number of batches
  num_of_batches += 1

# Report the final accuracy for this testing run
accuracy = test_accuracy / num_of_batches
testing_accuracy_values.append(accuracy)
print('Accuracy: {0:.3f}'.format(accuracy))
print('Testing took: {:}'.format(format_time(time.time() - start_time)))
print('Testing done...')

Testing on slovenian data without additional training...
Number of testing comments: 446
Accuracy: 0.667
Testing took: 0:00:08
Testing done...
