In [108]:
pip install pytorch-pretrained-bert



In [0]:
from pytorch_pretrained_bert import BertTokenizer

# Directory the first tokenizer is loaded from (a `crosloengual-bert-pytorch` folder).
PATH_TO_BERT_FROM_LAB = './drive/My Drive/Colab Notebooks/Bert models/crosloengual-bert-pytorch/'

# Both tokenizers keep the original casing of the text.
cased_options = dict(do_lower_case=False)

# Tokenizer built from the local model directory.
tokenizer = BertTokenizer.from_pretrained(PATH_TO_BERT_FROM_LAB, **cased_options)

# Tokenizer of the pretrained multilingual cased BERT, used for comparison.
multi_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', **cased_options)


In [140]:
# Read the labelled comments from disk into a DataFrame
print('Loading the training data...')

import pandas as pd

CSV_FILE_PATH = './drive/My Drive/Colab Notebooks/diploma_data/lgbt_homofobija_hr_final.csv'

# header=None: no row of the file is used as a header; the column names are
# supplied explicitly through `names`.
df = pd.read_csv(
    CSV_FILE_PATH,
    sep=',',
    header=None,
    names=['comment', 'label'],
)

# Report the number of rows, then display five randomly drawn rows.
print('Number of comments: {:,}\n'.format(len(df)))
df.sample(5)

Loading the training data...
Number of comments: 5,787



Unnamed: 0,comment,label
3217,"Lgbt nabijaju komplekse primitivcima haha, sam...",0
3350,E moj bandicu usral si se,0
371,Crkvo jer spavas pogledaj ovo !,1
3476,Sramotu pokazite,0
3588,"Maa povorci idiotizma,a ne ponosa..sramota! Ka...",0


In [0]:
comments = df.comment.values
labels = df.label.values

# Upper bound on the length of one encoded comment, counting the two special
# tokens '[CLS]' and '[SEP]'.
MAX_BERT_TOKENS = 512


def encode_comment(bert_tokenizer, text, max_tokens=MAX_BERT_TOKENS):
  """Convert one comment into a list of token IDs framed by '[CLS]' and '[SEP]'.

  The word pieces are cut to `max_tokens - 2` *before* the special tokens are
  added. Cutting the finished sequence instead (as a plain `[:max_tokens]`
  slice would) silently removes the closing '[SEP]' from every over-long
  comment.

  Args:
    bert_tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_ids`.
    text: the raw comment string.
    max_tokens: maximum length of the returned list, special tokens included.

  Returns:
    A list of at most `max_tokens` IDs that starts with the ID of '[CLS]' and
    ends with the ID of '[SEP]'.
  """
  word_pieces = bert_tokenizer.tokenize(text)[:max_tokens - 2]
  return bert_tokenizer.convert_tokens_to_ids(['[CLS]'] + word_pieces + ['[SEP]'])


# Encoded comments and their labels, once per tokenizer. The label lists are
# built alongside the ID lists so that skipped rows stay aligned.
input_ids = []
input_labels = []
multi_input_ids = []
multi_input_labels = []

for comment, label in zip(comments, labels):
  # A float in the comment column is skipped
  # (presumably NaN from an empty CSV cell -- there is no text to tokenize).
  if isinstance(comment, float):
    continue

  input_ids.append(encode_comment(tokenizer, comment))
  input_labels.append(label)

  multi_input_ids.append(encode_comment(multi_tokenizer, comment))
  multi_input_labels.append(label)

In [142]:
from keras.preprocessing.sequence import pad_sequences

MAX_LENGTH = 499 # Maximum sentence length in the Slovenian and English datasets

print('Padding/truncating all the sentences to %d values...' % MAX_LENGTH)

# Both ID lists get the same treatment: sequences shorter than MAX_LENGTH are
# filled with 0 at the end, longer ones are cut at the end.
padding_options = dict(
    maxlen=MAX_LENGTH,
    dtype='long',
    value=0,
    truncating='post',
    padding='post',
)

input_ids = pad_sequences(input_ids, **padding_options)
multi_input_ids = pad_sequences(multi_input_ids, **padding_options)

Padding/truncating all the sentences to 499 values...


In [0]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import StepLR

BATCH_SIZE = 8 # Can not be larger (16, 32), because the GPU does not have enough memory

# One mask per comment: 1 where the token ID is positive, 0 where it is the padding value 0.
attention_masks = [[int(token_id > 0) for token_id in cmnt] for cmnt in input_ids]


def split_80_10_10(ids, ids_labels, seed=420):
  """Split into 80% training, 10% validation and 10% test data.

  First 20% of the data is held out, then that part is halved
  (0.5 * 0.2 = 0.1). The fixed `seed` makes both splits always the same.

  Returns:
    (train_x, val_x, test_x, train_y, val_y, test_y)
  """
  train_x, held_x, train_y, held_y = train_test_split(ids, ids_labels, random_state=seed, test_size=0.2)
  val_x, test_x, val_y, test_y = train_test_split(held_x, held_y, random_state=seed, test_size=0.5)
  return train_x, val_x, test_x, train_y, val_y, test_y


# Split of the data encoded with `tokenizer`
(train_inputs, validation_inputs, test_inputs,
 train_labels, validation_labels, test_labels) = split_80_10_10(input_ids, input_labels)

# Split of the data encoded with `multi_tokenizer`
(multi_train_inputs, multi_validation_inputs, multi_test_inputs,
 multi_train_labels, multi_validation_labels, multi_test_labels) = split_80_10_10(multi_input_ids, multi_input_labels)


In [144]:
from sklearn.dummy import DummyClassifier
import numpy as np

# Baselines that ignore the input features: 'uniform' predicts classes at
# random, 'stratified' predicts according to the class distribution seen in training.
baseline_train_x, baseline_train_y = np.array(train_inputs), np.array(train_labels)
baseline_test_x, baseline_test_y = np.array(test_inputs), np.array(test_labels)

# `fit` returns the estimator itself, so construction and fitting are chained.
dummy_classifier_uniform = DummyClassifier(strategy='uniform', random_state=420).fit(baseline_train_x, baseline_train_y)
dummy_classifier_stratified = DummyClassifier(strategy='stratified', random_state=420).fit(baseline_train_x, baseline_train_y)

uniform_accuracy = dummy_classifier_uniform.score(baseline_test_x, baseline_test_y)
stratified_accuracy = dummy_classifier_stratified.score(baseline_test_x, baseline_test_y)

print('Accuracy on uniform dummy classifier: {:.3f}'.format(uniform_accuracy))
print('Accuracy on stratified dummy classifier: {:.3f}'.format(stratified_accuracy))


Accuracy on uniform dummy classifier: 0.517
Accuracy on stratified dummy classifier: 0.541


In [145]:
# Same two feature-blind baselines, evaluated on the split that was encoded
# with the multilingual tokenizer.
multi_baseline_train_x, multi_baseline_train_y = np.array(multi_train_inputs), np.array(multi_train_labels)
multi_baseline_test_x, multi_baseline_test_y = np.array(multi_test_inputs), np.array(multi_test_labels)

# `fit` returns the estimator itself, so construction and fitting are chained.
multi_dummy_classifier_uniform = DummyClassifier(strategy='uniform', random_state=420).fit(multi_baseline_train_x, multi_baseline_train_y)
multi_dummy_classifier_stratified = DummyClassifier(strategy='stratified', random_state=420).fit(multi_baseline_train_x, multi_baseline_train_y)

multi_uniform_accuracy = multi_dummy_classifier_uniform.score(multi_baseline_test_x, multi_baseline_test_y)
multi_stratified_accuracy = multi_dummy_classifier_stratified.score(multi_baseline_test_x, multi_baseline_test_y)

print('Accuracy on uniform dummy classifier: {:.3f}'.format(multi_uniform_accuracy))
print('Accuracy on stratified dummy classifier: {:.3f}'.format(multi_stratified_accuracy))


Accuracy on uniform dummy classifier: 0.517
Accuracy on stratified dummy classifier: 0.541
