In [4]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Work from the project root so the relative paths used by later cells resolve.
# The location can be overridden with the BCB_PROJECT_DIR environment variable,
# which lets the notebook run on other machines; when the variable is unset the
# original hard-coded path is used, so existing behavior is unchanged.
os.chdir(
    os.environ.get(
        "BCB_PROJECT_DIR",
        "C:\\Users\\Thais\\Documents\\Python\\bcb-sentiment-analysis",
    )
)

In [2]:
# Pick the compute device: prefer CUDA when torch reports a usable GPU,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Load the pre-trained BERT vocabulary (lower-casing tokenizer) ...
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
# ... and the matching encoder with a fresh 3-way classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
# Move the weights to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
# load data
# The path is relative to the current working directory and is assembled with
# os.path.join so it is not tied to Windows path separators.
data_path = os.path.join('data', 'sentiment_data', 'Dataset_BC_raw.xlsx')
data = pd.read_excel(data_path)

# Raw text and its sentiment label, as NumPy arrays aligned row by row.
sentences = data['text'].values
labels = data['label'].values

In [50]:
# Tokenize & encode: turn every sentence into its list of BERT token ids,
# with the special tokens added by the tokenizer.
input_ids = list(
    map(lambda text: tokenizer.encode(text, add_special_tokens=True), sentences)
)

In [55]:
# set maximum sequence length
max_len = 128

# Pad and truncate every sequence to exactly `max_len` token ids.
# This is done directly with NumPy instead of importing TensorFlow only for
# `pad_sequences`, which keeps a second deep-learning framework out of memory.
# The array is explicitly int64 so its width does not depend on the platform.
padded_ids = np.full(
    (len(input_ids), max_len), tokenizer.pad_token_id, dtype=np.int64
)
for row, ids in enumerate(input_ids):
    ids = list(ids)
    if len(ids) > max_len:
        # Plain post-truncation would cut off the closing [SEP] token that
        # the tokenizer appended; keep it as the last position instead.
        ids = ids[:max_len - 1] + [tokenizer.sep_token_id]
    # Shorter sequences keep the pad id in the remaining (right-hand) slots.
    padded_ids[row, :len(ids)] = ids
input_ids = padded_ids

In [57]:
# Build one attention mask per sequence: 1 marks a real token, 0 marks a
# position holding the tokenizer's padding id.
attention_masks = [
    [1 if token_id != tokenizer.pad_token_id else 0 for token_id in seq]
    for seq in input_ids
]

In [77]:
# Split data into train and test sets (90% / 10%).
# Token ids, attention masks and labels are split in a single call so the three
# stay row-aligned by construction, rather than relying on a second call with
# the same seed and a throw-away array.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.1
)

In [78]:
def one_hot_encode_labels(label_values):
    """Convert sentiment label strings into 3-element one-hot rows.

    Column order is [positive, negative, other]: "positive" -> [1, 0, 0],
    "negative" -> [0, 1, 0], and every other value -> [0, 0, 1].

    Args:
        label_values: iterable of raw labels.

    Returns:
        A list with one freshly built 3-element list per input label.
    """
    known = {"positive": (1, 0, 0), "negative": (0, 1, 0)}
    fallback = (0, 0, 1)
    # list(...) gives every row its own list object, as the original loops did.
    return [list(known.get(label, fallback)) for label in label_values]


# One-hot targets for the train and test splits (same encoding for both).
# NOTE(review): with num_labels=3 the model's built-in cross-entropy loss
# expects integer class ids of shape (batch,), so these one-hot rows need to be
# reduced (e.g. with argmax) before being passed as `labels` -- confirm.
one_hot_labels_train = one_hot_encode_labels(train_labels)
one_hot_labels_test = one_hot_encode_labels(test_labels)

In [None]:
# #Converting labels into numbers
# label_map = {"positive": 1, "negative": 0, "neutral": 0.5}
# train_labels = [label_map[label] for label in train_labels]
# test_labels = [label_map[label] for label in test_labels]

In [79]:
# Convert all inputs and labels into torch tensors, the required datatype
# for our model.
# Token ids and masks are explicitly cast to int64 (torch.long) so the result
# does not depend on the integer width of the incoming arrays.
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
test_inputs = torch.tensor(test_inputs, dtype=torch.long)

train_labels = torch.tensor(one_hot_labels_train, dtype=torch.long)
test_labels = torch.tensor(one_hot_labels_test, dtype=torch.long)

train_masks = torch.tensor(train_masks, dtype=torch.long)
test_masks = torch.tensor(test_masks, dtype=torch.long)

In [80]:
# create DataLoader objects
batch_size = 32

# The label tensors are passed as-is: wrapping an existing tensor in
# torch.tensor(...) only makes a redundant copy and emits a
# "copy construct from a tensor" UserWarning.
# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation batches are read in their stored order.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# set optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

  This is separate from the ipykernel package so we can avoid doing imports until
  


In [82]:
# fine-tune model
# Each epoch runs one optimisation pass over the training loader, prints the
# mean training loss, then measures accuracy on the test loader.
epochs = 4
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # The model's classification loss expects one class id per example.
        # Labels stored as one-hot rows (batch, num_labels) are reduced to the
        # index of their hot column; 1-D labels pass through unchanged.
        if b_labels.dim() > 1:
            b_labels = b_labels.argmax(dim=1)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels.long())
        # Index rather than tuple-unpack: integer indexing works both for a
        # plain tuple and for a ModelOutput, whereas unpacking a ModelOutput
        # would yield its keys instead of its values.
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print('Average training loss: ', avg_train_loss)

    # ---- evaluation pass ----
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        # Without labels the first output element holds the logits.
        logits = logits[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Reduce one-hot rows to class ids so they compare against predictions.
        if label_ids.ndim > 1:
            label_ids = label_ids.argmax(axis=1)
        # Accumulate this batch's accuracy; averaged over batches below.
        eval_accuracy += (logits.argmax(axis=1) == label_ids).mean()
        nb_eval_steps += 1
    # Guard against an empty test loader before averaging.
    if nb_eval_steps:
        print('Validation accuracy: ', eval_accuracy / nb_eval_steps)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 50331648 bytes.