In [1]:
import torch
import random
import numpy as np

In [2]:
torch.cuda.empty_cache()
#torch.cuda.memory_summary(device=None, abbreviated=False)

In [3]:
from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer

# https://huggingface.co/activebus/BERT-XD_Review
tokenizer = AutoTokenizer.from_pretrained("activebus/BERT-XD_Review", do_lower_case=True)

In [4]:
len(tokenizer.vocab)

30522

In [5]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [6]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [7]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [8]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [11]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [12]:
from torchtext.legacy import data
from torchtext.legacy import datasets


E = data.LabelField(dtype = torch.float, batch_first=True, sequential=False, use_vocab=False) # The Expertise

D = data.LabelField(dtype = torch.float, batch_first=True, sequential=False, use_vocab=False) # The Review Age in Days


TEXT = data.Field(batch_first=True, use_vocab = False, include_lengths = True,
                      tokenize = tokenize_and_cut, preprocessing = tokenizer.convert_tokens_to_ids,
                      init_token = init_token_idx, eos_token = eos_token_idx,
                      pad_token = pad_token_idx, unk_token = unk_token_idx)  # Review Text

LABEL = data.LabelField(dtype = torch.long, batch_first=True, sequential=False, use_vocab=False) # Helpfulness Label

In [13]:
fields = {'expertise': ('e', E), 'review_days': ('d', D), 'review_text': ('text', TEXT), 'helpful_class': ('label', LABEL)}

In [14]:
root_path = './dataset/Experiment/'

In [15]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = root_path,
                                        train = 'train.json',
                                        validation = 'valid.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields
)

In [16]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 145381
Number of validation examples: 8080
Number of testing examples: 8080


In [17]:
print(vars(train_data[1]))

{'e': 0.002317113538563, 'd': 0.9611171960569551, 'text': [4370, 2045, 1037, 2261, 2335, 2023, 100, 2004, 1037, 6976, 10367, 2000, 16420, 1998, 100, 2940, 1045, 2293, 2129, 2673, 2055, 2023, 2173, 2003, 100, 1996, 100, 1996, 6912, 2686, 1998, 1996, 100, 1045, 2034, 2234, 2000, 2131, 2185, 2013, 1996, 9045, 1997, 2586, 100, 1998, 4370, 2005, 2129, 2012, 2188, 2009, 2081, 100, 100, 2057, 5247, 1037, 2843, 1997, 2051, 1999, 1996, 100, 100, 2551, 100, 100, 2096, 2108, 5845, 2066, 16664, 2011, 1996, 8422, 2457, 2136, 100, 100, 14736, 1998, 1037, 3626, 1997, 5541, 1998, 2326, 8048, 3347, 100, 1045, 16755, 2023, 100], 'label': 0}


In [18]:
#print text
print(vars(train_data.examples[2]))

{'e': 0.001458168886172, 'd': 0.6067907995618831, 'text': [1045, 4370, 2182, 1999, 2249, 2307, 3942, 3866, 1996, 100, 3435, 2830, 1020, 2086, 17829, 2081, 1017, 2706, 3805, 1997, 100, 4484, 2068, 1016, 2335, 2007, 2070, 5379, 2111, 2006, 1996, 100, 3198, 2065, 2057, 2071, 2681, 2256, 8641, 2007, 100, 2065, 1996, 2282, 100, 3201, 2007, 1996, 2111, 2006, 1996, 100, 2053, 3291, 1045, 2001, 2409, 2021, 2175, 3805, 1998, 3198, 2068, 2065, 2115, 2282, 2003, 3201, 2065, 2009, 2003, 2027, 2097, 2292, 2017, 4638, 100, 2588, 5508, 1045, 2175, 2000, 1996, 4624, 2025, 8074, 2000, 2022, 7039, 1999, 1998, 1996, 2611, 2758, 2200, 100, 2008, 2009, 2001, 2000, 2220, 2005, 4638, 100, 2029, 1045, 4415, 5319, 1998, 2074, 2359, 2000, 4638, 2026, 8641, 2061, 2057, 2071, 2707, 4356, 100, 2053, 2342, 2005, 100, 2027, 2435, 2033, 1037, 2367, 2561, 2084, 2054, 1045, 2001, 2409, 2006, 1996, 3042, 2061, 1045, 3092, 2039, 7079, 2062, 2005, 1037, 5958, 2305, 2282, 2084, 1045, 2323, 100, 1045, 2001, 2025, 1999, 1996

In [19]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[2])['text'])

print(tokens)

['i', 'stayed', 'here', 'in', '2011', 'great', 'visit', 'loved', 'the', '[UNK]', 'fast', 'forward', '6', 'years', 'reservations', 'made', '3', 'months', 'ahead', 'of', '[UNK]', 'confirmed', 'them', '2', 'times', 'with', 'some', 'friendly', 'people', 'on', 'the', '[UNK]', 'ask', 'if', 'we', 'could', 'leave', 'our', 'bags', 'with', '[UNK]', 'if', 'the', 'room', '[UNK]', 'ready', 'with', 'the', 'people', 'on', 'the', '[UNK]', 'no', 'problem', 'i', 'was', 'told', 'but', 'go', 'ahead', 'and', 'ask', 'them', 'if', 'your', 'room', 'is', 'ready', 'if', 'it', 'is', 'they', 'will', 'let', 'you', 'check', '[UNK]', 'upon', 'arrival', 'i', 'go', 'to', 'the', 'desk', 'not', 'expecting', 'to', 'be', 'checked', 'in', 'and', 'the', 'girl', 'says', 'very', '[UNK]', 'that', 'it', 'was', 'to', 'early', 'for', 'check', '[UNK]', 'which', 'i', 'clearly', 'understood', 'and', 'just', 'wanted', 'to', 'check', 'my', 'bags', 'so', 'we', 'could', 'start', 'sight', '[UNK]', 'no', 'need', 'for', '[UNK]', 'they', 'g

In [20]:
LABEL.build_vocab(train_data)

E.build_vocab(train_data)
D.build_vocab(train_data)

In [21]:
print(LABEL.vocab.stoi)

defaultdict(None, {0: 0, 1: 1, 2: 2, 3: 3, 4: 4})


In [22]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    shuffle=True,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    device = device)

In [23]:
print(device)

cuda


In [24]:
# Print number of batches in each split
print('Created `torchtext_train_dataloader` with %d batches!'%len(train_iterator))
print('Created `torchtext_valid_dataloader` with %d batches!'%len(valid_iterator))
print('Created `torchtext_test_dataloader` with %d batches!'%len(test_iterator))

Created `torchtext_train_dataloader` with 4544 batches!
Created `torchtext_valid_dataloader` with 253 batches!
Created `torchtext_test_dataloader` with 253 batches!


In [25]:
# Create batches - needs to be called before each loop.
test_iterator.create_batches()

# Loop through BucketIterator.
print('PyTorchText BuketIterator\n')
for batch in test_iterator.batches:

  # Let's check batch size.
  print('Batch size: %d\n'% len(batch))
  #print('LABEL\tLENGTH\tTEXT'.ljust(5))

  # Print each example.
  for example in batch:
    print('%s\t%d \t \t %s'.ljust(10) % (example.label, len(example.text), example.text))
  print('\n')

    # Print each example.
  for example in batch:
    print('%d \t %f \t %f '.ljust(10) % (len(example.text), example.e, example.d))
  print('\n')

  # Only look at first batch. Reuse this code in training models.
  break

PyTorchText BuketIterator

Batch size: 32

1	19 	 	 [5723, 100, 2053, 2980, 2300, 1999, 1996, 2851, 2000, 100, 2027, 2342, 2000, 4550, 2488, 1998, 2131, 2047, 100]
0	20 	 	 [3835, 3309, 2007, 2200, 3835, 5379, 14044, 100, 1996, 2069, 2613, 12087, 2001, 4026, 100, 2060, 2084, 2008, 2307, 100]
0	25 	 	 [2307, 2131, 2185, 1010, 2307, 2833, 5379, 1998, 14044, 100, 2307, 3295, 3733, 3229, 2000, 6023, 1998, 7419, 100, 6581, 2097, 2272, 2067, 2293, 100]
1	25 	 	 [2753, 100, 5186, 2502, 100, 100, 1998, 6625, 9705, 2007, 3528, 1997, 100, 100, 2379, 6005, 1998, 4500, 2000, 1996, 4680, 100, 100, 100, 6749]
0	26 	 	 [4283, 1011, 2428, 3835, 2173, 1999, 1037, 2307, 100, 1045, 2074, 4299, 2027, 2018, 18833, 8831, 2000, 3715, 2026, 18059, 2144, 1045, 9471, 1996, 2813, 100]
0	26 	 	 [2200, 3835, 3295, 2485, 2000, 1996, 100, 1999, 2001, 100, 2001, 100, 2003, 100, 2003, 100, 100, 9726, 1998, 5742, 100, 100, 2041, 2001, 100, 100]
0	27 	 	 [2200, 3835, 1998, 4550, 100, 3835, 2715, 1998, 2152, 7216, 100, 1

In [26]:
# https://huggingface.co/activebus/BERT-XD_Review
bert = AutoModel.from_pretrained("activebus/BERT-XD_Review")

print(bert.config.to_dict()['hidden_size'])

Some weights of the model checkpoint at activebus/BERT-XD_Review were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [27]:
import torch.nn as nn

class BERTHelpful(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.mlp_bert = nn.Linear(embedding_dim, hidden_dim)
        
        self.mlp_len = nn.Linear(1, 1)
        
        self.mlp_days = nn.Linear(1, 1)
        
        self.mlp_exp = nn.Linear(1, 1)
        
        self.out = nn.Linear(hidden_dim + 3, output_dim)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths, exp, days):
        
        #text = [batch size, sent len]
        
        # Feed input to BERT
        with torch.no_grad():
            embedded = self.bert(text)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        hidden = embedded[0][:, 0, :]
        
        # https://discuss.huggingface.co/t/what-is-the-purpose-of-the-additional-dense-layer-in-classification-heads/526
        hidden = self.dropout(hidden)
        
        intermediate = self.relu(self.mlp_bert(hidden))
        
        reshaped_days = days.view(len(days), 1)
        weights_days = self.mlp_days(reshaped_days)
        
        reshaped_exp = exp.view(len(exp), 1)
        weights_exp = self.mlp_exp(reshaped_exp)
        
        # https://www.tutorialspoint.com/how-to-perform-element-wise-division-on-tensors-in-pytorch
        reshaped_len = text_lengths.view(len(days), 1)
        norm_length = torch.div(reshaped_len, 512)
        weights_len = self.mlp_len(norm_length)
        
        fusion = torch.cat((weights_days, intermediate), dim = 1)
        
        fusion = torch.cat((weights_exp, fusion), dim = 1)
        
        fusion = torch.cat((weights_len, fusion), dim = 1)
        
        output = self.out(fusion)
        
        #output = [batch size, out dim]
        
        return output

In [28]:
HIDDEN_DIM = 50
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.2

N_EPOCHS = 5

model = BERTHelpful(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         DROPOUT)

In [29]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,520,966 trainable parameters


In [30]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = True

In [31]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,520,966 trainable parameters


In [32]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [33]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [34]:
#architecture
print(model)

BERTHelpful(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


###  Optimizer & Learning Rate Scheduler

In [35]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Create the optimizer
optimizer_test = AdamW(model.parameters(),
                           lr=3e-5,    # Default learning rate
                           eps=1e-8    # Default epsilon value
                          )

# Total number of training steps
total_steps = len(train_iterator) * N_EPOCHS
    
# Set up the learning rate scheduler
scheduler_test = get_linear_schedule_with_warmup(optimizer,
                                                            num_warmup_steps=0, # Default value
                                                            num_training_steps=total_steps)

In [36]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

## Reproduceability

In [37]:
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

set_seed(42)    # Set seed for reproducibility

print("Reproduceability Set!!!!")

Reproduceability Set!!!!


In [38]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    i = 0

    #print(len(iterator))

    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        exp = batch.e
        
        days = batch.d
        
        predictions = model(text, text_lengths, exp, days).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

        i = i + 1

        #print(i)

        if i == len(iterator):
          break
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [39]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    i = 0
    
    model.eval()

    #print(len(iterator))
    
    with torch.no_grad():
    
        for batch in iterator:
            
            text, text_lengths = batch.text
            
            exp = batch.e
        
            days = batch.d
            
            predictions = model(text, text_lengths, exp, days).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            i = i + 1

            #print(i)

            if i == len(iterator):
              break
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [40]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [41]:
save_path = './Saved Models/Experiment#2/'

In [42]:
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_path + 'best-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 3m 41s
	Train Loss: 0.856 | Train Acc: 62.91%
	 Val. Loss: 0.845 |  Val. Acc: 63.07%
Epoch: 02 | Epoch Time: 3m 42s
	Train Loss: 0.832 | Train Acc: 63.45%
	 Val. Loss: 0.817 |  Val. Acc: 63.66%
Epoch: 03 | Epoch Time: 3m 42s
	Train Loss: 0.819 | Train Acc: 64.06%
	 Val. Loss: 0.809 |  Val. Acc: 64.14%
Epoch: 04 | Epoch Time: 3m 43s
	Train Loss: 0.810 | Train Acc: 64.63%
	 Val. Loss: 0.800 |  Val. Acc: 64.81%
Epoch: 05 | Epoch Time: 3m 43s
	Train Loss: 0.805 | Train Acc: 65.10%
	 Val. Loss: 0.798 |  Val. Acc: 65.40%


## Evaluation

In [43]:
true_labels = []
pred_labels = []

def test_categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    
    pred_labels.append(top_pred.squeeze().tolist())
    true_labels.append(y.squeeze().tolist())
    
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [44]:
def test(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    i = 0
    
    model.eval()

    #print(len(iterator))
    
    with torch.no_grad():
    
        for batch in iterator:
            
            text, text_lengths = batch.text
            
            exp = batch.e
        
            days = batch.d
            
            predictions = model(text, text_lengths, exp, days).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = test_categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            i = i + 1

            #print(i)

            if i == len(iterator):
              break
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [45]:
model.load_state_dict(torch.load(save_path + 'best-model.pt'))

test_loss, test_acc = test(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.799 | Test Acc: 65.18%


In [46]:
test_true_labels = [ item for elem in true_labels for item in elem]
test_pred_labels = [ item for elem in pred_labels for item in elem]

#print(test_true_labels)
#print(test_true_labels)

In [47]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

print("MAE")
print(mean_absolute_error(test_true_labels, test_pred_labels))
print("----------------------------------------------------")
print("MSE")
print(mean_squared_error(test_true_labels, test_pred_labels))

MAE
0.39282178217821784
----------------------------------------------------
MSE
0.49084158415841583
