In [1]:
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification

"""
    Loading the BERT base model onto the gpu before any of the preprocessing steps to avoid out of memory error on
    cheaha
"""
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle as pk
import gzip

Using TensorFlow backend.


In [3]:
"""
    Store the device. We transfer our data on this ydevice during training
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
"""
    Open the pickled Amazon reviews file
"""
f =gzip.open('data/final/Amazon.pkl', 'rb')
reviews = pk.load(f)
reviews = reviews.dropna()
reviews.shape

In [None]:
"""
    Read Yelp Data
"""
yelp = pd.read_csv('data/final/yelp.csv')
yelp = yelp[['text', 'stars']]
yelp.loc[:, 'stars'] = yelp.stars.apply(lambda x : 1 if x == 4 or x == 5 else 0)
yelp = yelp.dropna()
yelp.shape

In [None]:
"""
    Read IMDB data
"""
imdb = pd.read_csv('data/final/imdb.csv')
imdb = imdb[['review', 'sentiment']]
imdb = imdb.rename(columns={'review':'text', 'sentiment':'stars'})
imdb.loc[:, 'stars'] = imdb.stars.apply(lambda x : 1 if x == "positive" else 0)
imdb = imdb.dropna()
imdb = imdb.replace(r'\<br /><br />',' ', regex=True) #Remove line breaks
imdb.shape

In [30]:
"""
    Separate our reviews and ratings. Using only a 100 reviews for initial reviews
    Convert ratings into a binary class. 1-3 stars as negative (0) and 4-5 as positive (1)
"""

text = reviews.reviewText.values[:100000] # Using 100k for faster training, can be increased
labels = reviews.overall.apply(lambda x : 1 if x == 4 or x == 5 else 0 ).values[:100000]
text = ["[CLS] " + sent + " [SEP]" for sent in text]

"""
    Format the review text into the format required for BERT
    by adding [CLS] and [SEP] tokens 
"""
yelp_text = yelp.text.values
yelp_labels = yelp.stars.values
yelp_text =  ["[CLS] " + sent + " [SEP]" for sent in yelp_text]

imdb_text = imdb.text.values
imdb_labels = imdb.stars.values
imdb_text =  ["[CLS] " + sent + " [SEP]" for sent in imdb_text]

In [31]:
"""
    Lower cased bert base tokenizer
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
"""
    Use BERT Tokenizer to tokenize our review text 
"""
tokenized_text_az = [tokenizer.tokenize(sent) for sent in text]
tokenized_text_yelp = [tokenizer.tokenize(sent) for sent in yelp_text]
tokenized_text_imdb = [tokenizer.tokenize(sent) for sent in imdb_text]

In [32]:
"""
    Because BERT expects a sequence length of 512 or less, we are extracting reviews that are less than 512 in length.
    Try reducing the sequence length to 128 if there are memory issues on cheaha
"""
def trim_sentences(tokenized_list, label):
    long_seq = []
    ratings = []
    for idx, tok in enumerate(tokenized_list):
        if len(tok) < 512:
            long_seq.append(tok)
            ratings.append(label[idx])
    return (long_seq, ratings)

long_seq, ratings = trim_sentences(tokenized_text_az, labels)
long_seq_yelp, yelp_ratings = trim_sentences(tokenized_text_yelp, yelp_labels)
long_seq_imdb, imdb_ratings = trim_sentences(tokenized_text_imdb, imdb_labels)


print(f"Amazon reviews length after trim: {len(long_seq)}")
print(f"Amazon rating length after trim: {len(ratings)}")

print(f"Yelp reviews length after trim: {len(long_seq_yelp)}")
print(f"Yelp rating length after trim: {len(yelp_ratings)}")

print(f"IMDB reviews length after trim: {len(long_seq_imdb)}")
print(f"IMDB rating length after trim: {len(imdb_ratings)}")

Amazon reviews length after trim: 93310
Amazon rating length after trim: 93310
Yelp reviews length after trim: 9661
Yelp rating length after trim: 9661
IMDB reviews length after trim: 43569
IMDB rating length after trim: 43569


In [33]:
"""
    BERT expects a input id for each token in our reviews. Using a function from the BERTTokenizer to 
    convert tokens into ids
"""
input_ids = [tokenizer.convert_tokens_to_ids(token) for token in long_seq]
yelp_input_ids = [tokenizer.convert_tokens_to_ids(token) for token in long_seq_yelp]
imdb_input_ids = [tokenizer.convert_tokens_to_ids(token) for token in long_seq_imdb]

In [34]:
"""
    Convert all input_ids into a uniform length of 128. Add padding to tokens less than 128 in length.
    
    Again, the max length here can be higher but using 128 to see if it fixes memory uses around Cheaha
    
"""
input_ids = pad_sequences(input_ids, maxlen=128, dtype="long", truncating="post", padding="post")
yelp_input_ids = pad_sequences(yelp_input_ids, maxlen=128, dtype="long", truncating="post", padding="post")
imdb_input_ids = pad_sequences(imdb_input_ids, maxlen=128, dtype="long", truncating="post", padding="post")

In [35]:
"""
    Setting up attention masks to input into the model
"""
def generate_attention_masks(input_ids):
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks
attention_masks = generate_attention_masks(input_ids)
attention_masks_yelp = generate_attention_masks(yelp_input_ids)
attention_masks_imdb = generate_attention_masks(imdb_input_ids)

In [36]:
"""
    Set train and validation data
"""
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, ratings, 
                                                            random_state=18, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=18, test_size=0.1)

In [37]:
"""
    Convert our data into tensors
"""
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

yelp_validation_inputs = torch.tensor(yelp_input_ids)
yelp_validation_labels = torch.tensor(yelp_ratings)
yelp_validation_masks = torch.tensor(attention_masks_yelp)

imdb_validation_inputs = torch.tensor(imdb_input_ids)
imdb_validation_labels = torch.tensor(imdb_ratings)
imdb_validation_masks = torch.tensor(attention_masks_imdb)

In [38]:
"""
    Recommed batch sizes are 16 or 32 but use a small batch size if there are memory issues on Cheaha
    
"""
batch_size = 32

"""
    Get an iterable dataset for Amazon
"""
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [39]:
"""
    Get an iterable dataset for Yelp
"""
yelp_validation_data = TensorDataset(yelp_validation_inputs, yelp_validation_masks, yelp_validation_labels)
yelp_validation_sampler = SequentialSampler(yelp_validation_data)
yelp_validation_dataloader = DataLoader(yelp_validation_data, sampler=yelp_validation_sampler, batch_size=batch_size)

In [40]:
"""
    Get an iterable dataset for IMDB
"""
imdb_validation_data = TensorDataset(imdb_validation_inputs, imdb_validation_masks, imdb_validation_labels)
imdb_validation_sampler = SequentialSampler(imdb_validation_data)
imdb_validation_dataloader = DataLoader(imdb_validation_data, sampler=imdb_validation_sampler, batch_size=batch_size)


In [41]:
"""
    Get the parameter names from model and setup the parameters to be passed
    to the Adam optimizer in the next step
"""
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [42]:
"""
    Initialize Adam optimizer
"""
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [43]:
"""
    Predict accuracy of our model
"""
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [44]:
"""
   Store our training losss and set the number of epochs. 
   Fine-tuning requires 2 to 4 passes over the entire dataset
"""
train_loss_set = []
epochs = 4

for _ in trange(epochs, desc="Epoch"):
    """
       Start Training
       """
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    """
       Loop over the data set. Transform the tensors to the gpu. 
       Provide the training data to the model.
       Performa forward and backward pass.
       Store the loss and update parameters for the optimizers
       """
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad() # clear gradient after every step
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss.item())    
        loss.backward()
        optimizer.step()
    
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))
    """
        Evaluate our trained model
    """
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
  
    """
        Make predictions on our validation dataset 
    """
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    print("\nValidation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


Train loss: 0.7195597290992737


Epoch:  25%|██▌       | 1/4 [21:02<1:03:08, 1262.80s/it]


Validation Accuracy: 0.8421052631578947

Train loss: 0.09748785942792892


Epoch:  50%|█████     | 2/4 [41:47<41:55, 1257.50s/it]  


Validation Accuracy: 0.8947368421052632

Train loss: 0.01057206466794014


Epoch:  75%|███████▌  | 3/4 [1:02:30<20:53, 1253.01s/it]


Validation Accuracy: 0.8421052631578947

Train loss: 0.0010580149246379733


Epoch: 100%|██████████| 4/4 [1:23:14<00:00, 1250.17s/it]


Validation Accuracy: 0.8947368421052632





In [45]:
from sklearn.metrics import f1_score, recall_score, precision_score
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
preds , true_labels = [], []
"""
   Make predictions on our validation dataset 
"""
for batch in yelp_validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    preds.append(logits)
    true_labels.append(label_ids)


In [46]:
flat_predictions = [item for sublist in preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

In [47]:
f1_test =f1_score(flat_predictions, flat_true_labels)
recall_test = recall_score(flat_predictions, flat_true_labels)
precision_test = precision_score(flat_predictions, flat_true_labels)

In [48]:
print(f"Validation Accuracy for Yelp: {format(eval_accuracy/nb_eval_steps)}")
print(f"F1 Score for Yelp: {format(f1_test)}")
print(f"Recall Score  for Yelp: {format(recall_test)}")
print(f"Precision Score for Yelp: {format(precision_test)}")


Validation Accuracy for Yelp: 0.8396822904772779
F1 Score for Yelp: 0.8929435344529684
Recall Score  for Yelp: 0.8292682926829268
Precision Score for Yelp: 0.9672106602784848


In [49]:
preds_az = []
labels_az = []
eval_loss, eval_accuracy= 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    preds_az.append(logits)
    labels_az.append(label_ids)


In [50]:
flat_predictions = [item for sublist in preds_az for item in sublist]
flat_predictions_az = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels_az = [item for sublist in labels_az for item in sublist]

In [51]:
f1_test =f1_score(flat_predictions_az, flat_true_labels_az)
recall_test = recall_score(flat_predictions_az, flat_true_labels_az)
precision_test = precision_score(flat_predictions_az, flat_true_labels_az)

In [52]:
print(f"Validation Accuracy for Amazon: {format(eval_accuracy/nb_eval_steps)}")
print(f"F1 Score for Amazon: {format(f1_test)}")
print(f"Recall Score  for Amazon: {format(recall_test)}")
print(f"Precision Score for Amazon: {format(precision_test)}")

Validation Accuracy for Amazon: 0.9019297494592646
F1 Score for Amazon: 0.9342814048696402
Recall Score  for Amazon: 0.9347513653348664
Precision Score for Amazon: 0.9338119167264896


In [53]:
preds_im = []
labels_im = []
eval_loss, eval_accuracy= 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in imdb_validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    preds_im.append(logits)
    labels_im.append(label_ids)

In [54]:
flat_predictions = [item for sublist in preds_im for item in sublist]
flat_predictions_im = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels_im = [item for sublist in labels_im for item in sublist]

In [55]:
f1_test =f1_score(flat_predictions_im, flat_true_labels_im)
recall_test = recall_score(flat_predictions_im, flat_true_labels_im)
precision_test = precision_score(flat_predictions_im, flat_true_labels_im)

In [56]:
print(f"Validation Accuracy for IMDB: {format(eval_accuracy/nb_eval_steps)}")
print(f"F1 Score for IMDB: {format(f1_test)}")
print(f"Recall Score  for IMDB: {format(recall_test)}")
print(f"Precision Score for IMDB: {format(precision_test)}")

Validation Accuracy for IMDB: 0.815786419193228
F1 Score for IMDB: 0.8296585531481443
Recall Score  for IMDB: 0.7672501766229688
Precision Score for IMDB: 0.9031185031185032
