In [1]:
#https://github.com/prestonlimlianjie/bert-sentiment-analysis-straits-times/blob/master/BERT_sentiment_analysis.ipynb
import sys
!{sys.executable} -m pip install torch transformers pandas scikit-learn



In [0]:
# Define utils functions

def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentences in the batch now has equal length.
        Output shape: (batch_size, max_sentence_length)
    """
    sents_padded = []

    max_len = max(len(s) for s in sents)
    batch_size = len(sents)

    for s in sents:
        padded = [pad_token] * max_len
        padded[:len(s)] = s
        sents_padded.append(padded)

    return sents_padded

def sents_to_tensor(tokenizer, sents, device):
    """
    :param tokenizer: BertTokenizer
    :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences), reversely sorted
    :param device: torch.device
    :return: sents_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: masks_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: sents_lengths: torch.Tensor, shape(batch_size), reversely sorted
    """
    tokens_list = [tokenizer.tokenize(sent) for sent in sents]
    sents_lengths = [len(tokens) for tokens in tokens_list]
    # tokens_sents_zip = zip(tokens_list, sents_lengths)
    # tokens_sents_zip = sorted(tokens_sents_zip, key=lambda x: x[1], reverse=True)
    # tokens_list, sents_lengths = zip(*tokens_sents_zip)
    tokens_list_padded = pad_sents(tokens_list, '[PAD]')
    sents_lengths = torch.tensor(sents_lengths, device=device)

    masks = []
    for tokens in tokens_list_padded:
        mask = [0 if token=='[PAD]' else 1 for token in tokens]
        masks.append(mask)
    masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
    tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
    sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)

    return sents_tensor, masks_tensor, sents_lengths

In [0]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch import nn
import torch.nn.functional as F

In [0]:
class SentimentClassifierModel(nn.Module):

    def __init__(self, bert_config, device, n_class):
        """
        :param bert_config: str, BERT configuration description
        :param device: torch.device
        :param n_class: int
        """

        super(SentimentClassifierModel, self).__init__()

        self.n_class = n_class
        self.bert_config = bert_config
        self.bert = BertForSequenceClassification.from_pretrained(self.bert_config, num_labels=self.n_class)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_config)
        self.device = device

    def forward(self, sents):
        """
        :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences)
        :return: pre_softmax, torch.tensor of shape (batch_size, n_class)
        """

        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        pre_softmax = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)

        return pre_softmax

    @staticmethod
    def load(model_path: str, device):
        """ Load the model from a file.
        @param model_path (str): path to model
        @return model (nn.Module): model with saved parameters
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = SentimentClassifierModel(device=device, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(bert_config=self.bert_config, n_class=self.n_class),
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

In [5]:
import pandas

pwd = '/content/drive'

#, usecols=['review_rate','review_body'
df= pandas.read_csv("/content/drive/My Drive/BERT/ANTON/final.csv", index_col=0)
df = df.dropna()
df.head()

Unnamed: 0,review_rate,review_body
0,10,"Pois bem...as fotos dos pratos, bebidas e doce..."
1,30,Espero que utilizem essa avaliação para rever ...
2,20,"Fomos comer a sobremesa as 20h40, sentamos, pe..."
3,10,Pedimos o cardápio e ao chamar o atendente ped...
4,50,"Fui com algumas amigas em uma segunda-feira, d..."


In [0]:
# Remove URL, RT, mention(@)

df.review_body = df.review_body.str.replace(r'http(\S)+', r'')
df.review_body = df.review_body.str.replace(r'http ...', r'')
df.review_body = df.review_body.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
df.review_body = df.review_body.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
df.review_body = [''.join([i if ord(i) < 128 else '' for i in review_body]) for review_body in df.review_body]
df.review_body = df.review_body.str.replace(r'_[\S]?',r'')

# Remove extra space
df.review_body = df.review_body.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
df.review_body = df.review_body.str.replace(r'&amp;?',r'and')
df.review_body = df.review_body.str.replace(r'&lt;',r'<')
df.review_body = df.review_body.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
df.review_body = df.review_body.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
df.review_body = df.review_body.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
df.review_body = df.review_body.str.lower()
df.review_body = df.review_body.str.strip()

In [7]:

df['text_length'] = [len(review_body.split(' ')) for review_body in df.review_body]
print(df.shape)

(7672, 3)


In [8]:


# Drop texts with length <=3 and drop duplicates
df = df[df['text_length']>3]
df = df.drop_duplicates(subset=['review_body'])

print(df.shape)



(6992, 3)


In [9]:
df.shape[0]

6992

In [10]:
df.review_rate.value_counts()

50    4012
40    2234
30     514
20     144
10      88
Name: review_rate, dtype: int64

In [11]:
df['BERT_processed_text'] = '[CLS] '+df.review_body
df.BERT_processed_text

0       [CLS] pois bem ... as fotos dos pratos , bebid...
1       [CLS] espero que utilizem essa avaliao para re...
2       [CLS] fomos comer a sobremesa as 20h40 , senta...
3       [CLS] pedimos o cardpio e ao chamar o atendent...
4       [CLS] fui com algumas amigas em uma segunda - ...
                              ...                        
7667    [CLS] com amigos , sozinho ou em famlia achei ...
7668    [CLS] pratos rabes muito bons e variados ! tim...
7669    [CLS] bem ao estilo arabe , o dono  muito quer...
7670    [CLS] zaki  uma espcie de restaurante e loja d...
7671    [CLS] adoro ir ao zaki , primeiro por ser tipi...
Name: BERT_processed_text, Length: 6992, dtype: object

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in df.review_body]

In [13]:
df.BERT_processed_text_length

0       486
1       155
2       224
3       163
4       116
       ... 
7667     42
7668     32
7669     56
7670    109
7671    113
Name: BERT_processed_text_length, Length: 6992, dtype: int64

In [0]:
label_dict = dict()
for i, l in enumerate(list(df.review_rate.value_counts().keys())):
    label_dict.update({l: i})

df['review_rate_label'] = [label_dict[label] for label in df.review_rate]

In [15]:
df.review_rate_label

0       4
1       2
2       3
3       4
4       0
       ..
7667    0
7668    0
7669    1
7670    0
7671    0
Name: review_rate_label, Length: 6992, dtype: int64

In [16]:
!ls /content/drive/My\ Drive/Colab\ Notebooks
df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_processed_reviews.csv')

 ANTON_BERT.ipynb
 ANTON_EGO.ipynb
 ANTON_FINAL.ipynb
 ANTON_WITH_BERT_AGORA_VAI.ipynb
 bert_processed_reviews.csv
 bert_processed_twitter_airline_sentiment.csv
 BertPyTorch.ipynb
 BertTensorflow.ipynb
 BERT_WORD2VEC.ipynb
'Copy of BertPyTorch.ipynb'
 st-sentiment_bert-base-uncased_model.bin
 st-sentiment_bert-base-uncased_model.bin.optim
 TensorflowBasics.ipynb
 teste.ipynb


In [17]:
!ls /content/drive/My\ Drive/Colab\ Notebooks

 ANTON_BERT.ipynb
 ANTON_EGO.ipynb
 ANTON_FINAL.ipynb
 ANTON_WITH_BERT_AGORA_VAI.ipynb
 bert_processed_reviews.csv
 bert_processed_twitter_airline_sentiment.csv
 BertPyTorch.ipynb
 BertTensorflow.ipynb
 BERT_WORD2VEC.ipynb
'Copy of BertPyTorch.ipynb'
 st-sentiment_bert-base-uncased_model.bin
 st-sentiment_bert-base-uncased_model.bin.optim
 TensorflowBasics.ipynb
 teste.ipynb


In [0]:


from sklearn.model_selection import train_test_split



In [0]:
label_names = ['5', '4', '3', '2', '1']
model_name = 'r-sentiment'
device = torch.device("cuda:0")
bert_size = 'bert-base-multilingual-uncased'

train_batch_size = 32 # batch size
clip_grad = 1.0 # gradient clipping
log_every = 10 # number of mini-batches before logging
max_epoch = 100 # max number of epochs
max_patience = 3 # number of iterations to wait before decaying learning rate
max_num_trial = 3 # number of trials before terminating training
lr_decay = 0.5 # learning rate decay
lr_bert = 0.00002 # BERT learning rate
lr = 0.001 # learning rate
valid_niter = 500 # perform validation after n iterations
dropout = 0.3 # dropout rate
verbose = True

prefix = model_name + '_' + bert_size
model_save_path = pwd + '/My Drive/Colab Notebooks/' + prefix+'_model.bin'

In [20]:
training_data,validation_data = train_test_split(df,test_size=0.2,random_state=42)
print(len(df), len(training_data), len(validation_data))

6992 5593 1399


In [21]:
print(training_data)

      review_rate  ... review_rate_label
4857           50  ...                 0
6664           50  ...                 0
6070           40  ...                 1
3311           40  ...                 1
7255           50  ...                 0
...           ...  ...               ...
3942           40  ...                 1
5561           50  ...                 0
5596           40  ...                 1
5760           40  ...                 1
860            50  ...                 0

[5593 rows x 6 columns]


In [22]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

train_label = dict(training_data.review_rate_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max/train_label[i] for i in range(len(train_label))], device=device)

pp.pprint(train_label_weight)

tensor([ 1.0000,  1.8400,  8.1303, 27.4915, 47.0145], device='cuda:0')


In [23]:
# Set up model and optimizer
import time
start_time = time.time()

model = SentimentClassifierModel(bert_size, device, len(label_names))
optimizer = AdamW([
        {'params': model.bert.bert.parameters()},
        {'params': model.bert.classifier.parameters(), 'lr': float(lr)}
    ], lr=float(lr_bert))

model = model.to(device)
print('Use device: %s' % device, file=sys.stderr)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)

Use device: cuda:0
Done! time elapsed 5.68 sec
--------------------------------------------------------------------------------


In [0]:

#Util functions for training
import math
import logging
import pickle
import numpy as np
import torch
import pandas as pd
import sys
from docopt import docopt
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
    f1_score, precision_score, recall_score, roc_auc_score

import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

def batch_iter(data, batch_size, shuffle=False, bert=None):
    """ Yield batches of sentences and labels reverse sorted by length (largest to smallest).
    @param data (dataframe): dataframe with ProcessedText (str) and label (int) columns
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    @param bert (str): whether for BERT training. Values: "large", "base", None
    """
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))

    if shuffle:
        data = data.sample(frac=1)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices].sort_values(by='BERT_processed_text_length', ascending=False)
        sents = list(examples.BERT_processed_text)

        targets = list(examples.review_rate_label.values)
        yield sents, targets  # list[list[str]] if not bert else list[str], list[int]
        
def validation(model, df_val, bert_size, loss_func, device):
    """ validation of model during training.
    @param model (nn.Module): the model being trained
    @param df_val (dataframe): validation dataset
    @param bert_size (str): large or base
    @param loss_func(nn.Module): loss function
    @param device (torch.device)
    @return avg loss value across validation dataset
    """
    was_training = model.training
    model.eval()

    df_val = df_val.sort_values(by='BERT_processed_text_length', ascending=False)

    ProcessedText_BERT = list(df_val.BERT_processed_text)
    InformationType_label = list(df_val.review_rate_label)

    val_batch_size = 32

    n_batch = int(np.ceil(df_val.shape[0]/val_batch_size))

    total_loss = 0.

    with torch.no_grad():
        for i in range(n_batch):
            sents = ProcessedText_BERT[i*val_batch_size: (i+1)*val_batch_size]
            targets = torch.tensor(InformationType_label[i*val_batch_size: (i+1)*val_batch_size],
                                   dtype=torch.long, device=device)
            batch_size = len(sents)
            pre_softmax = model(sents)[0]
            batch_loss = loss_func(pre_softmax, targets)
            total_loss += batch_loss.item()*batch_size

    if was_training:
        model.train()

    return total_loss/df_val.shape[0]

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [0]:

# Train

model.train()
cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
torch.save(cn_loss, 'loss_func')  # for later testing

# Initialize training variables
num_trial = 0
train_iter = 0
patience = 0
cum_loss = 0
report_loss = 0
cum_examples = report_examples = epoch = 0
hist_valid_scores = []

In [26]:
!ls

drive  loss_func  sample_data


In [27]:
import time

train_time = begin_time = time.time()
print('Begin Maximum Likelihood training...')

# Training loop
while True:
    epoch += 1
    for sents, targets in batch_iter(training_data, batch_size=train_batch_size, shuffle=True, bert='base'):  # for each epoch
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(sents)
        pre_softmax = model(sents)[0]

        # Calculate loss and gradient function
        loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
        loss.backward()

        # Next step
        optimizer.step()

        batch_losses_val = loss.item() * batch_size
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        report_examples += batch_size
        cum_examples += batch_size

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, '
                  'cum. examples %d, speed %.2f examples/sec, '
                  'time elapsed %.2f sec' % (epoch, train_iter,
                     report_loss / report_examples,
                     cum_examples,
                     report_examples / (time.time() - train_time),
                     time.time() - begin_time), file=sys.stderr)

            train_time = time.time()
            report_loss = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
                 cum_loss / cum_examples,
                 cum_examples), file=sys.stderr)

            cum_loss = cum_examples = 0.

            print('begin validation ...', file=sys.stderr)

            validation_loss = validation(model, validation_data, bert_size, cn_loss, device)   # dev batch size can be a bit larger

            print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)

            is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
            hist_valid_scores.append(validation_loss)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)

                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < int(max_patience):
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == max_num_trial:
                        print('early stop!', file=sys.stderr)
                        exit(0)

                    # decay lr, and restore from previously best checkpoint
                    print('load previously best model and decay learning rate to %f%%' %
                          (float(lr_decay)*100), file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] *= float(lr_decay)

                    # reset patience
                    patience = 0

            if epoch == int(max_epoch):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)

Begin Maximum Likelihood training...


epoch 1, iter 10, avg. loss 1.66, cum. examples 320, speed 55.08 examples/sec, time elapsed 5.81 sec


RuntimeError: ignored