In [0]:
#https://github.com/prestonlimlianjie/bert-sentiment-analysis-straits-times/blob/master/BERT_sentiment_analysis.ipynb
import sys

In [3]:
!{sys.executable} -m pip install torch transformers pandas scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 28.8MB/s eta 0:00:01[K     |█▏                              | 20kB 2.1MB/s eta 0:00:01[K     |█▊                              | 30kB 2.7MB/s eta 0:00:01[K     |██▎                             | 40kB 2.1MB/s eta 0:00:01[K     |███                             | 51kB 2.3MB/s eta 0:00:01[K     |███▌                            | 61kB 2.7MB/s eta 0:00:01[K     |████                            | 71kB 3.0MB/s eta 0:00:01[K     |████▋                           | 81kB 3.2MB/s eta 0:00:01[K     |█████▎                          | 92kB 3.6MB/s eta 0:00:01[K     |█████▉                          | 102kB 3.4MB/s eta 0:00:01[K     |██████▍                         | 112kB 3.4MB/s eta 0:00:01[K     |███████                         | 122kB 3.4M

In [0]:
# Define utils functions

def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentences in the batch now has equal length.
        Output shape: (batch_size, max_sentence_length)
    """
    sents_padded = []

    max_len = max(len(s) for s in sents)
    batch_size = len(sents)

    for s in sents:
        padded = [pad_token] * max_len
        padded[:len(s)] = s
        sents_padded.append(padded)

    return sents_padded

def sents_to_tensor(tokenizer, sents, device):
    """
    :param tokenizer: BertTokenizer
    :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences), reversely sorted
    :param device: torch.device
    :return: sents_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: masks_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: sents_lengths: torch.Tensor, shape(batch_size), reversely sorted
    """
    tokens_list = [tokenizer.tokenize(sent) for sent in sents]
    sents_lengths = [len(tokens) for tokens in tokens_list]
    # tokens_sents_zip = zip(tokens_list, sents_lengths)
    # tokens_sents_zip = sorted(tokens_sents_zip, key=lambda x: x[1], reverse=True)
    # tokens_list, sents_lengths = zip(*tokens_sents_zip)
    tokens_list_padded = pad_sents(tokens_list, '[PAD]')
    sents_lengths = torch.tensor(sents_lengths, device=device)

    masks = []
    for tokens in tokens_list_padded:
        mask = [0 if token=='[PAD]' else 1 for token in tokens]
        masks.append(mask)
    masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
    tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
    sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)

    return sents_tensor, masks_tensor, sents_lengths

In [0]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch import nn
import torch.nn.functional as F

In [0]:
class SentimentClassifierModel(nn.Module):

    def __init__(self, bert_config, device, n_class):
        """
        :param bert_config: str, BERT configuration description
        :param device: torch.device
        :param n_class: int
        """

        super(SentimentClassifierModel, self).__init__()

        self.n_class = n_class
        self.bert_config = bert_config
        self.bert = BertForSequenceClassification.from_pretrained(self.bert_config, num_labels=self.n_class)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_config)
        self.device = device

    def forward(self, sents):
        """
        :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences)
        :return: pre_softmax, torch.tensor of shape (batch_size, n_class)
        """

        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        pre_softmax = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)

        return pre_softmax

    @staticmethod
    def load(model_path: str, device):
        """ Load the model from a file.
        @param model_path (str): path to model
        @return model (nn.Module): model with saved parameters
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = SentimentClassifierModel(device=device, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(bert_config=self.bert_config, n_class=self.n_class),
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

In [10]:
import pandas

pwd = '/content/drive'


df= pandas.read_csv("/content/drive/My Drive/BERT/Tweets.csv", index_col=0, usecols=['tweet_id','airline_sentiment', 'text'])
df.head()

Unnamed: 0_level_0,airline_sentiment,text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
570306133677760513,neutral,@VirginAmerica What @dhepburn said.
570301130888122368,positive,@VirginAmerica plus you've added commercials t...
570301083672813571,neutral,@VirginAmerica I didn't today... Must mean I n...
570301031407624196,negative,@VirginAmerica it's really aggressive to blast...
570300817074462722,negative,@VirginAmerica and it's a really big bad thing...


In [0]:
# Remove URL, RT, mention(@)

df.text = df.text.str.replace(r'http(\S)+', r'')
df.text = df.text.str.replace(r'http ...', r'')
df.text = df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
df.text = df.text.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df.text]
df.text = df.text.str.replace(r'_[\S]?',r'')

# Remove extra space
df.text = df.text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
df.text = df.text.str.replace(r'&amp;?',r'and')
df.text = df.text.str.replace(r'&lt;',r'<')
df.text = df.text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
df.text = df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
df.text = df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
df.text = df.text.str.lower()
df.text = df.text.str.strip()

In [12]:
df['text_length'] = [len(text.split(' ')) for text in df.text]
print(df.shape)

(14640, 3)


In [13]:


# Drop texts with length <=3 and drop duplicates
df = df[df['text_length']>3]
df = df.drop_duplicates(subset=['text'])

print(df.shape)



(13977, 3)


In [14]:
# Summary of sample size and labels
df.shape[0]

13977

In [15]:
df.airline_sentiment.value_counts()

negative    8998
neutral     2834
positive    2145
Name: airline_sentiment, dtype: int64

In [16]:
df['BERT_processed_text'] = '[CLS] '+df.text
df.BERT_processed_text

tweet_id
570306133677760513                                   [CLS] what  said .
570301130888122368    [CLS] plus you ' ve added commercials to the e...
570301083672813571    [CLS] i didn ' t today ... must mean i need to...
570301031407624196    [CLS] it ' s really aggressive to blast obnoxi...
570300817074462722     [CLS] and it ' s a really big bad thing about it
                                            ...                        
569587686496825344    [CLS] thank you we got on a different flight t...
569587371693355008    [CLS] leaving over 20 minutes late flight . no...
569587242672398336    [CLS] please bring american airlines to # blac...
569587188687634433    [CLS] you have my money , you change my flight...
569587140490866689    [CLS] we have 8 ppl so we need 2 know how many...
Name: BERT_processed_text, Length: 13977, dtype: object

In [0]:


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in df.text]



In [18]:
df.BERT_processed_text_length

tweet_id
570306133677760513     3
570301130888122368    15
570301083672813571    17
570301031407624196    25
570300817074462722    11
                      ..
569587686496825344    11
569587371693355008    27
569587242672398336     8
569587188687634433    29
569587140490866689    34
Name: BERT_processed_text_length, Length: 13977, dtype: int64

In [0]:
label_dict = dict()
for i, l in enumerate(list(df.airline_sentiment.value_counts().keys())):
    label_dict.update({l: i})

df['airline_sentiment_label'] = [label_dict[label] for label in df.airline_sentiment]

In [20]:
df.airline_sentiment_label

tweet_id
570306133677760513    1
570301130888122368    2
570301083672813571    1
570301031407624196    0
570300817074462722    0
                     ..
569587686496825344    2
569587371693355008    0
569587242672398336    1
569587188687634433    0
569587140490866689    1
Name: airline_sentiment_label, Length: 13977, dtype: int64

In [21]:
# !touch /content/gdrive/My\ Drive/Colab\ Notebooks/bert_processed_twitter_airline_sentiment.csv
!ls /content/drive/My\ Drive/Colab\ Notebooks
df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_processed_twitter_airline_sentiment.csv')

 ANTON_BERT.ipynb
 ANTON_EGO.ipynb
 ANTON_WITH_BERT_AGORA_VAI.ipynb
 bert_processed_twitter_airline_sentiment.csv
 BertPyTorch.ipynb
 BertTensorflow.ipynb
 BERT_WORD2VEC.ipynb
'Copy of BertPyTorch.ipynb'
 st-sentiment_bert-base-uncased_model.bin
 st-sentiment_bert-base-uncased_model.bin.optim
 TensorflowBasics.ipynb
 teste.ipynb


In [22]:
!ls /content/drive/My\ Drive/Colab\ Notebooks

 ANTON_BERT.ipynb
 ANTON_EGO.ipynb
 ANTON_WITH_BERT_AGORA_VAI.ipynb
 bert_processed_twitter_airline_sentiment.csv
 BertPyTorch.ipynb
 BertTensorflow.ipynb
 BERT_WORD2VEC.ipynb
'Copy of BertPyTorch.ipynb'
 st-sentiment_bert-base-uncased_model.bin
 st-sentiment_bert-base-uncased_model.bin.optim
 TensorflowBasics.ipynb
 teste.ipynb


In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Define training params
label_names = ['positive', 'negative', 'neutral']
model_name = 'st-sentiment'
device = torch.device("cuda:0")
bert_size = 'bert-base-uncased'

train_batch_size = 32 # batch size
clip_grad = 1.0 # gradient clipping
log_every = 10 # number of mini-batches before logging
max_epoch = 100 # max number of epochs
max_patience = 3 # number of iterations to wait before decaying learning rate
max_num_trial = 3 # number of trials before terminating training
lr_decay = 0.5 # learning rate decay
lr_bert = 0.00002 # BERT learning rate
lr = 0.001 # learning rate
valid_niter = 500 # perform validation after n iterations
dropout = 0.3 # dropout rate
verbose = True

prefix = model_name + '_' + bert_size
model_save_path = pwd + '/My Drive/Colab Notebooks/' + prefix+'_model.bin'

In [25]:
training_data,validation_data = train_test_split(df,test_size=0.2,random_state=42)
print(len(df), len(training_data), len(validation_data))

13977 11181 2796


In [26]:
print(training_data)

                   airline_sentiment  ... airline_sentiment_label
tweet_id                              ...                        
569593278636675072          negative  ...                       0
568621033273602048          positive  ...                       2
569786809028255744          negative  ...                       0
569673900805783552          negative  ...                       0
568809510644527104          negative  ...                       0
...                              ...  ...                     ...
569162467051474944          negative  ...                       0
569671368788172800          negative  ...                       0
568885499986874369           neutral  ...                       1
570027321178099712           neutral  ...                       1
569530159247826944          positive  ...                       2

[11181 rows x 6 columns]


In [27]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

train_label = dict(training_data.airline_sentiment_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max/train_label[i] for i in range(len(train_label))], device=device)

pp.pprint(train_label_weight)

tensor([1.0000, 3.2735, 4.2780], device='cuda:0')


In [28]:
# Set up model and optimizer
import time
start_time = time.time()

model = SentimentClassifierModel(bert_size, device, len(label_names))
optimizer = AdamW([
        {'params': model.bert.bert.parameters()},
        {'params': model.bert.classifier.parameters(), 'lr': float(lr)}
    ], lr=float(lr_bert))

model = model.to(device)
print('Use device: %s' % device, file=sys.stderr)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)

Use device: cuda:0
Done! time elapsed 3.26 sec
--------------------------------------------------------------------------------


In [0]:
# Util functions for training
import math
import logging
import pickle
import numpy as np
import torch
import pandas as pd
import sys
from docopt import docopt
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
    f1_score, precision_score, recall_score, roc_auc_score

import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

def batch_iter(data, batch_size, shuffle=False, bert=None):
    """ Yield batches of sentences and labels reverse sorted by length (largest to smallest).
    @param data (dataframe): dataframe with ProcessedText (str) and label (int) columns
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    @param bert (str): whether for BERT training. Values: "large", "base", None
    """
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))

    if shuffle:
        data = data.sample(frac=1)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices].sort_values(by='BERT_processed_text_length', ascending=False)
        sents = list(examples.BERT_processed_text)

        targets = list(examples.airline_sentiment_label.values)
        yield sents, targets  # list[list[str]] if not bert else list[str], list[int]
        
def validation(model, df_val, bert_size, loss_func, device):
    """ validation of model during training.
    @param model (nn.Module): the model being trained
    @param df_val (dataframe): validation dataset
    @param bert_size (str): large or base
    @param loss_func(nn.Module): loss function
    @param device (torch.device)
    @return avg loss value across validation dataset
    """
    was_training = model.training
    model.eval()

    df_val = df_val.sort_values(by='BERT_processed_text_length', ascending=False)

    ProcessedText_BERT = list(df_val.BERT_processed_text)
    InformationType_label = list(df_val.airline_sentiment_label)

    val_batch_size = 32

    n_batch = int(np.ceil(df_val.shape[0]/val_batch_size))

    total_loss = 0.

    with torch.no_grad():
        for i in range(n_batch):
            sents = ProcessedText_BERT[i*val_batch_size: (i+1)*val_batch_size]
            targets = torch.tensor(InformationType_label[i*val_batch_size: (i+1)*val_batch_size],
                                   dtype=torch.long, device=device)
            batch_size = len(sents)
            pre_softmax = model(sents)[0]
            batch_loss = loss_func(pre_softmax, targets)
            total_loss += batch_loss.item()*batch_size

    if was_training:
        model.train()

    return total_loss/df_val.shape[0]

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [0]:


# Train

model.train()
cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
torch.save(cn_loss, 'loss_func')  # for later testing

# Initialize training variables
num_trial = 0
train_iter = 0
patience = 0
cum_loss = 0
report_loss = 0
cum_examples = report_examples = epoch = 0
hist_valid_scores = []



In [28]:
! ls

drive  loss_func  sample_data


In [0]:
import time

train_time = begin_time = time.time()
print('Begin Maximum Likelihood training...')

# Training loop
while True:
    epoch += 1
    for sents, targets in batch_iter(training_data, batch_size=train_batch_size, shuffle=True, bert='base'):  # for each epoch
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(sents)
        pre_softmax = model(sents)[0]

        # Calculate loss and gradient function
        loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
        loss.backward()

        # Next step
        optimizer.step()

        batch_losses_val = loss.item() * batch_size
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        report_examples += batch_size
        cum_examples += batch_size

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, '
                  'cum. examples %d, speed %.2f examples/sec, '
                  'time elapsed %.2f sec' % (epoch, train_iter,
                     report_loss / report_examples,
                     cum_examples,
                     report_examples / (time.time() - train_time),
                     time.time() - begin_time), file=sys.stderr)

            train_time = time.time()
            report_loss = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
                 cum_loss / cum_examples,
                 cum_examples), file=sys.stderr)

            cum_loss = cum_examples = 0.

            print('begin validation ...', file=sys.stderr)

            validation_loss = validation(model, validation_data, bert_size, cn_loss, device)   # dev batch size can be a bit larger

            print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)

            is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
            hist_valid_scores.append(validation_loss)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)

                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < int(max_patience):
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == max_num_trial:
                        print('early stop!', file=sys.stderr)
                        exit(0)

                    # decay lr, and restore from previously best checkpoint
                    print('load previously best model and decay learning rate to %f%%' %
                          (float(lr_decay)*100), file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] *= float(lr_decay)

                    # reset patience
                    patience = 0

            if epoch == int(max_epoch):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)

epoch 139, iter 48540, avg. loss 0.09, cum. examples 1280, speed 213.92 examples/sec, time elapsed 7623.30 sec
epoch 139, iter 48550, avg. loss 0.08, cum. examples 1600, speed 216.04 examples/sec, time elapsed 7624.79 sec
epoch 139, iter 48560, avg. loss 0.14, cum. examples 1920, speed 211.66 examples/sec, time elapsed 7626.30 sec
epoch 139, iter 48570, avg. loss 0.13, cum. examples 2240, speed 219.14 examples/sec, time elapsed 7627.76 sec
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-40a22b790974>", line 17, in <module>
    loss.backward()
  File "/usr/local/lib/python3.6/dist-packages/torch/tensor.py", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occ

KeyboardInterrupt: ignored

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 438, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 421, in execute_request
    self._abort_queues()
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 636, in _abort_queues
    self._abort_queue(stream)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 657, in _abort_queue
    sel

In [0]:
#validation

In [0]:
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
f1_score, precision_score, recall_score, roc_auc_score
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

In [0]:
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [33]:
print('load best model...')

model = SentimentClassifierModel.load('/content/drive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

model.to(device)

model.eval()

df_test = validation_data

df_test = df_test.sort_values(by='BERT_processed_text_length', ascending=False)

test_batch_size = 32

n_batch = int(np.ceil(df_test.shape[0]/test_batch_size))

cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

ProcessedText_BERT = list(df_test.BERT_processed_text)
InformationType_label = list(df_test.airline_sentiment_label)

test_loss = 0.
prediction = []
prob = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for i in range(n_batch):
        sents = ProcessedText_BERT[i*test_batch_size: (i+1)*test_batch_size]
        targets = torch.tensor(InformationType_label[i * test_batch_size: (i + 1) * test_batch_size],
                                   dtype=torch.long, device=device)
        batch_size = len(sents)

        pre_softmax = model(sents)[0]
        batch_loss = cn_loss(pre_softmax, targets)
        test_loss += batch_loss.item()*batch_size
        prob_batch = softmax(pre_softmax)
        prob.append(prob_batch)

        prediction.extend([t.item() for t in list(torch.argmax(prob_batch, dim=1))])

prob = torch.cat(tuple(prob), dim=0)
loss = test_loss/df_test.shape[0]

pickle.dump([label_names[i] for i in prediction], open(prefix+'_test_prediction', 'wb'))
pickle.dump(prob.data.cpu().numpy(), open(prefix + '_test_prediction_prob', 'wb'))

accuracy = accuracy_score(df_test.airline_sentiment_label.values, prediction)
matthews = matthews_corrcoef(df_test.airline_sentiment_label.values, prediction)

precisions = {}
recalls = {}
f1s = {}
aucrocs = {}

for i in range(len(label_names)):
    prediction_ = [1 if pred == i else 0 for pred in prediction]
    true_ = [1 if label == i else 0 for label in df_test.airline_sentiment_label.values]
    f1s.update({label_names[i]: f1_score(true_, prediction_)})
    precisions.update({label_names[i]: precision_score(true_, prediction_)})
    recalls.update({label_names[i]: recall_score(true_, prediction_)})
    aucrocs.update({label_names[i]: roc_auc_score(true_, list(t.item() for t in prob[:, i]))})

metrics_dict = {'loss': loss, 'accuracy': accuracy, 'matthews coef': matthews, 'precision': precisions,
                         'recall': recalls, 'f1': f1s, 'aucroc': aucrocs}

pickle.dump(metrics_dict, open(prefix+'_evaluation_metrics', 'wb'))

cm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=False,
                          path=prefix+'_test_confusion_matrix', title='confusion matrix for test dataset')
plt.savefig(prefix+'_test_confusion_matrix', format='png')
cm_norm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=True,
                          path=prefix+'_test normalized_confusion_matrix', title='normalized confusion matrix for test dataset')
plt.savefig(prefix+'_test_normalized_confusion_matrix', format='png')

print('loss: %.2f' % loss)
print('accuracy: %.2f' % accuracy)
print('matthews coef: %.2f' % matthews)
print('-' * 80)
for i in range(len(label_names)):
    print('precision score for %s: %.2f' % (label_names[i], precisions[label_names[i]]))
    print('recall score for %s: %.2f' % (label_names[i], recalls[label_names[i]]))
    print('f1 score for %s: %.2f' % (label_names[i], f1s[label_names[i]]))
    print('auc roc score for %s: %.2f' % (label_names[i], aucrocs[label_names[i]]))
    print('-' * 80)

load best model...
loss: 0.51
accuracy: 0.82
matthews coef: 0.68
--------------------------------------------------------------------------------
precision score for positive: 0.93
recall score for positive: 0.83
f1 score for positive: 0.88
auc roc score for positive: 0.94
--------------------------------------------------------------------------------
precision score for negative: 0.62
recall score for negative: 0.77
f1 score for negative: 0.69
auc roc score for negative: 0.90
--------------------------------------------------------------------------------
precision score for neutral: 0.76
recall score for neutral: 0.82
f1 score for neutral: 0.79
auc roc score for neutral: 0.96
--------------------------------------------------------------------------------


In [34]:
precisions

{'negative': 0.6245059288537549,
 'neutral': 0.756198347107438,
 'positive': 0.9311010946555055}

In [35]:


recalls



{'negative': 0.7707317073170732,
 'neutral': 0.8187919463087249,
 'positive': 0.8339100346020761}

In [36]:
f1s

{'negative': 0.6899563318777292,
 'neutral': 0.78625134264232,
 'positive': 0.8798296318831762}

In [37]:
aucrocs

{'negative': 0.9044422823870604,
 'neutral': 0.9643572446935865,
 'positive': 0.9388083027605635}

In [38]:
import pandas
st_df = pandas.read_csv("/content/drive/My Drive/Colab Notebooks/st-comments.csv", index_col=0, encoding='latin-1', usecols=['comment_id','post_title', 'comment_text'])
st_df.head()

FileNotFoundError: ignored

In [0]:
# Remove URL, RT, mention(@)

st_df['text'] = st_df.comment_text

st_df.text = st_df.text.str.replace(r'http(\S)+', r'')
st_df.text = st_df.text.str.replace(r'http ...', r'')
st_df.text = st_df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
st_df.text = st_df.text.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
st_df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in st_df.text]
st_df.text = st_df.text.str.replace(r'_[\S]?',r'')

# Remove extra space
st_df.text = st_df.text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
st_df.text = st_df.text.str.replace(r'&amp;?',r'and')
st_df.text = st_df.text.str.replace(r'&lt;',r'<')
st_df.text = st_df.text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
st_df.text = st_df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
st_df.text = st_df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
st_df.text = st_df.text.str.lower()
st_df.text = st_df.text.str.strip()

st_df['text_length'] = [len(text.split(' ')) for text in st_df.text]
print(st_df.shape)

In [0]:
st_df['BERT_processed_text'] = '[CLS] '+ st_df.text
st_df.BERT_processed_text

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
st_df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in st_df.text]
st_df.BERT_processed_text_length

In [0]:
st_df

In [0]:
st_df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_processed_st_comments.csv')

In [0]:
# Load model
model = SentimentClassifierModel.load('/content/drive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

model.to(device)

In [0]:
st_df = st_df.sort_values(by='BERT_processed_text_length', ascending=False)

In [0]:
st_df

In [0]:
cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

In [0]:
ProcessedText_BERT = list(st_df.BERT_processed_text)

In [0]:
ProcessedText_BERT

In [0]:
softmax = torch.nn.Softmax(dim=1)

In [0]:
labels = ['negative', 'neutral', 'positive']

In [0]:
sents = ProcessedText_BERT[:2]
sents

In [0]:
len(sents)

In [0]:


pre_softmax = model(sents)[0]
pre_softmax



In [0]:
pre_softmax.shape

In [0]:
prob = softmax(pre_softmax)
prob

In [0]:


prob.shape



In [0]:
prob[0]

In [0]:
label_indexes = [t.item() for t in list(torch.argmax(prob, dim=1))]

In [0]:
prediction = labels[label_indexes[1]]
prediction

In [0]:
predictions = []
with torch.no_grad():
  sents = ProcessedText_BERT
  pre_softmax = model(sents)[0]
  prob = softmax(pre_softmax)
  predictions.extend([t.item() for t in list(torch.argmax(prob, dim=1))])
print(predictions)

In [0]:
[labels[pred_val] for pred_val in predictions]

In [0]:
st_df['predictions'] = [labels[pred_val] for pred_val in predictions]

In [0]:
print(st_df.comment_text[90])
print(st_df.predictions[90])

In [0]:
st_df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_predicted_st_comments.csv')

In [0]:
st_df.predictions.value_counts()