### This NB uses BERT for Tweet Sentiment Classification

### BERT Sci-Kit Learn Transformer

#### This is general BERT estimator can be used with any classifier 
##### Read about BERT
- https://towardsdatascience.com/build-a-bert-sci-kit-transformer-59d60ddd54a5
- http://jalammar.github.io/illustrated-bert/
- https://jalammar.github.io/illustrated-transformer/
#### Note that Hugging Face wrapper for BERT is used - https://github.com/huggingface/transformers

In [1]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

### Training on the dataset
- Dataset is freely available on the internet
- I have used a sample of dataset, since training on local machine. Hence, most of classes have been left out.

#### randomly split the data into a 70% train set, a 15% validation set, and a 15% test set

In [38]:
import numpy as np
figure8_df = pd.read_csv("~/Documents/personal-docs/ML/ML-AI/BERT Tweet Emotion Classification/trainingandtestdata/training.csv", encoding='latin-1', header=None,
                        names = ['sentiment', 'tweet_id', 'date', 'query', 'handle', 'tweet'])
figure8_df =figure8_df.sample(n=50000)
split = np.random.choice(
    ["train", "val", "test"],
    size=figure8_df.shape[0],
    p=[.7, .15, .15]
)
figure8_df["split"] = split
x_train = figure8_df[figure8_df["split"] == "train"]
y_train = x_train["sentiment"]

In [43]:
x_train["sentiment"].value_counts()

4    17643
0    17519
Name: sentiment, dtype: int64

In [44]:
print(figure8_df.info())
figure8_df.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 1208573 to 63394
Data columns (total 7 columns):
sentiment    50000 non-null int64
tweet_id     50000 non-null int64
date         50000 non-null object
query        50000 non-null object
handle       50000 non-null object
tweet        50000 non-null object
split        50000 non-null object
dtypes: int64(2), object(5)
memory usage: 3.1+ MB
None


Unnamed: 0,sentiment,tweet_id,date,query,handle,tweet,split
933763,4,1792072648,Wed May 13 22:01:44 PDT 2009,NO_QUERY,Cabutter,so much for the mavs... atleast the rangers ar...,train
199125,0,1971365182,Sat May 30 06:56:01 PDT 2009,NO_QUERY,matthewmoloney,League 2 club Accrington Stanley face a High C...,train
1540101,4,2180303096,Mon Jun 15 10:17:09 PDT 2009,NO_QUERY,malteser989,@chrishasboobs Love that so true ! x x,train
1166661,4,1979959506,Sun May 31 05:10:32 PDT 2009,NO_QUERY,Andreafancinell,@Manda_Beth http://twitpic.com/6bow9 - Nice o...,train
316553,0,2002429034,Tue Jun 02 04:11:12 PDT 2009,NO_QUERY,jkc_baybee,"You are not alone tonight, Imagine me there by...",val


### Training the Model

In [31]:
from transformers import BertModel, BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

I0517 14:51:42.616623 4636470720 file_utils.py:39] PyTorch version 1.2.0 available.
I0517 14:51:45.945072 4636470720 file_utils.py:55] TensorFlow version 2.1.0 available.
I0517 14:51:46.907013 4636470720 _import_c_extension.py:31] Failed to import cuda module: No module named 'caffe2.python.caffe2_pybind11_state_gpu'
I0517 14:51:46.907732 4636470720 _import_c_extension.py:38] Failed to import AMD hip module: No module named 'caffe2.python.caffe2_pybind11_state_hip'
W0517 14:51:46.908506 4636470720 _import_c_extension.py:41] This caffe2 python run does not have GPU support. Will run in CPU only mode.
I0517 14:51:48.406152 4636470720 filelock.py:274] Lock 5163107552 acquired on /Users/tausif/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
I0517 14:51:48.407774 4636470720 file_utils.py:436] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not foun

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…

I0517 14:51:51.089857 4636470720 file_utils.py:440] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt in cache at /Users/tausif/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0517 14:51:51.090798 4636470720 file_utils.py:443] creating metadata file for /Users/tausif/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0517 14:51:51.092385 4636470720 filelock.py:318] Lock 5163107552 released on /Users/tausif/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
I0517 14:51:51.093023 4636470720 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/t




I0517 14:51:54.635676 4636470720 filelock.py:274] Lock 6214863336 acquired on /Users/tausif/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
I0517 14:51:54.636723 4636470720 file_utils.py:436] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json not found in cache or force_download set to True, downloading to /Users/tausif/.cache/torch/transformers/tmp3xrr0bw9


HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…

I0517 14:51:56.668742 4636470720 file_utils.py:440] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json in cache at /Users/tausif/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0517 14:51:56.669717 4636470720 file_utils.py:443] creating metadata file for /Users/tausif/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0517 14:51:56.671215 4636470720 filelock.py:318] Lock 6214863336 released on /Users/tausif/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
I0517 14:51:56.672055 4636470720 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from




I0517 14:51:57.899841 4636470720 filelock.py:274] Lock 6217377720 acquired on /Users/tausif/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
I0517 14:51:57.901114 4636470720 file_utils.py:436] https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin not found in cache or force_download set to True, downloading to /Users/tausif/.cache/torch/transformers/tmpi9ax8u11


HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…

I0517 14:57:56.132760 4636470720 file_utils.py:440] storing https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin in cache at /Users/tausif/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I0517 14:57:56.134145 4636470720 file_utils.py:443] creating metadata file for /Users/tausif/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I0517 14:57:56.136100 4636470720 filelock.py:318] Lock 6217377720 released on /Users/tausif/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
I0517 14:57:56.137211 4636470720 modeling_utils.py:617] loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /Users/tausif/.cache/torch/transformers




In [37]:
y_train

14       0
15       0
17       0
18       0
21       0
        ..
49993    0
49994    0
49995    0
49997    0
49998    0
Name: sentiment, Length: 34978, dtype: int64

In [45]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
bert_transformer = BertTransformer(tokenizer, bert_model)
classifier = LinearSVC(C=1.0, class_weight="balanced")
model = Pipeline(
    [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ]
)
model.fit(x_train["tweet"], y_train)



Pipeline(memory=None,
     steps=[('vectorizer', BertTransformer(bert_model=None, bert_tokenizer=None,
        embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x1e85a0f28>,
        max_length=60)), ('classifier', LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

##### If Tf-Idf is also needed to be used as feature
- Notice the *union* function in the pipeline

In [None]:
from sklearn.feature_extraction.text import (
   CountVectorizer, TfidfTransformer
)

tf_idf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer())
    ])

model = Pipeline([
    ("union", FeatureUnion(transformer_list=[
        ("bert", bert_transformer),
        ("tf_idf", tf_idf)
        ])),
        ("classifier", classifier),
    ])

In [59]:
X_val = figure8_df[figure8_df["split"] == "val"]
y_val = X_val["sentiment"]
y_score = model.predict(X_val["tweet"])

##### Metric Evaluation
- Confusion Matrix
- Precision/Recall

In [101]:
# Print the confusion matrix
print(metrics.confusion_matrix(y_val, y_score))

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_val, y_score, digits=3))

[[2987  733]
 [ 769 2880]]
              precision    recall  f1-score   support

           0      0.795     0.803     0.799      3720
           4      0.797     0.789     0.793      3649

   micro avg      0.796     0.796     0.796      7369
   macro avg      0.796     0.796     0.796      7369
weighted avg      0.796     0.796     0.796      7369



In [107]:
from sklearn.metrics import precision_recall_curve, roc_curve
# precision recall curve
precision = dict()
recall = dict()
for i in [0,4]:
    precision[i], recall[i], _ = precision_recall_curve(y_val[:,i],
                                                        y_score[:, i],pos_label=4)
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()

ValueError: Can only tuple-index with a MultiIndex

In [94]:
model.predict(['@afghanheadspin nope  night owl over the weekend haha slept during the day.. raved n djed in the night'])

array([4])