# Use BERT Representations with LogisticRegression Softmax Classifier

In [1]:
from collections import Counter
import importlib
import os

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import dataset
import vsm
import sst

In [2]:
# Integer ids used to select a corpus. `TWITTER` is the one passed to
# `dataset.dataset_reader` in the next cell; the other two are not referenced
# in the visible part of this notebook.
# NOTE(review): the id-to-corpus mapping is defined in dataset.py — confirm there.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [3]:
# Read the train / validation / test splits for the selected corpus.
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER)

# Apply the same column pruning to each split.
# NOTE(review): the literal 2 happens to equal TWITTER. If `prune_columns`
# expects the dataset id, the constant should be passed instead — confirm
# against dataset.py.
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(2, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [4]:
# Pretrained checkpoint name shared by the tokenizer and the encoder.
bert_weights_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
# Plain encoder: its representations are used as fixed features for the
# scikit-learn classifier defined in the next cell.
bert_model = BertModel.from_pretrained(bert_weights_name)
# Disabled alternative that loads the sequence-classification variant instead:
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)
# Distinct values of the 'sentiment' column in the training split.
twitter_sentiment_labels = twitter_train['sentiment'].unique()

In [5]:
def fit_softmax_classifier(X, y):
    """Fit a scikit-learn `LogisticRegression` on features `X` and labels `y`.

    The estimator is configured with an intercept term, the liblinear
    solver and `multi_class='ovr'`. Note that 'ovr' fits one binary problem
    per label (one-vs-rest) rather than a single multinomial softmax,
    despite this function's name.

    Parameters
    ----------
    X
        Feature matrix accepted by `LogisticRegression.fit`.
    y
        Labels aligned with the rows of `X`.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    settings = dict(
        fit_intercept=True,
        solver='liblinear',
        multi_class='ovr',
    )
    classifier = LogisticRegression(**settings)
    classifier.fit(X, y)
    return classifier

In [6]:
def hf_cls_phi(text):
    """Featurize `text` as the final-layer BERT representation above [CLS].

    Parameters
    ----------
    text
        Raw example text, passed to `vsm.hf_encode` together with the
        notebook-level `bert_tokenizer`.

    Returns
    -------
    np.ndarray
        The representation at sequence position 0 of the single encoded
        example, moved to CPU and converted to NumPy.
    """
    # Subword ids with special tokens added, so that position 0 is [CLS].
    subtok_ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)

    # Representations from the last layer. The expected shape is
    # (1, n, 768), where n is the number of subword tokens.
    subtok_reps = vsm.hf_represent(subtok_ids, bert_model, layer=-1)

    # First (and only) batch item, then sequence position 0: the [CLS] vector.
    cls_rep = subtok_reps[0][0]

    # Convert to a NumPy array so that downstream code can consume it directly.
    return cls_rep.cpu().numpy()

In [7]:
# NOTE: `DataFrame.size` is rows x columns (total number of cells), not the
# number of rows. The validation split has 1,000 rows (see the `support`
# totals in the classification reports below), so 3000 here means 1,000 rows
# of 3 columns. Use `len(df)` or `df.shape[0]` when a row count is wanted.
twitter_train.size, twitter_validate.size

(156831, 3000)

In [8]:
%%time
# Learning-curve run on the first 1,500 training rows, assessed on the
# validation split.
bert_experiment1500 = sst.experiment(
    twitter_train[:1500], # first 1,500 rows of the training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    # presumably disabled because `hf_cls_phi` already returns a dense
    # vector rather than a feature dict — confirm in sst.py
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.354     0.203     0.258       172
    Negative      0.491     0.692     0.574       266
     Neutral      0.528     0.432     0.475       285
    Positive      0.543     0.574     0.558       277

    accuracy                          0.501      1000
   macro avg      0.479     0.475     0.466      1000
weighted avg      0.492     0.501     0.487      1000

CPU times: user 1h 8min 17s, sys: 58.4 s, total: 1h 9min 15s
Wall time: 11min 36s


In [9]:
%%time
# Learning-curve run on the first 3,000 training rows, assessed on the
# validation split.
bert_experiment3000 = sst.experiment(
    twitter_train[:3000], # first 3,000 rows of the training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.410     0.186     0.256       172
    Negative      0.505     0.692     0.584       266
     Neutral      0.564     0.467     0.511       285
    Positive      0.531     0.617     0.571       277

    accuracy                          0.520      1000
   macro avg      0.503     0.490     0.480      1000
weighted avg      0.513     0.520     0.503      1000

CPU times: user 43min 3s, sys: 24.5 s, total: 43min 28s
Wall time: 7min 18s


In [10]:
%%time
# Learning-curve run on the first 6,000 training rows.
# NOTE: the validation split only has 1,000 rows (the report below shows a
# total support of 1000), so the `[:1500]` slice selects the whole frame.
bert_experiment6000 = sst.experiment(
    twitter_train[:6000], # first 6,000 rows of the training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1500]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.440     0.192     0.267       172
    Negative      0.514     0.752     0.611       266
     Neutral      0.551     0.516     0.533       285
    Positive      0.595     0.578     0.586       277

    accuracy                          0.540      1000
   macro avg      0.525     0.509     0.499      1000
weighted avg      0.534     0.540     0.523      1000

CPU times: user 2h 38min 23s, sys: 2min 11s, total: 2h 40min 34s
Wall time: 26min 59s


In [11]:
%%time
# Learning-curve run on the first 12,000 training rows.
# NOTE: the validation split only has 1,000 rows (the report below shows a
# total support of 1000), so the `[:2000]` slice selects the whole frame.
bert_experiment12000 = sst.experiment(
    twitter_train[:12000], # first 12,000 rows of the training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:2000]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.455     0.203     0.281       172
    Negative      0.529     0.763     0.625       266
     Neutral      0.591     0.547     0.568       285
    Positive      0.622     0.617     0.620       277

    accuracy                          0.565      1000
   macro avg      0.549     0.533     0.523      1000
weighted avg      0.559     0.565     0.548      1000

CPU times: user 3h 37min 11s, sys: 2min 38s, total: 3h 39min 49s
Wall time: 37min 35s


In [12]:
%%time
# Final run: fit on the whole training split and assess on the whole
# validation split. This is the experiment reused by the test-set cells below.
# NOTE(review): this cell took about two hours of wall time; features for the
# training prefixes used in the earlier cells are presumably recomputed here.
# Caching the `hf_cls_phi` outputs would avoid that — confirm in sst.py.
bert_experiment_full = sst.experiment(
    twitter_train, # entire training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.589     0.250     0.351       172
    Negative      0.570     0.778     0.658       266
     Neutral      0.604     0.568     0.586       285
    Positive      0.625     0.668     0.646       277

    accuracy                          0.597      1000
   macro avg      0.597     0.566     0.560      1000
weighted avg      0.598     0.597     0.581      1000

CPU times: user 11h 53min 46s, sys: 8min 41s, total: 12h 2min 28s
Wall time: 2h 2min 21s


In [13]:
# Fields recorded in the dict returned by `sst.experiment`.
bert_experiment_full.keys()

dict_keys(['model', 'phi', 'train_dataset', 'assess_datasets', 'predictions', 'metric', 'scores'])

In [14]:
# A list with a single score: one assessment dataframe (`twitter_validate`)
# was passed to the experiment.
bert_experiment_full['scores']

[0.5602068458394315]

In [15]:
# Name of the metric that the values in `scores` were computed with.
bert_experiment_full['metric']

'safe_macro_f1'

In [16]:
# The classifier stored by the experiment (built by `fit_softmax_classifier`).
bert_experiment_full['model']

LogisticRegression(multi_class='ovr', solver='liblinear')

# Evaluate the classifier trained on BERT tweet representations on the test set

In [17]:
def predict_one_bert(text):
    """Predict a label for a single raw text with the full-data experiment.

    The text is featurized with the `phi` stored in `bert_experiment_full`,
    wrapped in a one-element batch, and passed to the stored model's
    `predict`.

    Parameters
    ----------
    text
        The example to classify, in whatever form the stored `phi` accepts.

    Returns
    -------
    The single predicted label (the only element of the prediction batch).
    """
    feature_vector = bert_experiment_full['phi'](text)
    classifier = bert_experiment_full['model']
    # `predict` works on batches, so wrap the one feature vector in a list.
    predictions = classifier.predict([feature_vector])
    # Unwrap the singleton batch.
    return predictions[0]

In [18]:
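# %%time
# twitter_test['prediction'] = twitter_test['text'].apply(predict_one_bert)
# (Disabled: per-row prediction with `predict_one_bert`. The test split is
# scored with `sst.evaluate` in the cell below instead.)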

In [21]:
# Re-import sst.py so that edits made to it after the first import are
# picked up without restarting the kernel (`importlib` is imported in the
# top import cell).
# NOTE(review): on a fresh kernel the initial `import sst` already loads the
# current file, so this cell only matters in a long-lived session. Enabling
# `%autoreload 2` in a config cell would make it unnecessary.
importlib.reload(sst)

<module 'sst' from '/mnt/c/Users/echya/Documents/XCS224U - 007 Natural Language Understanding/CS224-final-project/sst.py'>

In [22]:
%%time
# Score the classifier from the full-data experiment on the held-out test
# split, reusing the feature function stored with that experiment.
bert_test = sst.evaluate(
    bert_experiment_full['model'],
    bert_experiment_full['phi'],
    assess_dataframes=[twitter_test],
    # NOTE(review): the experiment was run with vectorize=False, so the
    # vectorizer stored here is presumably None — confirm in sst.py.
    vectorizer=bert_experiment_full['assess_datasets'][0]['vectorizer'],
    vectorize=False
)

              precision    recall  f1-score   support

  Irrelevant      0.481     0.268     0.344      3915
    Negative      0.602     0.715     0.654      6725
     Neutral      0.547     0.537     0.542      5446
    Positive      0.579     0.632     0.604      6319

    accuracy                          0.570     22405
   macro avg      0.552     0.538     0.536     22405
weighted avg      0.561     0.570     0.559     22405

CPU times: user 2h 7min 45s, sys: 51.8 s, total: 2h 8min 37s
Wall time: 21min 32s
