In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import dataset
import vsm
import sst

In [None]:
# Integer identifiers for the available datasets. Only TWITTER is used in the
# visible cells (it is passed to dataset.dataset_reader).
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [None]:
# Read the three splits for the TWITTER dataset, then pass each split through
# dataset.prune_columns(2, ...) and rebind the same three names to the results.
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(2, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [None]:
# Distinct values found in the 'sentiment' column of the training split
twitter_sentiment_labels = twitter_train['sentiment'].unique()

## Pre-trained BERT

In [None]:
# Load the tokenizer and the plain BertModel encoder from the same checkpoint name.
bert_weights_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
model = BertModel.from_pretrained(bert_weights_name)
# Disabled alternative: load BertForSequenceClassification from the same checkpoint.
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)

In [None]:
# Example sentence used by the tokenizer walkthrough cells that follow.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [None]:
# Split the sample sentence into tokens, then map those tokens to vocabulary ids.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Show the sentence, its tokens and their ids; the labels are padded so the
# colons line up in the printed output.
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
# Encode the sample sentence into fixed-length model inputs (ids + attention mask).
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`, and `truncation=True` states explicitly that
# anything longer than `max_length` is cut rather than relying on a warning fallback.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [None]:
# Display the 'input_ids' entry of the encoding built from the sample sentence.
encoding['input_ids']

In [None]:
# Display the 'attention_mask' entry of the same encoding.
encoding['attention_mask']

In [None]:
# Map the ids of the first encoded sequence back to token strings for inspection.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
def label_to_num(label):
    """Translate a sentiment label into its integer code.

    "Positive" -> 1, "Neutral" -> 2, "Negative" -> 3, "Irrelevant" -> 4.
    Matching is exact (case-sensitive); any other value yields ``None``.
    """
    codes = (
        ("Positive", 1),
        ("Neutral", 2),
        ("Negative", 3),
        ("Irrelevant", 4),
    )
    # Compare in the same order as the label list above and stop at the first hit.
    for name, code in codes:
        if label == name:
            return code
    return None

## Batch

In [None]:
# Work on the first 1000 training rows and show how many fall under each sentiment.
batch1 = twitter_train[:1000]
batch1.sentiment.value_counts()

In [None]:
# Encoded length of every text in the batch, as returned by tokenizer.encode.
# truncation=True makes the 512-token cap explicit instead of passing
# max_length alone. A comprehension is used so the notebook-level `tokens`
# (the token strings of the sample sentence) is no longer overwritten with ids.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in batch1.text
]

In [None]:
# Longest encoded length observed in the batch.
max(token_lens)

In [None]:
# NOTE(review): MAX_LEN is not referenced by any other visible cell; the padding
# step pads to the longest sequence in the batch instead. Confirm whether this
# constant is still needed.
MAX_LEN = 220

# Transform input to feature matrix

## Tokenize

In [None]:
# Encode every text in the batch to a list of token ids, special tokens included.
tokenized = batch1.text.apply(
    lambda text: tokenizer.encode(str(text), add_special_tokens=True)
)

## Pad for matrix ops

In [None]:
# Length of the longest id list in the batch (0 when the batch is empty).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len so the rows stack into one array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
padded.shape

## Mask padding

In [None]:
# 1 where `padded` holds a non-zero id, 0 where it holds the zero used for padding.
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

In [None]:
# Convert both arrays to torch tensors; `attention_mask` is rebound from the
# NumPy array to its tensor counterpart.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

## Run BERT Forward Pass (No Gradients)

In [None]:
%%time
# Per the original note, `output` is a
# transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions.
# Forward pass only: torch.no_grad() turns off gradient tracking for this call.
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

In [None]:
# Pull the two named fields off the model output for use in later cells.
last_hidden_state, pooled_output = output.last_hidden_state, output.pooler_output

In [None]:
# Use the final-layer vector at position 0 of every sequence (the [CLS] slot,
# since special tokens were added) as that example's feature vector.
# `last_hidden_state` is the name bound from the model output; indexing it with
# [:, 0, :] keeps the batch axis, so the result has one row per input text.
features = last_hidden_state[:, 0, :].numpy()

## Feature Matrix Generated

In [None]:
# Display the feature matrix extracted from the model output.
features

In [None]:
# Sentiment labels of the same batch1 rows whose texts were encoded.
labels = batch1.sentiment

# Use BERT Representations with LogisticRegression Softmax Classifier

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import dataset
import vsm
import sst

In [2]:
# Integer identifiers for the available datasets. Only TWITTER is used in the
# visible cells (it is passed to dataset.dataset_reader).
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [3]:
# Read the three splits for the TWITTER dataset, then pass each split through
# dataset.prune_columns(2, ...) and rebind the same three names to the results.
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(2, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [4]:
# Load the tokenizer and the plain BertModel encoder from the same checkpoint name.
bert_weights_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)
# Disabled alternative: load BertForSequenceClassification from the same checkpoint.
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)
# Distinct values found in the 'sentiment' column of the training split
twitter_sentiment_labels = twitter_train['sentiment'].unique()

In [5]:
def fit_softmax_classifier(X, y):
    """Fit a logistic-regression classifier on ``X`` / ``y`` and return it.

    The estimator is built with an intercept term, the liblinear solver and
    ``multi_class='ovr'``.

    NOTE(review): despite the function name, ``'ovr'`` configures a
    one-vs-rest scheme rather than a single softmax model.
    """
    classifier = LogisticRegression(
        solver='liblinear',
        multi_class='ovr',
        fit_intercept=True,
    )
    # LogisticRegression.fit returns the fitted estimator itself.
    return classifier.fit(X, y)

In [6]:
def hf_cls_phi(text):
    """Return the final-layer BERT vector above [CLS] for ``text``.

    The text is encoded with ``bert_tokenizer`` (special tokens included) and
    run through ``bert_model`` via the ``vsm`` helpers; the vector at position
    0 of the first sequence is moved to the CPU and returned as a NumPy array.
    """
    # Special tokens are requested so that position 0 holds [CLS].
    ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)

    # Representations from the last layer; expected shape is (1, n, 768)
    # for n tokens.
    reps = vsm.hf_represent(ids, bert_model, layer=-1)

    # First sequence in the batch, then its first position.
    first_sequence = reps[0]
    cls_vector = first_sequence[0]

    return cls_vector.cpu().numpy()

In [7]:
# NOTE(review): assuming these are pandas DataFrames, `.size` is rows * columns
# (total cell count), not the number of rows. Use len(df) or df.shape when row
# counts are wanted.
twitter_train.size, twitter_validate.size

(156831, 3000)

In [8]:
%%time
# Run sst.experiment with hf_cls_phi and fit_softmax_classifier on a 1500-row
# training slice, assessing on the first 1000 validation rows.
# NOTE(review): vectorize=False presumably signals that hf_cls_phi already
# returns vectors — confirm against sst.experiment.
bert_experiment1500 = sst.experiment(
    twitter_train[:1500],  # first 1500 training rows
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.316     0.209     0.252       172
    Negative      0.554     0.654     0.600       266
     Neutral      0.522     0.460     0.489       285
    Positive      0.536     0.621     0.575       277

    accuracy                          0.513      1000
   macro avg      0.482     0.486     0.479      1000
weighted avg      0.499     0.513     0.502      1000

CPU times: user 14min 46s, sys: 4.84 s, total: 14min 51s
Wall time: 2min 29s


In [9]:
%%time
# Same experiment setup, with the training slice doubled to 3000 rows; the
# assessment slice is unchanged.
bert_experiment3000 = sst.experiment(
    twitter_train[:3000],  # first 3000 training rows
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.405     0.285     0.334       172
    Negative      0.548     0.662     0.600       266
     Neutral      0.531     0.456     0.491       285
    Positive      0.543     0.614     0.576       277

    accuracy                          0.525      1000
   macro avg      0.507     0.504     0.500      1000
weighted avg      0.517     0.525     0.516      1000

CPU times: user 23min 21s, sys: 8.04 s, total: 23min 29s
Wall time: 3min 58s


In [10]:
%%time
# Same experiment setup on a 6000-row training slice.
# NOTE(review): the report printed for this cell totals 1000 support rows, so
# twitter_validate[:1500] appears to select the whole validation frame.
bert_experiment6000 = sst.experiment(
    twitter_train[:6000],  # first 6000 training rows
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1500]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.443     0.273     0.338       172
    Negative      0.571     0.714     0.634       266
     Neutral      0.559     0.467     0.509       285
    Positive      0.567     0.661     0.610       277

    accuracy                          0.553      1000
   macro avg      0.535     0.529     0.523      1000
weighted avg      0.544     0.553     0.541      1000

CPU times: user 40min 21s, sys: 13.5 s, total: 40min 34s
Wall time: 6min 53s


In [11]:
%%time
# Same experiment setup on a 12000-row training slice.
# NOTE(review): the report printed for this cell totals 1000 support rows, so
# twitter_validate[:2000] appears to select the whole validation frame.
bert_experiment12000 = sst.experiment(
    twitter_train[:12000],  # first 12000 training rows
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:2000]],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.520     0.302     0.382       172
    Negative      0.590     0.752     0.661       266
     Neutral      0.571     0.519     0.544       285
    Positive      0.596     0.650     0.622       277

    accuracy                          0.580      1000
   macro avg      0.569     0.556     0.552      1000
weighted avg      0.574     0.580     0.569      1000

CPU times: user 1h 14min 46s, sys: 25.6 s, total: 1h 15min 12s
Wall time: 12min 49s


In [12]:
%%time
# Same experiment setup on the full training and validation frames. The
# recorded wall time for this cell is about 53 minutes.
bert_experiment_full = sst.experiment(
    twitter_train,  # every training row
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate],
    vectorize=False)

              precision    recall  f1-score   support

  Irrelevant      0.556     0.262     0.356       172
    Negative      0.579     0.756     0.656       266
     Neutral      0.599     0.565     0.581       285
    Positive      0.607     0.664     0.634       277

    accuracy                          0.591      1000
   macro avg      0.585     0.562     0.557      1000
weighted avg      0.588     0.591     0.577      1000

CPU times: user 5h 6min 1s, sys: 1min 47s, total: 5h 7min 49s
Wall time: 52min 56s
