In [48]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import dataset
import vsm
import sst

In [2]:
# Integer ids used to select a corpus. TWITTER is the one passed to
# dataset.dataset_reader in the cells shown here; the other two are not
# referenced in the visible cells.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [3]:
# Read the three splits of the Twitter corpus, then pass each split through
# dataset.prune_columns with the same first argument (2).
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(2, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [4]:
# Distinct values of the 'sentiment' column in the training split.
twitter_sentiment_labels = twitter_train['sentiment'].unique()

## Pre-trained BERT

In [5]:
# Load the tokenizer and the BERT encoder from the same checkpoint name.
bert_weights_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
model = BertModel.from_pretrained(bert_weights_name)
# Disabled alternative: load the same checkpoint through
# BertForSequenceClassification instead of BertModel.
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)

In [6]:
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [7]:
# Split the sample sentence into tokens, then map each token to its vocabulary
# id. As the printed output shows, no [CLS]/[SEP] tokens are added at this
# stage.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [8]:
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: When was I last outside? I am stuck at home for 2 weeks.
   Tokens: ['When', 'was', 'I', 'last', 'outside', '?', 'I', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.']
Token IDs: [1332, 1108, 146, 1314, 1796, 136, 146, 1821, 5342, 1120, 1313, 1111, 123, 2277, 119]


In [9]:
# Encode the sample sentence into fixed-length (32) model inputs.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  truncation=True,  # explicit, so the tokenizer no longer warns about implicit truncation
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
encoding['input_ids']

tensor([[ 101, 1332, 1108,  146, 1314, 1796,  136,  146, 1821, 5342, 1120, 1313,
         1111,  123, 2277,  119,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])

In [11]:
encoding['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'When',
 'was',
 'I',
 'last',
 'outside',
 '?',
 'I',
 'am',
 'stuck',
 'at',
 'home',
 'for',
 '2',
 'weeks',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [13]:
def label_to_num(label):
    """Translate a sentiment label into its integer code.

    "Positive" -> 1, "Neutral" -> 2, "Negative" -> 3, "Irrelevant" -> 4.
    Any other value yields None.
    """
    codes = (
        ("Positive", 1),
        ("Neutral", 2),
        ("Negative", 3),
        ("Irrelevant", 4),
    )
    # Compare with == against each known label in turn, so matching follows
    # plain equality.
    for name, code in codes:
        if label == name:
            return code
    return None

## Batch

In [14]:
# Work on the first 1000 training rows only, and show how many rows carry
# each sentiment label.
batch1 = twitter_train[:1000]
batch1.sentiment.value_counts()

Negative      314
Positive      287
Neutral       227
Irrelevant    172
Name: sentiment, dtype: int64

In [15]:
# Length in token ids of every text in the batch, used to pick a padding
# length. truncation=True makes the 512-token cap explicit instead of relying
# on the tokenizer's implicit default (which emits a warning), and the
# comprehension avoids rebinding the global `tokens` from the earlier demo.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in batch1.text
]

In [16]:
max(token_lens)

176

In [17]:
# NOTE(review): MAX_LEN is not referenced by any cell shown here; the padding
# step pads to the longest tokenized sequence (max_len) instead.
MAX_LEN = 220

# Transform input to feature matrix

## Tokenize

In [18]:
# Convert each text to a list of vocabulary ids, special tokens included.
# The 512-token cap matches the one used when measuring token lengths; without
# it a longer text would exceed BERT's position limit in the forward pass.
tokenized = batch1.text.apply(
    lambda x: tokenizer.encode(
        str(x),
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )
)

## Pad for matrix ops

In [19]:
# Right-pad every id list with 0 up to the length of the longest one so the
# batch forms a rectangular matrix.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
padded.shape

(1000, 176)

## Mask padding

In [20]:
# 1 wherever the id is non-zero (a real token), 0 wherever it is the 0 padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(1000, 176)

In [21]:
# Convert the numpy matrices to torch tensors for the model. Note that
# attention_mask is rebound here from a numpy array to a tensor.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

## Run BERT Forward Pass (inference only, no gradient descent)

In [24]:
%%time
# One forward pass over the whole batch with autograd disabled (inference
# only; no weights are updated). `output` is a
# transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions.
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

CPU times: user 12min 38s, sys: 1min 31s, total: 14min 10s
Wall time: 2min 21s


In [25]:
# Pull the two fields out of the model output. pooled_output is not used by
# any cell shown here.
last_hidden_state, pooled_output = output.last_hidden_state, output.pooler_output

In [26]:
# Use the final-layer hidden state at position 0 (the [CLS] token) of every
# sequence as its feature vector. `last_hidden_state` is the tensor unpacked
# from `output`, expected to be (batch, seq_len, hidden) -- so it is indexed
# directly. The previous name `last_hidden_states` is not defined in any cell
# shown here and raised NameError on a fresh kernel.
features = last_hidden_state[:, 0, :].numpy()

## Feature Matrix Generated

In [27]:
features

array([[ 4.7202411e-01,  1.2907133e-01,  4.1557610e-02, ...,
        -3.1512013e-01,  5.0587171e-01,  2.4475679e-01],
       [ 1.7508554e-01,  3.2480410e-01, -1.2354153e-01, ...,
        -1.5554804e-01,  2.7715233e-01, -4.4363603e-02],
       [ 3.9038855e-01,  2.0172113e-01,  3.7404778e-04, ...,
        -1.3712682e-01,  1.7696221e-01,  6.4126499e-02],
       ...,
       [ 4.5027325e-01,  3.6462277e-01,  3.2491732e-01, ...,
        -1.6248864e-01,  3.1678092e-01, -2.2206199e-01],
       [ 5.1715469e-01,  2.0905435e-02, -5.8483168e-02, ...,
        -3.0024618e-01,  2.7116129e-01, -6.5983713e-02],
       [ 4.5719910e-01,  3.0957070e-01, -2.1945217e-01, ...,
        -2.2497639e-01,  5.7954282e-01,  8.1879333e-02]], dtype=float32)

In [29]:
labels = batch1.sentiment

# Use BERT Representations with LogisticRegression Softmax Classifier

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import dataset
import vsm
import sst

In [2]:
# Integer ids used to select a corpus. TWITTER is the one passed to
# dataset.dataset_reader in the cells shown here; the other two are not
# referenced in the visible cells.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [3]:
# Read the three splits of the Twitter corpus, then pass each split through
# dataset.prune_columns with the same first argument (2).
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(2, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [4]:
# Load the tokenizer and the BERT encoder from the same checkpoint name.
# These two globals are read by hf_cls_phi.
bert_weights_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)
# Disabled alternative: load the same checkpoint through
# BertForSequenceClassification instead of BertModel.
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)
# Distinct values of the 'sentiment' column in the training split.
twitter_sentiment_labels = twitter_train['sentiment'].unique()

In [5]:
def fit_softmax_classifier(X, y):
    """Fit a logistic-regression classifier on (X, y) and return it.

    X and y are handed unchanged to `LogisticRegression.fit`. Note that the
    estimator is configured with `multi_class='ovr'` and the liblinear
    solver, i.e. one-vs-rest rather than a single multinomial softmax,
    despite the function's name.
    """
    classifier = LogisticRegression(
        multi_class='ovr',
        solver='liblinear',
        fit_intercept=True,
    )
    classifier.fit(X, y)
    return classifier

In [6]:
def hf_cls_phi(text):
    """Return the BERT representation above [CLS] for `text` as a numpy array.

    The text is encoded through `vsm.hf_encode` with the module-level
    `bert_tokenizer` (special tokens included), then represented through
    `vsm.hf_represent` with the module-level `bert_model` and `layer=-1`.
    """
    ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)
    reps = vsm.hf_represent(ids, bert_model, layer=-1)

    # `reps` should have shape (1, n, 768), n being the number of tokens.
    # The first [0] picks the single sequence, the second [0] picks the
    # position of [CLS].
    cls_vector = reps[0][0]

    # Move to CPU and convert so callers can work with a plain numpy vector.
    return cls_vector.cpu().numpy()

In [None]:
%%time
# Run the experiment on the training split with hf_cls_phi as the feature
# function and fit_softmax_classifier as the training function, assessing on
# the validation split.
# NOTE(review): vectorize=False presumably tells sst.experiment that
# hf_cls_phi already returns dense vectors -- confirm against sst.experiment.
bert_experiment = sst.experiment(
    twitter_train,
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate],
    vectorize=False)