## Set-up imports

In [1]:
# !pip install transformers

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import BertModel, BertTokenizer

import torch
import logging
import warnings
warnings.filterwarnings('ignore')

## Download and import the data

In [3]:
# !wget https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv --no-check-certificate

In [4]:
# Relative path of a locally downloaded copy of the training file.
DATA_HOME = os.path.join("data", "train.tsv")

In [5]:
DATA_HOME

'data\\train.tsv'

In [6]:
# Read the tab-separated training file (no header row) straight from GitHub.
# To read a local copy instead, pass DATA_HOME in place of the URL.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    delimiter='\t',
    header=None,
)

In [7]:
df.shape

(6920, 2)

In [8]:
# For performance reasons, work with the first 2000 sentences only.
batch_1 = df.iloc[:2000]

In [9]:
batch_1.head(10)

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
5,campanella gets the tone just right funny in t...,1
6,a fan film that for the uninitiated plays bett...,0
7,"b art and berling are both superb , while hupp...",1
8,"a little less extreme than in the past , with ...",0
9,the film is strictly routine,0


We can ask pandas how many sentences are labeled "positive" (value 1) and how many are labeled "negative" (value 0).

In [10]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## Load the pre-trained BERT models

We are going to use both DistilBert and regular Bert.

In [11]:
# DistilBert
# Name of the pre-trained checkpoint; the tokenizer and the model are both loaded from it.
pretrained_weights = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
model = DistilBertModel.from_pretrained(pretrained_weights)

## Prepare the Dataset

### Option 1: Tokenize the sentences

First, we need to tokenize/encode the sentences in `batch_1`.

In [12]:
# Encode every sentence (column 0) into a list of token ids, special tokens included.
tokenized = batch_1[0].apply(lambda sentence: tokenizer.encode(sentence, add_special_tokens=True))

In [13]:
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

In [14]:
batch_1[0]

0       a stirring , funny and finally transporting re...
1       apparently reassembled from the cutting room f...
2       they presume their audience wo n't sit still f...
3       this is a visually stunning rumination on love...
4       jonathan parker 's bartleby should have been t...
                              ...                        
1995    too bland and fustily tasteful to be truly pru...
1996                           it does n't work as either
1997    this one aims for the toilet and scores a dire...
1998    in the name of an allegedly inspiring and easi...
1999    the movie is undone by a filmmaking methodolog...
Name: 0, Length: 2000, dtype: object

### Option 1: Padding

Here, we pad every tokenized sentence with zeros up to the length of the longest one.

In [15]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad each id list with 0 up to max_len so all rows have the same width.
padded = np.array([token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values])

In [16]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

### Option 1: Masking

In [17]:
# 1 where `padded` holds a non-zero id, 0 where it holds the padding value 0.
attention_mask = (padded != 0).astype(int)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

### Option 2: Batched tokenization, padding, and masking

In [18]:
# Tokenize, pad and build the attention masks for all sentences in one call.
# `padding="longest"` pads every row to the longest sentence in the batch.
# It replaces the deprecated `pad_to_max_length=True`, which on current
# transformers releases (with no `max_length` given) no longer pads to the
# longest sentence. The sentences are passed as a plain list of strings
# rather than a pandas Series.
batch_tokenized = tokenizer.batch_encode_plus(
    batch_1[0].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding="longest")

In [19]:
batch_tokenized.keys()

dict_keys(['input_ids', 'attention_mask'])

In [20]:
# Convert the tokenizer's ids and attention masks into NumPy arrays.
batched_input_ids, batched_attention_mask = (
    np.array(batch_tokenized[key]) for key in ('input_ids', 'attention_mask')
)

In [21]:
print(batched_input_ids.shape)
batched_input_ids

(2000, 59)


array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [22]:
print(batched_attention_mask.shape)
batched_attention_mask

(2000, 59)


array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

## Model #1: Get Sentence Embeddings via BERT

The `model()` call runs our sentences through DistilBERT.  Its output is stored in `last_hidden_states`.

In [23]:
# Build int64 tensors from the NumPy arrays.
# Note: `attention_mask` is rebound here from the NumPy array to a tensor.
input_ids = torch.tensor(padded, dtype=torch.int64)
attention_mask = torch.tensor(attention_mask, dtype=torch.int64)

In [24]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    # Despite its name, `last_hidden_states` holds the whole model output;
    # the next cell takes element [0] of it before slicing.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

Next, let's slice the output that we need for classification.  The way BERT does sentence classification is by adding a token called `[CLS]` (for classification) at the beginning of every sentence.  The output corresponding to that token can be thought of as an embedding for the entire sentence.

In [25]:
# Take element [0] of the model output and slice out the [CLS] vector of every sentence:
#   the first `:` corresponds to all sentences
#   the 0 after `:` means the first position, [CLS]
#   the second `:` means all hidden unit outputs
features = last_hidden_states[0][:,0,:]

The `features` variable above is our X.  The `labels` variable below is our Y.

In [26]:
features.shape

torch.Size([2000, 768])

In [27]:
# Column 1 holds the sentiment label of each sentence (1 = positive, 0 = negative).
labels = batch_1[1]

## Model #2: Logistic Regression for Classification

First, let's split our dataset into a training and testing set.

In [28]:
# Hold out a test set using the default train/test proportions. Fixing
# `random_state` makes the shuffle, and therefore the accuracy reported
# below, reproducible from one run to the next.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42)

In [29]:
train_features.shape

torch.Size([1500, 768])

In [30]:
# Fit a logistic-regression classifier with default settings on the training features.
lr_classifier = LogisticRegression()
# Left as the last expression so the fitted estimator is displayed as the cell output.
lr_classifier.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
lr_classifier.score(test_features, test_labels)

0.828

## Predictions

In [32]:
# Hand-written reviews to classify; the expected label precedes each one.
sentences = [
    # 1 (Positive)
    "This movie was really awesome!",
    # 0 (Negative)
    "What a complete waste of time, I want my money back.",
    # 1 (Positive)
    "I loved this movie.  It was great and I will watch it again",
    # 0 (Negative)
    "I did not like it.  It was the worst thing I have seen",
    # 1 (Positive)
    "What an amazing movie!  I thought it was going to be terrible, but it wasn't!",
    # 0 (Negative)
    "What a terrible movie!  I thought it was going to be amazing, but it wasn't!",
]

In [33]:
# Encode each example sentence into its own list of token ids, special tokens included.
sample = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in sentences
]

In [34]:
# Pad to the training width, widening it if one of the new sentences is longer.
# With the fixed `max_len` alone, a longer sentence would get a negative pad
# count ([0] * negative is []), leaving rows of unequal length that cannot
# form a 2-D array.
sample_len = max([max_len] + [len(value) for value in sample])
padded_sample = np.array([value + [0]*(sample_len - len(value)) for value in sample])

In [35]:
padded_sample.shape

(6, 59)

In [36]:
# 1 where `padded_sample` holds a non-zero id, 0 where it holds the padding value 0.
attention_mask_sample = (padded_sample != 0).astype(int)
attention_mask_sample

array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [37]:
# Int64 tensors for the example batch.
# Note: `attention_mask_sample` is rebound here from the NumPy array to a tensor.
input_id_sample = torch.tensor(padded_sample, dtype=torch.int64)
attention_mask_sample = torch.tensor(attention_mask_sample, dtype=torch.int64)

In [38]:
# Forward pass for the example sentences, with gradient tracking disabled.
with torch.no_grad():
    last_hidden_states_sample = model(input_id_sample, attention_mask=attention_mask_sample)

In [39]:
# Element [0] of the model output, sliced to position 0 of every sentence and all hidden units.
features_sample = last_hidden_states_sample[0][:,0,:]
features_sample.shape

torch.Size([6, 768])

In [40]:
lr_classifier.predict(features_sample)

array([1, 0, 1, 0, 1, 1], dtype=int64)

How good is this score?  What can we compare it against?  Let's use a dummy classifier as a baseline.

In [41]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features and draws labels at random in proportion
# to the training class frequencies. The strategy is spelled out because the
# DummyClassifier default is not the same in every scikit-learn release, and
# `random_state` makes the random draws repeatable.
dummy_classifier = DummyClassifier(strategy="stratified", random_state=42)

# Cross-validated accuracy of the baseline on the training split.
scores = cross_val_score(dummy_classifier, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.486 (+/- 0.04)


So our model clearly does better than a dummy classifier. But how does it compare against the best models?