## Set-up imports

In [82]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import BertModel, BertTokenizer

import torch
import logging
import warnings
warnings.filterwarnings('ignore')

## Download and import the data

In [83]:
# !wget https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv --no-check-certificate

In [84]:
# Relative path to the training TSV; os.path.join picks the host OS separator.
DATA_HOME = os.path.join("data", "train.tsv")

In [85]:
DATA_HOME

'data\\train.tsv'

In [86]:
# The file is tab-separated and has no header row, so pandas numbers the columns.
df = pd.read_csv(DATA_HOME, sep="\t", header=None)

In [87]:
df.shape

(6920, 2)

In [88]:
# Work on the first 2000 rows only, to keep the rest of the notebook fast.
batch_1 = df.iloc[:2000]

In [89]:
batch_1.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


We can ask pandas how many sentences are labeled "positive" (value 1) and how many are labeled "negative" (value 0).

In [90]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## Load the pre-trained BERT models

We are going to use both DistilBERT and regular BERT. (The regular BERT cell below is commented out; uncomment it to load that model as well.)

In [91]:
# DistilBert
# Checkpoint name shared by the tokenizer and the model so the two always match.
pretrained_weights = 'distilbert-base-uncased'
# Tokenizer for this checkpoint; used below to turn sentences into token ids.
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
# Pre-trained DistilBERT weights; used below for the forward pass.
model = DistilBertModel.from_pretrained(pretrained_weights)

In [92]:
# # BERT
# bert_pretrained_weights = 'bert-base-uncased'
# bert_tokenizer = BertTokenizer.from_pretrained(bert_pretrained_weights)
# bert_model = BertModel.from_pretrained(bert_pretrained_weights)

## Prepare the Dataset

### Option 1: Tokenize the sentences

First, we need to tokenize/encode the sentences in `batch_1` (the 2,000-sentence subset of `df`).

In [93]:
# Encode every sentence (column 0) into a list of token ids; the tokenizer
# inserts its special tokens because add_special_tokens=True.
tokenized = batch_1[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [94]:
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

### Option 1: Padding

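Here, we pad every tokenized sentence with zeros up to the length of the longest sentence, so that all rows have the same length and can be stacked into a single array.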

In [95]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad each id list with zeros up to max_len so every row has equal length.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [96]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

### Option 1: Masking

In [97]:
# 1 where `padded` holds a non-zero id, 0 where it holds the zero used for padding.
attention_mask = (padded != 0).astype(int)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

### Option 2: Batched tokenization, padding, and masking

In [98]:
# Single call that tokenizes every sentence, pads them, and also returns the
# matching attention mask (requested via return_attention_mask=True).
# NOTE(review): `pad_to_max_length` is deprecated in newer transformers releases
# in favour of `padding=...` — confirm against the installed version before changing.
batch_tokenized = tokenizer.batch_encode_plus(
    batch_1[0], 
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True)

In [99]:
batch_tokenized.keys()

dict_keys(['input_ids', 'attention_mask'])

In [100]:
# Convert the tokenizer's nested lists into numpy arrays, one per returned key.
batched_input_ids, batched_attention_mask = (
    np.array(batch_tokenized[key]) for key in ('input_ids', 'attention_mask')
)

In [101]:
print(batched_input_ids.shape)
batched_input_ids

(2000, 59)


array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [102]:
print(batched_attention_mask.shape)
batched_attention_mask

(2000, 59)


array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

## Model #1: Deep Learning!

Calling `model(...)` runs our padded sentences through DistilBERT. Whatever the model returns is stored in `last_hidden_states`.

In [108]:
# Build int64 tensors for the model from the manually padded ids and their mask.
# Note that `attention_mask` is rebound here from a numpy array to a tensor.
input_ids = torch.tensor(padded, dtype=torch.int64)
attention_mask = torch.tensor(attention_mask, dtype=torch.int64)

In [109]:
# Inference only: no_grad() switches off gradient tracking for the forward pass.
with torch.no_grad():
    # Every sentence in `input_ids` goes through the model in this one call.
    # NOTE(review): despite its name, `last_hidden_states` holds whatever model(...)
    # returns — presumably an output tuple/object rather than a bare tensor; confirm.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)