### Bert Tutorial
https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

In [1]:
print('Bert Tutorial')

Bert Tutorial


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')



In [3]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [4]:
batch_1 = df[:2000]
batch_1[1].value_counts()

batch_1

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
1995,too bland and fustily tasteful to be truly pru...,0
1996,it does n't work as either,0
1997,this one aims for the toilet and scores a dire...,0
1998,in the name of an allegedly inspiring and easi...,0


In [5]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [6]:
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

In [7]:
max_len = max(tokenized.apply((lambda x: len(x))))
print(f'max_len: {max_len}')

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
padded

max_len: 59


array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [8]:
np.array(padded).shape

(2000, 59)

In [9]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [10]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [11]:
input_ids

tensor([[  101,  1037, 18385,  ...,     0,     0,     0],
        [  101,  4593,  2128,  ...,     0,     0,     0],
        [  101,  2027,  3653,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  2028,  ...,     0,     0,     0],
        [  101,  1999,  1996,  ...,     0,     0,     0],
        [  101,  1996,  3185,  ...,     0,     0,     0]])

In [12]:
last_hidden_states[0]

tensor([[[-0.2159, -0.1403,  0.0083,  ..., -0.1369,  0.5867,  0.2011],
         [-0.2471,  0.2468,  0.1008,  ..., -0.1631,  0.9349, -0.0715],
         [ 0.0558,  0.3573,  0.4140,  ..., -0.2430,  0.1770, -0.5080],
         ...,
         [-0.0165,  0.1179,  0.3512,  ..., -0.2401,  0.2722, -0.1750],
         [ 0.0961,  0.0667,  0.3147,  ..., -0.3277,  0.3556, -0.2135],
         [ 0.0454,  0.0519,  0.3168,  ..., -0.2880,  0.1844, -0.1042]],

        [[-0.1726, -0.1448,  0.0022,  ..., -0.1744,  0.2139,  0.3720],
         [ 0.0022,  0.1684,  0.1269,  ..., -0.1888, -0.0195, -0.0283],
         [ 0.0257, -0.2458,  0.0717,  ..., -0.4339,  0.1622,  0.0133],
         ...,
         [ 0.0505, -0.0493,  0.0463,  ..., -0.0448, -0.0540,  0.3136],
         [-0.2128, -0.1907, -0.0215,  ...,  0.0139, -0.2433, -0.0202],
         [-0.1310, -0.1693,  0.1019,  ..., -0.0859, -0.1770, -0.0872]],

        [[-0.0506,  0.0720, -0.0296,  ..., -0.0715,  0.7185,  0.2623],
         [ 0.0536,  0.3136, -0.0598,  ...,  0

In [13]:
features = last_hidden_states[0][:,0,:].numpy()
features

array([[-0.21593429, -0.14028901,  0.00831076, ..., -0.13694839,
         0.5867005 ,  0.20112702],
       [-0.17262708, -0.14476173,  0.00223445, ..., -0.17442553,
         0.21386455,  0.37197486],
       [-0.05063363,  0.07203954, -0.02959726, ..., -0.07148956,
         0.7185241 ,  0.26225474],
       ...,
       [-0.2782979 , -0.24803601,  0.13585806, ..., -0.19039169,
         0.1309957 ,  0.3497835 ],
       [-0.03667723,  0.10638562, -0.0111102 , ..., -0.1120664 ,
         0.4161947 ,  0.5033798 ],
       [ 0.12402631,  0.01425166,  0.01038423, ..., -0.11606556,
         0.53459144,  0.27495325]], dtype=float32)

In [14]:
labels = batch_1[1]
labels

0       1
1       0
2       0
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, Length: 2000, dtype: int64

In [15]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [16]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [17]:
lr_clf.score(test_features, test_labels)

0.812

In [18]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.528 (+/- 0.00)


In [20]:
new_sentences = ["I love this movie.", "It was not good."]
lr_clf.predict(new_sentences)

ValueError: Expected 2D array, got 1D array instead:
array=['I love this movie.' 'It was not good.'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.