In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [21]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [22]:
batch_1 = df
batch_1.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


1 is POSITIVE, 0 is NEGATIVE

In [23]:
batch_1[1].value_counts()

1
1    3610
0    3310
Name: count, dtype: int64

# Pre-trained DistilBERT

In [24]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [25]:
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
print(tokenized[0])
print

[101,
 1037,
 18385,
 1010,
 6057,
 1998,
 2633,
 18276,
 2128,
 16603,
 1997,
 5053,
 1998,
 1996,
 6841,
 1998,
 5687,
 5469,
 3152,
 102]

# Padding
Pad all lists to the same size, so we can represent the input as one 2-d array, rather than a list of lists (of different lengths). Fill empty positions with 0s

In [26]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [27]:
np.array(padded).shape

(6920, 67)

# Masking
BERT will pay attention to position that has 1s

In [28]:

attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(6920, 67)

In [29]:
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], shape=(6920, 67))

In [30]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [31]:
last_hidden_states[0]

tensor([[[-0.2159, -0.1403,  0.0083,  ..., -0.1369,  0.5867,  0.2011],
         [-0.2471,  0.2468,  0.1008,  ..., -0.1631,  0.9349, -0.0715],
         [ 0.0558,  0.3573,  0.4140,  ..., -0.2430,  0.1770, -0.5080],
         ...,
         [ 0.1864,  0.0193,  0.1864,  ..., -0.2175,  0.1604, -0.4050],
         [-0.1004,  0.0651,  0.1240,  ..., -0.1649,  0.3568,  0.1218],
         [-0.0114,  0.3297,  0.2317,  ..., -0.2362,  0.4217,  0.0895]],

        [[-0.1726, -0.1448,  0.0022,  ..., -0.1744,  0.2139,  0.3720],
         [ 0.0022,  0.1684,  0.1269,  ..., -0.1888, -0.0195, -0.0283],
         [ 0.0257, -0.2458,  0.0717,  ..., -0.4339,  0.1622,  0.0133],
         ...,
         [ 0.0466,  0.0850,  0.1801,  ..., -0.0279,  0.1878,  0.4022],
         [-0.2325,  0.0746,  0.1298,  ..., -0.1292,  0.0904,  0.3647],
         [-0.0655, -0.2214,  0.1827,  ..., -0.1624,  0.1421,  0.0963]],

        [[-0.0506,  0.0720, -0.0296,  ..., -0.0715,  0.7185,  0.2623],
         [ 0.0536,  0.3136, -0.0598,  ...,  0

In [32]:
features = last_hidden_states[0][:,0,:].numpy()
features

array([[-0.21593426, -0.14028914,  0.00831078, ..., -0.13694835,
         0.58670044,  0.20112717],
       [-0.1726272 , -0.14476168,  0.00223421, ..., -0.17442545,
         0.21386449,  0.3719747 ],
       [-0.05063365,  0.07203963, -0.0295973 , ..., -0.07148956,
         0.71852404,  0.26225466],
       ...,
       [-0.06550958, -0.05184741, -0.14094464, ..., -0.06450666,
         0.6022301 ,  0.2134787 ],
       [-0.08523131, -0.04869819, -0.08137521, ..., -0.1358936 ,
         0.39505625,  0.22889708],
       [-0.2943683 , -0.09234673, -0.00831665, ..., -0.05159113,
         0.43497837,  0.28891587]], shape=(6920, 768), dtype=float32)

In [33]:
labels = batch_1[1]

# Train / Test Split

In [34]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)


In [35]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

# Evaluate

In [36]:
lr_clf.score(test_features, test_labels)


0.8491329479768787