In [7]:
import warnings

import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [6]:
# Tab-separated file without a header row, so columns get integer names.
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

In [7]:
# Take the first 64 rows as the working batch.
# `.iloc` makes the positional slice explicit, and `.copy()` detaches the batch
# from `df` so that relabelling its columns later acts on an independent frame
# rather than on a slice that still shares data with the parent.
batch_1 = df.iloc[:64].copy()

In [8]:
# Give the two unnamed columns readable names, then show how many rows
# carry each label value.
batch_1.columns = ['sentence', 'label']
batch_1.loc[:, 'label'].value_counts()

1    39
0    25
Name: label, dtype: int64

In [9]:
# Preview the first five rows of the batch.
batch_1.head(n=5)

Unnamed: 0,sentence,label
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [10]:
# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
# Model selection — DistilBERT by default.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'
## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the pretrained tokenizer and model, then move the model
# onto the device chosen earlier.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
model = model.to(device)

In [13]:
def _encode_with_special_tokens(sentence):
    """Return the token ids for one sentence, with the tokenizer's special tokens added."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# One list of token ids per sentence in the batch.
tokenized = batch_1['sentence'].apply(_encode_with_special_tokens)

In [14]:
# Show the token-id lists of the first three sentences.
print(tokenized.head(3))

0    [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1    [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2    [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
Name: sentence, dtype: object


In [15]:
# Find the longest tokenized sentence, printing each new running maximum.
max_len = 0
for token_ids in tokenized.values:
    n_tokens = len(token_ids)
    if n_tokens > max_len:
        print(n_tokens)
        max_len = n_tokens

# Right-pad every id list with zeros up to max_len so the rows stack into
# a rectangular array.
padded_rows = []
for token_ids in tokenized.values:
    n_missing = max_len - len(token_ids)
    padded_rows.append(token_ids + [0] * n_missing)
padded = np.array(padded_rows)

20
45
54


In [16]:
# Inspect the zero-padded id matrix (one row per sentence).
print(padded)

[[  101  1037 18385 ...     0     0     0]
 [  101  4593  2128 ...     0     0     0]
 [  101  2027  3653 ...     0     0     0]
 ...
 [  101  1037 19240 ...     0     0     0]
 [  101  2009  1005 ...     0     0     0]
 [  101  1996  6919 ... 11008  3436   102]]


In [17]:
# 1 where a position holds a non-zero id, 0 where it holds the zero padding.
is_real_token = padded != 0
attention_mask = is_real_token.astype(int)
attention_mask.shape

(64, 54)

In [18]:
# Inspect the 0/1 mask built from the padded id matrix.
print(attention_mask)

[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 1 1 1]]


In [20]:
# Convert the padded ids and their mask to tensors on the selected device.
input_ids = torch.LongTensor(padded)
input_ids = input_ids.to(device)
attention_mask = torch.tensor(attention_mask, device=device)

# Run the model without tracking gradients.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [23]:
# First element of the model output, sliced at sequence position 0 for every
# sentence, then moved to the CPU and converted to a NumPy array.
# NOTE(review): position 0 is presumably the [CLS] token, since the sentences
# were encoded with add_special_tokens=True — confirm.
hidden_states = last_hidden_states[0]
first_position = hidden_states[:, 0, :]
features = first_position.cpu().numpy()

In [25]:
# Target values: the 'label' column of the batch.
labels = batch_1.loc[:, 'label']

In [31]:
# Sanity-check the sizes of the model output, the extracted features and the
# labels (one value per line).
print(
    len(last_hidden_states),
    last_hidden_states[0].shape,
    features.shape,
    labels.shape,
    sep='\n',
)

1
torch.Size([64, 54, 768])
(64, 768)
(64,)


In [30]:
# Hold out part of the batch for evaluation.
# The shuffle is seeded so the same rows land in train/test on every run;
# without a fixed random_state the split (and every score computed from it)
# changes each time the cell is executed.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [32]:
# Fit a logistic regression with default settings on the training features.
# The fit call stays as the last expression so the cell still displays the
# fitted estimator.
lr_clf = LogisticRegression()
lr_clf.fit(X=train_features, y=train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# Baseline that ignores the features, cross-validated on the training split.
# The strategy is named explicitly because DummyClassifier's default differs
# between scikit-learn releases ('stratified' in older ones, 'prior' in newer
# ones), and random_state is fixed because 'stratified' draws random predictions.
# NOTE(review): 'stratified' is assumed to be what the recorded run used — confirm.
# DummyClassifier is imported in the notebook's import cell.
clf = DummyClassifier(strategy="stratified", random_state=42)

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.578 (+/- 0.29)
