In [4]:
import warnings

import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [5]:
# Load the SST-2 training file (tab-separated, no header row) and keep
# only the first 2000 rows for the rest of the notebook.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)
df1 = df.iloc[:2000]


In [8]:
# Class balance check: how many examples carry each sentiment label (column 1).
df1.loc[:, 1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [10]:
# DistilBERT setup: choose the checkpoint name, the model class and its
# matching tokenizer class.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# The tokenizer converts sentences to token ids; the model consumes those ids.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Converting the words of each sentence to tokens so that we can pass them to the DistilBERT model**

In [11]:
# Encode every sentence in column 0 as a list of token ids, letting the
# tokenizer add its special tokens.
tokenized = df1[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

### Padding
After tokenization, `tokenized` is a list of sentences -- each sentence is represented as a list of tokens. We want BERT to process our examples all at once (as one batch). It's just faster that way. For that reason, we need to pad all lists to the same size, so we can represent the input as one 2-d array, rather than a list of lists (of different lengths).

In [13]:
# Length (in tokens) of the longest tokenized sentence. Stored as `max_len`
# rather than `max` so the built-in max() is not shadowed for every later
# cell in the kernel. default=0 keeps the previous result (0) when there are
# no sentences at all.
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with 0s up to max_len so all rows have the same
# length and the whole batch fits in one 2-D array.
padded = np.array(
    [token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values]
)

In [14]:
# Shape of the padded id matrix: (number of sentences, padded length).
np.shape(padded)

(2000, 59)

### Masking
If we directly send `padded` to BERT, that would slightly confuse it. We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what attention_mask is:

In [15]:
# Build the attention mask element-wise: 1 wherever `padded` holds a
# non-zero id, 0 at the zero-padded positions.
attention_mask = (padded != 0).astype(int)

## DistilBert Model

**Now that our input is ready, let's pass it to our model**

In [17]:
# Wrap the padded ids and the mask as tensors. torch.as_tensor (instead of
# torch.tensor) keeps this cell safe to re-run: `attention_mask` is rebound
# from a NumPy array to a tensor here, and as_tensor hands back an existing
# tensor unchanged rather than copy-constructing from it.
input_ids = torch.as_tensor(padded)
attention_mask = torch.as_tensor(attention_mask)

# Run the sentences through the model; its output is kept in last_hidden_states.
# torch.no_grad() disables gradient tracking, which is not needed here and
# reduces memory consumption.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

## Slicing
The model output is a 3D tensor. We only want to pass the [CLS] vector of each sentence to the Logistic Regression algorithm, so we have to slice the output first.

In [18]:
# From the model's first output, keep only position 0 of every sequence
# (the [CLS] slot described above) as that sentence's feature vector.
features = last_hidden_states[0][:, 0]

In [19]:
# Target values: the sentiment label column (column 1) of the 2000-row subset.
labels = df1.loc[:, 1]


**Let's now split our dataset into a training set and a testing set**


In [20]:
# Split features/labels into train and test sets (scikit-learn's default
# test share is 25%). random_state is fixed so the split -- and every score
# computed from it -- is identical on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [22]:
# Create the logistic regression classifier with scikit-learn's default settings.
logistic_model = LogisticRegression()


In [23]:
# Train the classifier on the training features and their labels.
# fit() returns the estimator itself, which is why the cell displays its repr.
logistic_model.fit(train_features, train_labels)

LogisticRegression()

## Model Evaluation 

In [24]:
# Evaluate the trained classifier on the held-out test examples;
# score() reports its accuracy on them.

logistic_model.score(test_features,test_labels)

0.824

## Prediction

In [25]:
# Predicted labels for the held-out test features.
yhat = logistic_model.predict(test_features)

We can also find accuracy like this

In [27]:
# Accuracy computed explicitly from the predictions: the fraction of test
# labels that yhat gets right.
accuracy = accuracy_score(test_labels, yhat)

In [28]:
# Show the accuracy computed with accuracy_score.
print(accuracy)

0.824


In [32]:
# Inspect the held-out feature vectors that were fed to the classifier.
print(test_features)

tensor([[-0.0501,  0.0184, -0.1272,  ...,  0.0196,  0.3160,  0.4354],
        [ 0.2962,  0.0051,  0.0787,  ..., -0.0387,  0.6847,  0.2981],
        [ 0.0603, -0.0931,  0.2275,  ..., -0.1673,  0.4916,  0.2293],
        ...,
        [ 0.0354, -0.0766, -0.0865,  ..., -0.1761,  0.5859,  0.4691],
        [-0.2405, -0.1622,  0.1463,  ..., -0.1621,  0.2943,  0.2018],
        [ 0.2656, -0.0951, -0.2131,  ..., -0.1104,  0.6936,  0.2990]])


In [33]:
# Inspect the predicted label for each test example.
print(yhat)

[0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 0
 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0
 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 0 0
 0 1 1 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0
 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0
 1 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1
 1 1 0 1 0 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 0
 0 0 1 0 1 1 0 0 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1
 1 1 0 1 0 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 0 1
 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 1 0 1 1 1 0
 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0
 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 1
 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0
 0 1 1 1 1 1 0 1 1 1 0 0 