<a href="https://colab.research.google.com/github/MaQuest/Summer2021/blob/main/INPUT_TO_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from transformers import BertModel, BertTokenizer
import pandas as pd
import numpy as np
import random
import torch

In [None]:

# Each entry is a (model class, tokenizer class, pretrained weights name) triple;
# the loading cell below iterates over this list.
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased')]

We use the `bert-base-uncased` model.


# LOADING THE PRETRAINED BERT MODEL AND ITS TOKENIZER


In [None]:
# Load the pretrained tokenizer and model for every entry of MODELS.
# After the loop, `tokenizer` and `bert_model` refer to the last entry
# (MODELS has a single entry here).
for model_class, tokenizer_class, pretrained_weights in MODELS:
    
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# READING OUR SST-2 (STANFORD SENTIMENT TREEBANK) DATA

In [None]:

# Tab-separated file without a header row: column 0 is the sentence,
# column 1 is the sentiment label.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)

# USING 2000 SENTENCES FOR FASTER PROCESSING

In [None]:

# Keep only the first 2000 rows so the BERT forward pass stays fast.
batch = df.iloc[:2000]

# TOKENIZING AND PADDING OUR DATA

In [None]:
def tokenize_cut_pad(df):
    """Tokenize the sentences in column 0 of ``df`` and pad them to equal length.

    Uses the notebook-level ``tokenizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds the raw sentences. It is not modified.

    Returns
    -------
    torch.Tensor
        Integer tensor with one row per sentence. Every row holds the token
        ids of that sentence (special tokens included), right-padded with the
        tokenizer's padding id up to the length of the longest sentence.
    """
    # Ask the loaded tokenizer for its own limit instead of looking it up
    # under a hard-coded checkpoint name.
    max_input_size = tokenizer.model_max_length

    # Truncate on tokens (special tokens included), not on characters, so a
    # long sentence keeps as much text as the model can actually accept.
    tokenized = [
        tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=max_input_size,
            truncation=True,
        )
        for text in df[0].values
    ]

    # Fall back to 0 when the tokenizer does not define a padding token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    # Length of the longest tokenized sentence (0 for an empty frame).
    max_len = max((len(ids) for ids in tokenized), default=0)

    padded = [ids + [pad_id] * (max_len - len(ids)) for ids in tokenized]

    # The reshape is a no-op for non-empty input; for an empty frame it still
    # yields a 2-D integer tensor.
    return torch.tensor(padded, dtype=torch.long).reshape(len(padded), max_len)

# Get the BERT embedding of the [CLS] token for each example

In [None]:
# Padded token-id tensor for the selected sentences, one row per sentence.
input_ids = tokenize_cut_pad(batch)


In [None]:
# Mark the padding positions so BERT does not attend to them; without the mask
# the [CLS] vector of a sentence depends on how much padding it received.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
attention_mask = (input_ids != pad_id).long()

# Inference only: no gradients are needed to extract features.
with torch.no_grad():
    last_hidden_states = bert_model(input_ids, attention_mask=attention_mask)[0]

# STORING THE [CLS] VECTOR OF THE LAST HIDDEN STATE IN VARIABLE FEATURES

In [None]:
# Position 0 of every sequence is the first token added by the tokenizer;
# its hidden vector is used as the feature vector of the sentence.
features = last_hidden_states[:, 0].numpy()

# STORING THE SENTIMENT OF EACH SENTENCE (0 OR 1) IN LABELS

In [None]:

# Column 1 of the selected rows is used as the classification target.
labels = batch[1]

In [None]:
# Fix the seed so the split (and therefore the reported accuracy) is the same
# on every run; the default 75/25 proportions are kept.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

# DEFAULT SPLIT TO 75-25%

In [None]:

# Show the size of the training and test feature matrices, one per line.
print(train_features.shape, test_features.shape, sep="\n")

(1500, 768)
(500, 768)


# INITIALIZING AND TRAINING OUR LOGISTIC REGRESSION MODEL

In [None]:

# With the default limit of 100 iterations lbfgs stops before converging on
# these features, so allow it more iterations.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# TESTING OUR MODEL

In [None]:
# Score of the fitted classifier on the held-out test split.
model.score(test_features, test_labels)

0.792

# PREDICTION OF MODEL

In [None]:
def prediction(text, a=None):
    """Classify the sentiment of a single sentence.

    Uses the notebook-level ``tokenizer``, ``bert_model`` and the fitted
    classifier ``model``.

    Parameters
    ----------
    text : str
        Sentence to classify.
    a : object, optional
        Ignored. Kept only so that existing two-argument calls keep working.

    Returns
    -------
    tuple
        ``(message, label)`` where ``label`` is the first element returned by
        ``model.predict`` and ``message`` says "positive" when the label is 1
        and "negative" otherwise.
    """
    # Truncate to the tokenizer's limit so that overly long input does not
    # fail inside the model.
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=tokenizer.model_max_length,
        truncation=True,
    )

    # Add the batch dimension the model expects.
    test_input_ids = torch.tensor(token_ids).unsqueeze(0)

    with torch.no_grad():
        hidden_states = bert_model(test_input_ids)[0]

    # The hidden vector at position 0 is the sentence feature vector.
    sentence_features = hidden_states[:, 0, :].numpy()
    pred = model.predict(sentence_features)[0]

    if pred == 1:
        return "This is a positive statement", pred
    return "This is a negative statement", pred

# INPUT SENTENCE FROM USER

In [None]:
# Read one sentence from the user.
string = input()

# Placeholder passed as the second argument of prediction().
t = 0.0

# Score of the classifier on the held-out test split.
score = model.score(test_features, test_labels)

# Classify the sentence and report both the message and the raw label.
sentiment, pred = prediction(string, t)

print("".join([sentiment, " ", " with sentiment label :", str(pred), " "]))

don't be a karen
This is a negative statement  with sentiment label :0 
