In [1]:
import pandas as pd
from tqdm import tqdm
import time
import torch
import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU so that
# `device` is always defined for the cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device type : {device}")

Device type : cuda


In [3]:
from transformers import BertModel, BertTokenizer

- BERT is an encoder-only model that learns bidirectionally: it attends to both the left and the right context at the same time.
- It is pre-trained with a Masked Language Modeling (MLM) objective.
- A word's meaning is determined from the words on both its left-hand and right-hand sides; this captures a more comprehensive and deeper meaning.
- Some use cases: QA, sentiment analysis, sentence pair classification (finding sentence similarity), NER, POS tagging

In [4]:
# Load the encoder weights in half precision and move them to the selected device.
model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16).to(device)
# A tokenizer holds no weights, so it does not take a dtype argument.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [5]:
model # Display the module tree: embeddings followed by an encoder stack; there is no decoder section

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
# Example sentence that is tokenized and embedded step by step in the following cells.
sentence = "Hi I am working Machine Learning Deep Learning and Natural Language Processing"

In [7]:
# Split the sentence into vocabulary tokens. `tokenize` does not add the special
# [CLS]/[SEP] tokens (they are added manually in the next cell), and the output
# below shows that this uncased tokenizer lower-cases the text.
tokens = tokenizer.tokenize(sentence)
tokens

['hi',
 'i',
 'am',
 'working',
 'machine',
 'learning',
 'deep',
 'learning',
 'and',
 'natural',
 'language',
 'processing']

In [8]:
# Wrap the sequence in the special start ([CLS]) and end ([SEP]) markers.
tokens = ['[CLS]', *tokens, '[SEP]']
print(tokens)
print(f"len(tokens): {len(tokens)}")

['[CLS]', 'hi', 'i', 'am', 'working', 'machine', 'learning', 'deep', 'learning', 'and', 'natural', 'language', 'processing', '[SEP]']
len(tokens): 14


In [9]:
# Smallest power of two (never below 2) that is >= the number of tokens;
# used as the padded sequence length.
two_power = max(2, 1 << (len(tokens) - 1).bit_length())
print(f"two_power: {two_power}")

two_power: 16


In [10]:
# Right-pad in place with [PAD] up to the target length (no-op if already long enough).
tokens.extend(['[PAD]'] * (two_power - len(tokens)))
print(f"tokens : {tokens}")

tokens : ['[CLS]', 'hi', 'i', 'am', 'working', 'machine', 'learning', 'deep', 'learning', 'and', 'natural', 'language', 'processing', '[SEP]', '[PAD]', '[PAD]']


In [11]:
# Build the attention mask and the token ids for the given text

In [12]:
# 1 marks a real token, 0 marks padding that the model should ignore.
attention_mask = [int(token != '[PAD]') for token in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [13]:
# Map every token string to its integer id in the tokenizer vocabulary
# (in the output below: [CLS] -> 101, [SEP] -> 102, [PAD] -> 0).
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"token_ids: {token_ids}")

token_ids: [101, 7632, 1045, 2572, 2551, 3698, 4083, 2784, 4083, 1998, 3019, 2653, 6364, 102, 0, 0]


In [14]:
# Add a leading batch dimension to both lists so they can be fed to the PyTorch model

In [15]:
# Convert both lists to tensors and prepend a batch axis: shape (seq_len,) -> (1, seq_len).
token_ids = torch.unsqueeze(torch.tensor(token_ids), dim=0)
attention_mask = torch.unsqueeze(torch.tensor(attention_mask), dim=0)

<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 60%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h2 style="padding: 10px; color: white;">Getting the embedding</h2>
</div>

In [16]:
# Inference only: skip autograd bookkeeping, and move the inputs to whichever
# device the model lives on instead of hard-coding CUDA.
with torch.no_grad():
    embedding = model(
        token_ids.to(model.device),
        attention_mask=attention_mask.to(model.device),
    )
print(f"The Embedding which is generated from bert model:\n {embedding}")

The Embedding which is generated from bert model:
 BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0384,  0.3845, -0.1342,  ..., -0.2157,  0.2859,  0.4897],
         [ 0.6211,  0.2075,  0.0930,  ..., -0.4775,  0.8149, -0.1230],
         [ 0.4028,  0.2793, -0.1249,  ..., -0.3489, -0.1194, -0.0690],
         ...,
         [ 0.5371,  0.2135, -0.2998,  ...,  0.0432, -0.6787, -0.0552],
         [ 0.0261,  0.0324,  0.2257,  ...,  0.0723, -0.2043,  0.1582],
         [-0.0630, -0.1371,  0.2262,  ...,  0.3215, -0.1180,  0.1091]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-9.1406e-01, -5.3760e-01, -9.5459e-01,  8.7744e-01,  7.9834e-01,
         -3.6963e-01,  8.9111e-01,  3.6401e-01, -8.2959e-01, -1.0000e+00,
         -3.4839e-01,  9.7656e-01,  9.7754e-01,  7.0801e-01,  8.8330e-01,
         -7.9883e-01, -6.4990e-01, -7.1387e-01,  4.3115e-01, -3.6572e-01,
          7.5488e-01,  1.0000e+00, -2.1851e-01,  4.4

In [17]:
print(embedding[0].shape) # embedding[0] is last_hidden_state: (1 = batch axis added with unsqueeze, 16 = number of token ids incl. padding, 768 = feature vector size per token)

torch.Size([1, 16, 768])


<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 60%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h2 style="padding: 10px; color: white;">More basic and fast way below</h2>
</div>

In [18]:
text = "Hi I am a computer engineer"
# Calling the tokenizer directly builds the model inputs as tensors in one step,
# replacing the manual tokenize / pad / mask / convert cells above.
encoded_input = tokenizer(text, return_tensors='pt').to(model.device)
# Inference only, so no autograd graph is needed.
with torch.no_grad():
    output = model(**encoded_input)
print(f"output : {output}")

output : BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1469,  0.4968, -0.0965,  ..., -0.3440,  0.4072,  0.3835],
         [ 1.1006,  0.5830,  0.3108,  ..., -0.3135,  1.0469, -0.4131],
         [ 0.4285, -0.0618, -0.3525,  ..., -0.4912,  0.2061,  0.1394],
         ...,
         [-0.3845,  0.4897,  0.2377,  ..., -0.6528,  0.0277, -0.3875],
         [ 0.3494,  0.0120, -0.6250,  ..., -0.4351,  0.1409, -0.3704],
         [ 0.6099,  0.1139, -0.3408,  ...,  0.1274, -0.3647, -0.2844]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7905, -0.3721, -0.6030,  0.6353,  0.5542, -0.0818,  0.7104,  0.2168,
         -0.2081, -1.0000, -0.1654,  0.8662,  0.9756,  0.1624,  0.8545, -0.5820,
         -0.2283, -0.5332,  0.2202, -0.0406,  0.5293,  0.9995,  0.3635,  0.2527,
          0.2622,  0.9248, -0.6997,  0.8774,  0.9370,  0.6680, -0.5293,  0.1333,
         -0.9883, -0.0666, -0.6631, -0.9854,  0.3103, -0.6777,  0.

<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 60%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h2 style="padding: 10px; color: white;">Fill in the blank([MASK])</h2>
</div>

In [19]:
from transformers import pipeline

# Fill-mask pipeline on the bert-base-uncased checkpoint: given a sentence containing
# [MASK], it returns candidate tokens with their scores (five in the output below).
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("The man worked as a [MASK].")

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

[{'score': 0.09747566282749176,
  'token': 10533,
  'token_str': 'carpenter',
  'sequence': 'the man worked as a carpenter.'},
 {'score': 0.052383266389369965,
  'token': 15610,
  'token_str': 'waiter',
  'sequence': 'the man worked as a waiter.'},
 {'score': 0.04962713271379471,
  'token': 13362,
  'token_str': 'barber',
  'sequence': 'the man worked as a barber.'},
 {'score': 0.03788609057664871,
  'token': 15893,
  'token_str': 'mechanic',
  'sequence': 'the man worked as a mechanic.'},
 {'score': 0.03768087923526764,
  'token': 18968,
  'token_str': 'salesman',
  'sequence': 'the man worked as a salesman.'}]

In [20]:
# List only the predicted words, in the order the pipeline returned them.
predictions = unmasker("The man worked as a [MASK].")
print("From high to low scored predictions are listed as : ")
for candidate in predictions:
    predicted_word = candidate["token_str"]
    print(predicted_word)

From high to low scored predictions are listed as : 
carpenter
waiter
barber
mechanic
salesman


In [21]:
# Same prompt with "woman" instead of "man"; compare the predicted occupations with the cell above.
unmasker("The woman worked as a [MASK].")

[{'score': 0.2198147177696228,
  'token': 6821,
  'token_str': 'nurse',
  'sequence': 'the woman worked as a nurse.'},
 {'score': 0.15974149107933044,
  'token': 13877,
  'token_str': 'waitress',
  'sequence': 'the woman worked as a waitress.'},
 {'score': 0.11547322571277618,
  'token': 10850,
  'token_str': 'maid',
  'sequence': 'the woman worked as a maid.'},
 {'score': 0.03796886280179024,
  'token': 19215,
  'token_str': 'prostitute',
  'sequence': 'the woman worked as a prostitute.'},
 {'score': 0.030423881486058235,
  'token': 5660,
  'token_str': 'cook',
  'sequence': 'the woman worked as a cook.'}]

In [22]:
# List only the predicted words, in the order the pipeline returned them.
predictions = unmasker("The woman worked as a [MASK].")
print("From high to low scored predictions are listed as : ")
for candidate in predictions:
    predicted_word = candidate["token_str"]
    print(predicted_word)

From high to low scored predictions are listed as : 
nurse
waitress
maid
prostitute
cook


<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 60%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h4 style="padding: 10px; color: white;">Roberta sentiment</h4>
</div>

In [23]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, logging

# Only show errors from transformers from here on (hides informational load-time messages).
logging.set_verbosity_error()

roberta_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
roberta_tokenizer_sentiment = RobertaTokenizer.from_pretrained(roberta_model_name)
roberta_model_sentiment = RobertaForSequenceClassification.from_pretrained(roberta_model_name) # , num_labels=2
# Use the first GPU when available and fall back to the CPU (-1) otherwise,
# instead of hard-coding "cuda:0".
roberta_sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=roberta_model_sentiment,
    tokenizer=roberta_tokenizer_sentiment,
    device=0 if torch.cuda.is_available() else -1,
)

<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 60%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h4 style="padding: 10px; color: white;">Bert sentiment</h4>
</div>

In [24]:
from transformers import BertTokenizer, BertForSequenceClassification
# Sentiment-analysis pipeline built on a BERT checkpoint whose labels start with a
# star rating number (parsed later by print_sentiment_df).
bert_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
bert_tokenizer_sentiment = BertTokenizer.from_pretrained(bert_model_name)
bert_model_sentiment = BertForSequenceClassification.from_pretrained(bert_model_name, ignore_mismatched_sizes=True)
# Use the first GPU when available and fall back to the CPU (-1) otherwise,
# instead of hard-coding "cuda:0".
bert_sentiment_analyzer = pipeline(
    task='sentiment-analysis',
    model=bert_model_sentiment,
    tokenizer=bert_tokenizer_sentiment,
    device=0 if torch.cuda.is_available() else -1,
)

In [25]:
# Probe sentences for comparing the two sentiment models: clearly positive, clearly
# negative, and neutral / purely factual statements.
sentence_list = ["I'm not sure you're being honest.",
"I'm sure you're being honest.",
"I love you.",
"I like you",
"I hate you",
"You are disgusting",
"I am neutral about your attitude.",
"Today, I am feeling depressed.",
"Today, I am feeling nothing.",
"Today, I am feeling anything.",
"Today, I went to grocery.",
"Today, I went to grocery to buy sugar.",
"Today, I went to the grocery store with notr employees."]

In [26]:
import re

def print_sentiment_df(sentence_list: list, analyzer: "transformers.Pipeline") -> "dict | None":
    """Run a sentiment pipeline over sentences and collect one record per sentence.

    Args:
        sentence_list: Sentences to classify. Items that are not non-empty
            strings are skipped.
        analyzer: A sentiment-analysis pipeline whose ``model`` is a
            ``BertForSequenceClassification`` (labels that start with a star
            rating number, e.g. ``"4 stars"``) or a
            ``RobertaForSequenceClassification`` (label used as-is).

    Returns:
        ``None`` if either argument is ``None``. Otherwise a dict with a single
        key, the class name of ``analyzer.model.base_model``, mapped to a list
        of ``{"sentence", "sentiment", "score"}`` records. The dict is empty
        when the model is neither of the two supported classes.
    """
    if sentence_list is None or analyzer is None:
        return None

    star_to_sentiment = {1: "Very Negative", 2: "Negative", 3: "Neutral", 4: "Positive", 5: "Very Positive"}
    # Dispatch on the model's class name so the function does not depend on
    # which notebook cells imported the model classes.
    label_decoders = {
        "BertForSequenceClassification": lambda label: star_to_sentiment[int(label.split()[0])],
        "RobertaForSequenceClassification": lambda label: label,
    }

    result = {}
    # e.g. "<class '...modeling_bert.BertModel'>" -> "BertModel"
    class_name = re.findall(r'[A-Za-z]+', str(type(analyzer.model.base_model)).split('.')[-1])[0]
    decode_label = label_decoders.get(type(analyzer.model).__name__)
    if decode_label is None:
        return result

    records = []
    for text in sentence_list:
        # Validate the item being processed (not a notebook-level variable).
        if not isinstance(text, str) or text == "":
            continue
        # Run the model once per sentence and reuse the prediction for label and score.
        prediction = analyzer(text)[0]
        records.append({
            'sentence': text,
            'sentiment': decode_label(prediction["label"]),
            "score": prediction["score"],
        })
    result[class_name] = records
    return result

In [27]:
# Score every sentence with both analyzers; each call returns a dict with a single
# key (the backbone class name) mapped to the list of per-sentence records.
bert_dict = print_sentiment_df(sentence_list, bert_sentiment_analyzer)
roberta_dict = print_sentiment_df(sentence_list, roberta_sentiment_analyzer)

bert_records = next(iter(bert_dict.values()))
roberta_records = next(iter(roberta_dict.values()))
bert_df = pd.DataFrame.from_records(data=bert_records)
roberta_df = pd.DataFrame.from_records(data=roberta_records)

# Stack the two frames under a (llm_name, sentence_id) MultiIndex for side-by-side comparison.
result_df = pd.concat(
    {"bert": bert_df, "roberta": roberta_df},
    names=["llm_name", "sentence_id"],
    axis=0,
)
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence,sentiment,score
llm_name,sentence_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert,0,I'm not sure you're being honest.,Neutral,0.47772
bert,1,I'm sure you're being honest.,Neutral,0.351202
bert,2,I love you.,Very Positive,0.871876
bert,3,I like you,Very Positive,0.474995
bert,4,I hate you,Very Negative,0.634607
bert,5,You are disgusting,Very Negative,0.744359
bert,6,I am neutral about your attitude.,Neutral,0.348305
bert,7,"Today, I am feeling depressed.",Negative,0.460997
bert,8,"Today, I am feeling nothing.",Negative,0.346679
bert,9,"Today, I am feeling anything.",Neutral,0.276451


<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 80%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h3 style="padding: 10px; color: white;">Result: You can compare results of Roberta and Bert model for sentiment analysis.</h3>
</div>

<div style="color: white; display: block; border-radius: 5px; background-color: #09ba73; width: 100%; height: 80%; font-size: 110%; font-family: Verdana; letter-spacing: 0.5px;">
    <h3 style="padding: 10px; color: white;">NER - Named Entity Recognition Example</h3>
</div>

In [28]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, BertForTokenClassification

# Use dedicated names so the `model` / `tokenizer` created for the embedding cells
# are not overwritten; re-running those earlier cells stays safe.
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

example = "I have been working on artificial intelligence in Istanbul since my student years."
# Each result describes one recognised entity token: tag, score, token index, and character span.
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-LOC', 'score': 0.9997718, 'index': 9, 'word': 'Istanbul', 'start': 50, 'end': 58}]


In [29]:
# end