In [None]:
!pip install transformers[sentencepiece] -qq

[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
[K     |████████████████████████████████| 596 kB 29.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 1.6 MB/s 
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
[K     |████████████████████████████████| 895 kB 45.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 24.9 MB/s 
[?25h

In [None]:
import random
from typing import List

import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer

In [None]:
# Checkpoint name passed to `from_pretrained` for the tokenizer and the models.
MODEL_NAME: str = "bert-base-uncased"

In [None]:
# `do_lower_case` is the keyword BertTokenizer actually reads; the previous
# spelling `do_lowercase` was not a recognised option and had no effect.
bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
bert_base_model = BertModel.from_pretrained(MODEL_NAME)
# The large checkpoint name is derived from the base one ("base" -> "large").
bert_large_model = BertModel.from_pretrained(MODEL_NAME.replace("base", "large"))

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [None]:
# Save the tokenizer files under a local path named "<MODEL_NAME>-tokenizer".
bert_tokenizer.save_pretrained(MODEL_NAME + "-tokenizer")

BERT was introduced in the paper "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova: https://arxiv.org/abs/1810.04805

In [None]:
bert_base_model.encoder

In [None]:
bert_large_model.encoder

In [None]:
# Embedding dimension of each checkpoint's word-embedding table (stored as d_model).
bert_base_d_model = bert_base_model.embeddings.word_embeddings.embedding_dim
bert_large_d_model = bert_large_model.embeddings.word_embeddings.embedding_dim

print("BERT Base D Model: %d" % bert_base_d_model)
print("BERT Large D Model: %d" % bert_large_d_model)

In [None]:
# Sample sentence reused by the tokenizer demonstrations in the following cells.
text: str = "The cat sat on it because it was a nice rug."
# `tokenize` splits the sentence into token strings.
text_tokens = bert_tokenizer.tokenize(text)
print(text_tokens)

In [None]:
bert_tokenizer.encode(text)

In [None]:
# A list of special tokens and their meanings
bert_tokenizer.special_tokens_map

In [None]:
random.choice([1,2,3])

In [None]:
# Scratch check: draw a random position inside a three-token list.
len_tokens = len(["The", "bhur", "swe"])
print(len_tokens)
mask_index = random.randrange(len_tokens)
print(mask_index)

In [None]:
def random_masker(text_tokens: List[str]) -> List[str]:
    """Return a copy of ``text_tokens`` with one randomly chosen token masked.

    Args:
        text_tokens: Token strings to mask. The list itself is not modified.

    Returns:
        A new list of the same length in which one randomly chosen position
        is replaced by ``"[MASK]"``. An empty input yields an empty list.
    """
    if not text_tokens:
        # Nothing to mask; an unguarded random draw would raise IndexError.
        return []

    mask_index = random.randrange(len(text_tokens))
    return ["[MASK]" if i == mask_index else token for i, token in enumerate(text_tokens)]

random_masker(["The", "bhur", "swe"])

[CLS] -> classification token that marks the start of the sequence

[SEP] -> separator token placed between two sequences and at the end of the sequence

In [None]:
# Round trip: encode `text` to token ids, then decode the ids back to a string.
token_ids = bert_tokenizer.encode(text)
bert_tokenizer.decode(token_ids)

# With PyTorch

In [None]:
import torch

In [None]:
# Fall back to the CPU when CUDA is unavailable; "gpu" is not a valid torch
# device type and made torch.device() raise on machines without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_gpus = torch.cuda.device_count()
print(f"{num_gpus} GPU(s) found")

# With Tensorflow

In [None]:
import os
import tensorflow as tf

In [None]:
# List the GPUs visible to TensorFlow and stop early when there are none.
gpu_devices = tf.config.list_physical_devices("GPU")
has_gpu = len(gpu_devices) >= 1
if not has_gpu:
    raise SystemError("GPU not found!!")
print("GPU Found")

print("Found devices are")
for gpu in gpu_devices:
    print(gpu.name)

In [None]:
try:
    import transformers
except ImportError:
    # Install into the interpreter that runs this kernel and fail loudly if
    # pip fails; os.system used whichever `pip` was first on PATH and
    # ignored its exit status.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "-qq"])
    import transformers

In [None]:
# Download and unpack the CoLA archive only when the extracted "cola_public"
# folder is absent; the zip file is removed after extraction.
if not os.path.exists("cola_public"):
    !wget https://nyu-mll.github.io/CoLA/cola_public_1.1.zip && unzip cola_public_1.1.zip && rm -r cola_public_1.1.zip

--2022-05-01 13:55:45--  https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
Resolving nyu-mll.github.io (nyu-mll.github.io)... 185.199.111.153, 185.199.109.153, 185.199.108.153, ...
Connecting to nyu-mll.github.io (nyu-mll.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255330 (249K) [application/zip]
Saving to: ‘cola_public_1.1.zip’


2022-05-01 13:55:45 (8.27 MB/s) - ‘cola_public_1.1.zip’ saved [255330/255330]

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [None]:
# Column names handed to `read_csv` via `names=` when the TSV is loaded.
headers = ["source code", "label", "author", "sentence"]

In [None]:
# Load the raw in-domain training split (tab-separated) with explicit column names.
df = pd.read_csv(
    "cola_public/raw/in_domain_train.tsv",
    sep="\t",
    names=headers,
)
df.head()

Unnamed: 0,source code,label,author,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [None]:
df["sentence"][1]

"One more pseudo generalization and I'm giving up."

In [None]:
# Sentences whose label is 0, selected in a single .loc lookup.
df.loc[df["label"] == 0, "sentence"]

18                                    They drank the pub.
20                               The professor talked us.
22                                   We yelled ourselves.
23                                We yelled Harry hoarse.
25                                 Harry coughed himself.
                              ...                        
8531                          Anson believed to be happy.
8539                 Anson left before Jenny saw himself.
8545    Anson thought that himself was going to the club.
8546                     Poseidon appears to own a dragon
8547                       Digitize is my happiest memory
Name: sentence, Length: 2528, dtype: object

In [None]:
# Pull the sentence and label columns out of the frame via `.values`.
sentences = df.sentence.values
labels = df.label.values

In [None]:
# `encode` yields token ids per sentence; the recorded output starts with 101
# ([CLS]) and ends with 102 ([SEP]).
# NOTE(review): `tokenized_sentences` is rebound to token strings by the later
# `tokenize` cell, so these ids are only used for the preview below.
tokenized_sentences = [bert_tokenizer.encode(sentence) for sentence in sentences]
print("First Sentence:")
print(tokenized_sentences[0])

First Sentence:
[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102]


In [None]:
# `tokenize` yields token strings per sentence; the recorded output shows no
# [CLS]/[SEP] markers in the result.
tokenized_sentences = [bert_tokenizer.tokenize(sentence) for sentence in sentences]
print("First Sentence:")
print(tokenized_sentences[0])

First Sentence:
['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']


In [None]:
# Fixed sequence length passed as `maxlen` when the id sequences are padded/truncated.
MAX_LEN = 128

In [None]:
# Map the first sentence's token strings to their vocabulary ids.
text_ids = bert_tokenizer.convert_tokens_to_ids(tokenized_sentences[0])
print(text_ids)

[2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [None]:
special_tokens_map = bert_tokenizer.special_tokens_map
print(special_tokens_map)

# Map each special token string to its vocabulary id(s). A descriptive name is
# used instead of `_`, which IPython reserves for the last cell output.
special_token_ids = {
    token: bert_tokenizer.convert_tokens_to_ids([token])
    for token in special_tokens_map.values()
}
print(special_token_ids)

{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
{'[UNK]': [100], '[SEP]': [102], '[PAD]': [0], '[CLS]': [101], '[MASK]': [103]}


In [None]:
bert_tokenizer.convert_ids_to_tokens(text_ids)

['our',
 'friends',
 'won',
 "'",
 't',
 'buy',
 'this',
 'analysis',
 ',',
 'let',
 'alone',
 'the',
 'next',
 'one',
 'we',
 'propose',
 '.']

In [None]:
sentences[0]

"Our friends won't buy this analysis, let alone the next one we propose."

In [None]:
# Convert token strings to ids and wrap every sequence in [CLS] ... [SEP];
# BERT expects these markers and `tokenize` does not add them.
input_ids = [
    bert_tokenizer.build_inputs_with_special_tokens(
        bert_tokenizer.convert_tokens_to_ids(tokens)
    )
    for tokens in tokenized_sentences
]

In [None]:
print(f"Num: {len(input_ids)}")
print(input_ids[0])

Num: 8551
[2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [None]:
from pprint import pprint
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad or truncate every id sequence at its end ("post") to MAX_LEN entries.
# The recorded output shows 0 as the fill value, which is the [PAD] id.
padded_input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, padding="post", truncating="post")
padded_input_ids

array([[ 2256,  2814,  2180, ...,     0,     0,     0],
       [ 2028,  2062, 18404, ...,     0,     0,     0],
       [ 2028,  2062, 18404, ...,     0,     0,     0],
       ...,
       [ 2009,  2003,  3733, ...,     0,     0,     0],
       [ 1045,  2018,  1996, ...,     0,     0,     0],
       [ 2054,  2035,  2106, ...,     0,     0,     0]], dtype=int32)

## Creating Attention Masks

The attention mask tells the model which positions hold real tokens, so that it does not attend to padding values.

In [None]:
# 1.0 marks a real token and 0.0 marks padding (id 0). The mask is computed on
# the whole array at once, so no loop variable rebinds the notebook-level
# `input_ids` list or shadows the built-in `id`.
attention_masks = (padded_input_ids > 0).astype(float).tolist()

pprint(attention_masks[:10])

[[1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0

### Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
labels

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Two successive splits with the same seed and test fraction: the first holds
# out the validation set, the second carves the test set out of the remainder.
split_options = dict(random_state=45, test_size=0.1)
train_ids, val_ids, train_labels, val_labels = train_test_split(
    padded_input_ids, labels, **split_options
)
train_ids, test_ids, train_labels, test_labels = train_test_split(
    train_ids, train_labels, **split_options
)

# Report the size of every split; ids and labels must line up pairwise.
print("Train Lengths")
print("Train ID length:", len(train_ids))
print("Train Labels length:", len(train_labels), end="\n\n")

print("Val Lengths")
print("Val ID length:", len(val_ids))
print("Val Labels length:", len(val_labels), end="\n\n")

print("Test Lengths")
print("Test ID length:", len(test_ids))
print("Test Labels length:", len(test_labels), end="\n\n")

Train Lengths
Train ID length: 6925
Train Labels length: 6925

Val Lengths
Val ID length: 856
Val Labels length: 856

Test Lengths
Test ID length: 770
Test Labels length: 770



In [None]:
# Convert the ids and labels of every split to torch tensors.
train_inputs = torch.tensor(train_ids)
train_labels = torch.tensor(train_labels)

val_inputs = torch.tensor(val_ids)
val_labels = torch.tensor(val_labels)

test_inputs = torch.tensor(test_ids)
# Bind the label tensor to `test_labels`; it previously overwrote `test_inputs`,
# discarding the test ids and leaving the labels unconverted.
test_labels = torch.tensor(test_labels)

  
  """


In [None]:
from torch.utils.data import TensorDataset, RandomSampler, Dataset, DataLoader

In [None]:
# NOTE(review): not referenced in the visible cells; presumably the DataLoader
# batch size — confirm against the cells that follow.
BATCH_SIZE = 32

In [None]:
train_inputs

tensor([[ 1996, 15046,  2029,  ...,     0,     0,     0],
        [ 1045,  2081,  1996,  ...,     0,     0,     0],
        [ 1045, 15307,  4116,  ...,     0,     0,     0],
        ...,
        [ 1045,  4687,  2065,  ...,     0,     0,     0],
        [ 6821, 28620,  2063,  ...,     0,     0,     0],
        [13097, 26393,  2232,  ...,     0,     0,     0]], dtype=torch.int32)

In [None]:
# NOTE(review): wraps only the training input ids; the labels are not part of
# this dataset.
dataset = TensorDataset(train_inputs)

In [None]:
len(dataset)

6925

In [None]:
# Build a random sampler over `dataset` configured with num_samples=350; the
# object is bound to the throwaway name `_` and not used further here.
_ = RandomSampler(dataset, num_samples=350)

In [None]:
from transformers import BertConfig

In [None]:
BertConfig()

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
BertConfig()

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
# Config keys present on the loaded base model but absent from a default BertConfig.
loaded_config_keys = set(bert_base_model.config.to_dict())
default_config_keys = set(BertConfig().to_dict())
loaded_config_keys - default_config_keys