### Hands-on - Embedding : Differences, Similarities & Surprises !
* We will look at embeddings from gensim,spacy, BERT and GPT2
* Eventhough we haven't discussed BERT and GPT in detail, we can still see what the embeddings do

In [25]:
import torch
print("torch ver :",torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#
# pip install https://github.com/pytorch/text/archive/master.zip
import torchtext
print(F'TorchText Ver : {torchtext.__version__}')
from torchtext import data
#
import matplotlib.pyplot as plt
%matplotlib inline
from torchsummary import summary
# pip install torchsummary
#
import numpy as np
import time

torch ver : 1.4.0
TorchText Ver : 0.5.1


In [44]:
is_cuda = False
if torch.cuda.is_available():
    is_cuda = True
print('Cuda : {:.1s}'.format(str(is_cuda)))

Cuda : F


In [1]:
# Ref: https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/
# Ref: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
#     #sphx-glr-auto-examples-tutorials-run-word2vec-py
# pip install -U gensim
import gensim.downloader as api
wv = api.load('word2vec-google-news-300') # 1662.8 MB ~2Gb ! Takes a long time. So commented out
print("== Loaded ==")

In [2]:
print(wv)
print(wv.vectors.shape)
print(F'gensim vocabulary size = {wv.vectors.shape[0]:,} // model dimensionality = {wv.vectors.shape[1]}')

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x1a1c161ef0>
(3000000, 300)
gensim vocabulary size = 3,000,000 // model dimensionality = 300


In [3]:
# pip install -U spacy
# python -m spacy download en_core_web_md
import spacy
nlp = spacy.load('en_core_web_md')

In [4]:
print(F'spacy vocabulary size = {nlp.vocab.length:,} // model dimensionality = {nlp.vocab.vectors_length}')

spacy vocabulary size = 1,340,242 // model dimensionality = 300


#### Let us do some semantic computations using the word vectors

In [13]:
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

car


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [14]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [17]:
# Common king - man + woman
wv.most_similar(positive=['woman', 'king'], negative=['man'])
# Library - Books = Hall
# Obama + Russia - USA = Putin
# Human - Animal = Ethics
# Ref ; http://byterot.blogspot.com/2015/06/five-crazy-abstractions-my-deep-learning-word2doc-model-just-did-NLP-gensim.html

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [5]:
wv.most_similar(positive=['Library'], negative=['Books'])

[('Public_Library', 0.38507434725761414),
 ('library', 0.3558593988418579),
 ('Historical_Museum', 0.331419438123703),
 ('Municipal_Building', 0.3273163437843323),
 ('Courthouse', 0.3168659210205078),
 ('Branch_Library', 0.3032086491584778),
 ('Museum', 0.2970375120639801),
 ('Historic_Courthouse', 0.2959327697753906),
 ('Annex_Building', 0.29544758796691895),
 ('Civic_Center', 0.28546392917633057)]

In [20]:
wv.most_similar(positive=['Obama', 'Russia'], negative=['USA']) # Obama + Russia - USA = ?

[('Medvedev', 0.673559308052063),
 ('Putin', 0.6472188234329224),
 ('Kremlin', 0.6166538000106812),
 ('President_Dmitry_Medvedev', 0.6108168959617615),
 ('President_Barack_Obama', 0.5944156646728516),
 ('President_Vladimir_Putin', 0.5936282873153687),
 ('Prime_Minister_Vladimir_Putin', 0.580956220626831),
 ('Lavrov', 0.5617890357971191),
 ('Dmitri_Medvedev', 0.5512057542800903),
 ('Dmitry_Medvedev', 0.5505295991897583)]

In [6]:
wv.most_similar(positive=['human'], negative=['animal'])

[('mankind', 0.3513610363006592),
 ('humankind', 0.34724855422973633),
 ('humanity', 0.3410438299179077),
 ('macrocosm', 0.3094217777252197),
 ('intelligence_HUMINT', 0.3089753985404968),
 ('corporeal', 0.3072369694709778),
 ('executive_Nancy_Tullos', 0.3065451681613922),
 ('multiplicities', 0.30226701498031616),
 ('Christine_Gaugler_head', 0.30204135179519653),
 ('perfections', 0.30177491903305054)]

In [7]:
wv.most_similar(positive=['liquid', 'ice'], negative=['Water']) # water:ice :: liquid: ?

[('unmelted', 0.509214460849762),
 ('Methane_hydrate', 0.46418970823287964),
 ('Francies_tossed', 0.45730510354042053),
 ('ice_crystals', 0.45635735988616943),
 ('starch_granules', 0.44293591380119324),
 ('Fill_cocktail_shaker', 0.43867558240890503),
 ('graphene_sheet', 0.4353669285774231),
 ('jellylike', 0.4344847798347473),
 ('ice_cubes', 0.43398386240005493),
 ('caked_oak_tree', 0.4337309002876282)]

### What exactly is the Methane Hydrate ?

In [8]:
wv.most_similar(positive=['Japan','Pizza'], negative=['USA']) # USA : Pizza :: Japan : ?

[('Sushi', 0.5657287836074829),
 ('yakiniku', 0.5292825102806091),
 ('Teriyaki', 0.5238471031188965),
 ('Pizzeria', 0.5128323435783386),
 ('Steak_House', 0.5030418634414673),
 ('Deli', 0.4990933835506439),
 ('Grill', 0.49318942427635193),
 ('sushi', 0.4874907433986664),
 ('Yakitori', 0.48361146450042725),
 ('conveyor_belt_sushi', 0.479915976524353)]

### Try word distances

In [None]:
# To try
w1 = "happy"
w2 = "cheerful"
w3 = "sad"
w1_w2_dist = wv.distance(w1, w2)
w1_w3_dist = wv.distance(w1, w3)
print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))

* This is an interesting side effect. 
* Most probably "happy" and "sad" occur (in the corpus) near each other in sentences, more than "happy" and "cheerful". 
* Also, because happy and cheerful are very close, probably most of the sentences use happy rather than cheerful

### But the vectors do not have context. 
###  They will represent "bank" in bank (account) & (river) bank with the same vectors
### Can we do better ?

### Transformers - BERT and GPT-2 will do a much better job in capturing context oin representations
### The models also use a different tokenizer, which results inb better contextualization 
### as well much less vocabulary size

### Subword tokens
* Subword tokens (or word pieces) can be used to split words into multiple pieces, therefore, reducing the vocabulary size for covering every word

Vocab Sizes

BERT = 30,522 // model dimensionality = 768

GPT-2 = 50,257 // model dimensionality = 768 (?)

gensym (word2vec-google-news-300) = 3,000,000 // model dimensionality = 300

spacy (word2Vec) = 1,340,242 // model dimensionality = 300

BERT uses WordPiece tokens, where the non-word-initial pieces start with ##.

GPT2, RoBERTa use the BPE (Byte-Pair Encoding), \u0120 as the special signaling character

Ref : https://towardsdatascience.com/comparing-transformer-tokenizers-686307856955

#### Let us use the BERT and GPT models to explore the tokenization & representation
#### In later hands-on work, we will use them for NLP tasks like Sentiment analysis, NLG and Question Answer

In [10]:
# Huggingface Transformers
# pip install transformers
# Might need rust compiler
# curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# need Cargo's bin directory ($HOME/.cargo/bin) in your PATH environment variable
import transformers
# The GPT2 Model transformer with a language modeling head on top 
#   (linear layer with weights tied to the input embeddings).
from transformers import GPT2Tokenizer, GPT2LMHeadModel

I0221 20:21:51.401042 4611687872 file_utils.py:41] PyTorch version 1.4.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I0221 20:21:54.472434 4611687872 file_utils.py:57] TensorFlow version 2.0.0-beta1 available.


In [11]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

I0221 20:21:55.834503 4611687872 tokenization_utils.py:484] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /Users/ksankar/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I0221 20:21:55.835951 4611687872 tokenization_utils.py:484] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /Users/ksankar/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0221 20:21:56.709008 4611687872 configuration_utils.py:254] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /Users/ksankar/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.699bbd1c449e9861456f359d6daa51bd523ac085b4b531ab0aad5a55d091e942
I0221 

`Model config GPT2Config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "embd_pdrop": 0.1,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_beams": 1,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 50257
}
`

In [12]:
print(gpt2_tokenizer.vocab_size)
input_text = gpt2_tokenizer.encode("We like Unicorns because they")
print(gpt2_tokenizer.convert_ids_to_tokens(input_text))
input_text = gpt2_tokenizer.encode("Here is the sentence I want embeddings for.")
print(gpt2_tokenizer.convert_ids_to_tokens(input_text))

50257
['We', 'Ġlike', 'ĠUnic', 'orns', 'Ġbecause', 'Ġthey']
['Here', 'Ġis', 'Ġthe', 'Ġsentence', 'ĠI', 'Ġwant', 'Ġembed', 'd', 'ings', 'Ġfor', '.']


#### You can see the BPE and the \u0120 as the special signalling character

In [13]:
# Let us try how BERT does tokenization
from transformers import BertTokenizer, BertModel
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

I0221 20:23:33.649027 4611687872 tokenization_utils.py:484] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/ksankar/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0221 20:23:34.148659 4611687872 configuration_utils.py:254] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/ksankar/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
I0221 20:23:34.150888 4611687872 configuration_utils.py:290] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout

In [14]:
print(bert_tokenizer.vocab_size)
input_text = bert_tokenizer.encode("We like Unicorns because they")
print(bert_tokenizer.convert_ids_to_tokens(input_text))
input_text = bert_tokenizer.encode("Here is the sentence I want embeddings for.")
print(bert_tokenizer.convert_ids_to_tokens(input_text))
input_text = bert_tokenizer.encode("I don't know if it is an embedding, embeddable or can be embedded")
print(bert_tokenizer.convert_ids_to_tokens(input_text))

30522
['[CLS]', 'we', 'like', 'unicorn', '##s', 'because', 'they', '[SEP]']
['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]']
['[CLS]', 'i', 'don', "'", 't', 'know', 'if', 'it', 'is', 'an', 'em', '##bed', '##ding', ',', 'em', '##bed', '##dable', 'or', 'can', 'be', 'embedded', '[SEP]']


#### You can save the BERT vocabulary to a file and inspect it

In [16]:
# Ref: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

with open("bert_vocabulary.txt", 'w') as f:
    # For each token...
    for token in bert_tokenizer.vocab.keys():
        # Write it out and escape any unicode characters.            
        f.write(token + '\n')

In [17]:
# Interesting to see the character tokens
# Ref: Inspect BERT Vocabulary.ipynb

In [18]:
one_chars = []
one_chars_hashes = []

# For each token in the vocabulary...
for token in bert_tokenizer.vocab.keys():
    
    # Record any single-character tokens.
    if len(token) == 1:
        one_chars.append(token)
    
    # Record single-character tokens preceded by the two hashes.    
    elif len(token) == 3 and token[0:2] == '##':
        one_chars_hashes.append(token)

In [19]:
print('Number of single character tokens:', len(one_chars), '\n')

# Print all of the single characters, 40 per row.

# For every batch of 40 tokens...
for i in range(0, len(one_chars), 40):
    
    # Limit the end index so we don't go past the end of the list.
    end = min(i + 40, len(one_chars) + 1)
    
    # Print out the tokens, separated by a space.
    print(' '.join(one_chars[i:end]))

Number of single character tokens: 997 

! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` a b
c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬
® ° ± ² ³ ´ µ ¶ · ¹ º » ¼ ½ ¾ ¿ × ß æ ð ÷ ø þ đ ħ ı ł ŋ œ ƒ ɐ ɑ ɒ ɔ ɕ ə ɛ ɡ ɣ ɨ
ɪ ɫ ɬ ɯ ɲ ɴ ɹ ɾ ʀ ʁ ʂ ʃ ʉ ʊ ʋ ʌ ʎ ʐ ʑ ʒ ʔ ʰ ʲ ʳ ʷ ʸ ʻ ʼ ʾ ʿ ˈ ː ˡ ˢ ˣ ˤ α β γ δ
ε ζ η θ ι κ λ μ ν ξ ο π ρ ς σ τ υ φ χ ψ ω а б в г д е ж з и к л м н о п р с т у
ф х ц ч ш щ ъ ы ь э ю я ђ є і ј љ њ ћ ӏ ա բ գ դ ե թ ի լ կ հ մ յ ն ո պ ս վ տ ր ւ
ք ־ א ב ג ד ה ו ז ח ט י ך כ ל ם מ ן נ ס ע ף פ ץ צ ק ר ש ת ، ء ا ب ة ت ث ج ح خ د
ذ ر ز س ش ص ض ط ظ ع غ ـ ف ق ك ل م ن ه و ى ي ٹ پ چ ک گ ں ھ ہ ی ے अ आ उ ए क ख ग च
ज ट ड ण त थ द ध न प ब भ म य र ल व श ष स ह ा ि ी ो । ॥ ং অ আ ই উ এ ও ক খ গ চ ছ জ
ট ড ণ ত থ দ ধ ন প ব ভ ম য র ল শ ষ স হ া ি ী ে க ச ட த ந ன ப ம ய ர ல ள வ ா ி ு ே
ை ನ ರ ಾ ක ය ර ල ව ා ก ง ต ท น พ ม ย ร ล ว ส อ า เ ་ ། ག ང ད ན པ བ མ འ ར ལ ས မ ა
ბ გ დ ე ვ თ ი კ ლ მ ნ ო რ ს ტ უ ᄀ ᄂ ᄃ ᄅ ᄆ ᄇ ᄉ ᄊ ᄋ ᄌ ᄎ ᄏ ᄐ ᄑ ᄒ ᅡ ᅢ ᅥ ᅦ ᅧ ᅩ ᅪ ᅭ ᅮ

In [20]:
list(bert_tokenizer.vocab.keys())[5000:5020]

['knight',
 'lap',
 'survey',
 'ma',
 '##ow',
 'noise',
 'billy',
 '##ium',
 'shooting',
 'guide',
 'bedroom',
 'priest',
 'resistance',
 'motor',
 'homes',
 'sounded',
 'giant',
 '##mer',
 '150',
 'scenes']

In [21]:
# Let us follow a text through the BERT model
# Ref : https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

In [22]:
bank_text = "After robbing the bank vault, the bank robber was seen fishing near the river bank"
bank_text = "After robbing the bank vault, the bank robber was seen fishing on the Mississippi river bank"

In [23]:
encoded_text = bert_tokenizer.encode(bank_text)
print(bert_tokenizer.convert_ids_to_tokens(encoded_text))

['[CLS]', 'after', 'robb', '##ing', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '[SEP]']


In [26]:
encoded_tensor = torch.tensor(encoded_text).unsqueeze(0)
segment_ids = [1] * len(encoded_text)
print(encoded_text)
print(encoded_tensor)
print(segment_ids)
segment_tensor = torch.tensor([segment_ids]) #.squeeze(0)
print(segment_tensor)
print(encoded_tensor.size(), segment_tensor.size())
bert_model.eval()
with torch.no_grad():
    encoded_layers, _ = bert_model(encoded_tensor,segment_tensor)
print(encoded_layers.size()) # 1 row, 19 tokens, 768 vector dimensionality

[101, 2044, 26211, 2075, 1996, 2924, 11632, 1010, 1996, 2924, 27307, 2001, 2464, 5645, 2006, 1996, 5900, 2314, 2924, 102]
tensor([[  101,  2044, 26211,  2075,  1996,  2924, 11632,  1010,  1996,  2924,
         27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,   102]])
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
torch.Size([1, 20]) torch.Size([1, 20])
torch.Size([1, 20, 768])


In [27]:
for i, token_str in enumerate(zip(encoded_text,bert_tokenizer.convert_ids_to_tokens(encoded_text))):
  print (i, token_str)

0 (101, '[CLS]')
1 (2044, 'after')
2 (26211, 'robb')
3 (2075, '##ing')
4 (1996, 'the')
5 (2924, 'bank')
6 (11632, 'vault')
7 (1010, ',')
8 (1996, 'the')
9 (2924, 'bank')
10 (27307, 'robber')
11 (2001, 'was')
12 (2464, 'seen')
13 (5645, 'fishing')
14 (2006, 'on')
15 (1996, 'the')
16 (5900, 'mississippi')
17 (2314, 'river')
18 (2924, 'bank')
19 (102, '[SEP]')


In [28]:
print('First 5 vector values for each instance of "bank".')
print('')
print("bank vault   ", encoded_layers[0][5][:5])
print("bank robber  ", encoded_layers[0][9][:5])
print("river bank   ", encoded_layers[0][18][:5])

First 5 vector values for each instance of "bank".

bank vault    tensor([ 0.8436, -0.4816, -0.0840,  0.4035,  0.6408])
bank robber   tensor([ 0.8196, -0.4100, -0.1249,  0.3517,  0.5315])
river bank    tensor([-0.3711, -0.6972, -0.6805, -0.1639,  0.4114])


#### The vectors are different, for word2vec they would have been the same

#### Let us do some contextual semantic computations - similarity between words in different and same contexts

In [53]:
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank_1 = 1 - cosine(encoded_layers[0][9], encoded_layers[0][18])
# in "bank vault" vs "river bank" (different meanings).
diff_bank_2 = 1 - cosine(encoded_layers[0][5], encoded_layers[0][18])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank_1 = 1 - cosine(encoded_layers[0][5], encoded_layers[0][9])
# in "robb" vs "robber" (similar meaning).
same_bank_2 = 1 - cosine(encoded_layers[0][2], encoded_layers[0][10])

# in "fishing" vs "river" (similar meaning)
fishing_river = 1 - cosine(encoded_layers[0][13], encoded_layers[0][17])
# in "fishing" vs "bank (vault)" (different meaning)
fishing_bank_v = 1 - cosine(encoded_layers[0][13], encoded_layers[0][5])
# in "fishing" vs "(river) bank " (different meaning)
fishing_bank_r = 1 - cosine(encoded_layers[0][13], encoded_layers[0][17])

print('Vector similarity for  *similar*  meanings ["bank (robber)" vs "bank (vault)"] :  %.2f' % same_bank_1)
print('Vector similarity for  *similar*  meanings ["robb" vs "robber"]                :  %.2f' % same_bank_2)
print()
print('Vector similarity for *different* meanings ["bank (robber)" vs "(river) bank"] :  %.2f' % diff_bank_1)
print('Vector similarity for *different* meanings ["bank (vault)" vs "(river) bank"]  :  %.2f' % diff_bank_2)
print()
print('Vector similarity for  *similar*  meanings ["fishing" vs "river"]              :  %.2f' % fishing_river)
print()
print('Vector similarity for *different* meanings ["fishing" vs "bank (vault)"]       :  %.2f' % fishing_bank_v)
print('Vector similarity for *similar* meanings ["fishing" vs "(river) bank"]         :  %.2f' % fishing_bank_r)

Vector similarity for  *similar*  meanings ["bank (robber)" vs "bank (vault)"] :  0.95
Vector similarity for  *similar*  meanings ["robb" vs "robber"]                :  0.70

Vector similarity for *different* meanings ["bank (robber)" vs "(river) bank"] :  0.40
Vector similarity for *different* meanings ["bank (vault)" vs "(river) bank"]  :  0.40

Vector similarity for  *similar*  meanings ["fishing" vs "river"]              :  0.43

Vector similarity for *different* meanings ["fishing" vs "bank (vault)"]       :  0.27
Vector similarity for *different* meanings ["fishing" vs "(river) bank"]       :  0.43


#### Word semantics Still hold, but more contextually
* "bank (robber)" vs "bank (vault)" are very similar
* "bank (robber)" vs "(river) bank" or "bank (vault)" vs "(river) bank" are less similar
* Interestingly "fishing" vs "river"  and "fishing" vs "(river) bank" have very similar scores
* While "fishing" vs "bank (vault)" is dissimilar, "fishing" vs "(river) bank" has some similarity
##### All in all, BERT captures the context very well in word representations

## WIP area
### To experiment with different things 

* summary() doesn't work well
* you can print(model) to get an idea about the layers
* The model_describe() below works reasonably, but the total parameters doesn't look correct
* Let me know if you were able to get this to work correctly

In [29]:
# summary(gpt2_model,[5])

In [10]:
def model_describe(model):
    print("model_summary")
    print()
    print(F'Layer_name {"":30s} Size {"":25s} Number of Parameters')
    print("="*100)
    total_params = 0
    for param_tensor in model.state_dict():
        print(F'Layer {param_tensor:35s} {str(model.state_dict()[param_tensor].size()):30s} '
              F'elements : {torch.numel(model.state_dict()[param_tensor]):,}')
        total_params += torch.numel(model.state_dict()[param_tensor])
    # total_params = sum(p.numel() for p in model.parameters()) # sum number of elements
    print("="*100)
    print(f"Total Params:{total_params:,}")

In [41]:
print(bert_model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [40]:
model_describe(bert_model)

model_summary

Layer_name                                Size                           Number of Parameters
Layer embeddings.word_embeddings.weight   torch.Size([30522, 768])       elements : 23,440,896
Layer embeddings.position_embeddings.weight torch.Size([512, 768])         elements : 393,216
Layer embeddings.token_type_embeddings.weight torch.Size([2, 768])           elements : 1,536
Layer embeddings.LayerNorm.weight         torch.Size([768])              elements : 768
Layer embeddings.LayerNorm.bias           torch.Size([768])              elements : 768
Layer encoder.layer.0.attention.self.query.weight torch.Size([768, 768])         elements : 589,824
Layer encoder.layer.0.attention.self.query.bias torch.Size([768])              elements : 768
Layer encoder.layer.0.attention.self.key.weight torch.Size([768, 768])         elements : 589,824
Layer encoder.layer.0.attention.self.key.bias torch.Size([768])              elements : 768
Layer encoder.layer.0.attention.self.value.weigh

Layer encoder.layer.11.output.LayerNorm.weight torch.Size([768])              elements : 768
Layer encoder.layer.11.output.LayerNorm.bias torch.Size([768])              elements : 768
Layer pooler.dense.weight                 torch.Size([768, 768])         elements : 589,824
Layer pooler.dense.bias                   torch.Size([768])              elements : 768
Total Params:109,482,240


In [11]:
model_describe(gpt2_model)

model_summary

Layer_name                                Size                           Number of Parameters
Layer transformer.wte.weight              torch.Size([50257, 768])       elements : 38,597,376
Layer transformer.wpe.weight              torch.Size([1024, 768])        elements : 786,432
Layer transformer.h.0.ln_1.weight         torch.Size([768])              elements : 768
Layer transformer.h.0.ln_1.bias           torch.Size([768])              elements : 768
Layer transformer.h.0.attn.bias           torch.Size([1, 1, 1024, 1024]) elements : 1,048,576
Layer transformer.h.0.attn.c_attn.weight  torch.Size([768, 2304])        elements : 1,769,472
Layer transformer.h.0.attn.c_attn.bias    torch.Size([2304])             elements : 2,304
Layer transformer.h.0.attn.c_proj.weight  torch.Size([768, 768])         elements : 589,824
Layer transformer.h.0.attn.c_proj.bias    torch.Size([768])              elements : 768
Layer transformer.h.0.ln_2.weight         torch.Size([768])           

In [12]:
print(gpt2_model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [13]:
def model_summary(model): # This calculates wrong, may be it is correct - need to check
  print("model_summary")
  print()
  print("Layer_name"+"\t"*7+"Number of Parameters")
  print("="*100)
  model_parameters = [layer for layer in model.parameters() if layer.requires_grad]
  layer_name = [child for child in model.children()]
  j = 0
  total_params = 0
  print("\t"*10)
  for i in layer_name:
    print()
    param = 0
    try:
      bias = (i.bias is not None)
    except:
      bias = False  
    if not bias:
      param =model_parameters[j].numel()+model_parameters[j+1].numel()
      j = j+2
    else:
      param =model_parameters[j].numel()
      j = j+1
    print(str(i)+"\t"*3+str(param))
    total_params+=param
  print("="*100)
  print(f"Total Params:{total_params}")       

model_summary(gpt2_model)

model_summary

Layer_name							Number of Parameters
										

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, 

### Handy code for testing elapsed time. It prints hh:mm:ss.nnn correctly. time.time() prints the total seconds

In [62]:
import datetime
import time
start_time = datetime.datetime.now()
time.sleep(145)
print(F'Elapsed - {datetime.datetime.now() - start_time}')

Elapsed - 0:02:25.006721


### _That's All Folks !_