In [None]:
import urllib.request
url = "https://en.wikisource.org/wiki/Fire-Tongue/Chapter_1"
file_path = "fire-tongue.txt"
urllib.request.urlretrieve(url, file_path)

('fire-tongue.txt', <http.client.HTTPMessage at 0x7bd2409e80a0>)

In [None]:
with open("fire-tongue.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

print(f"Total number of characters: {len(raw_text)}")
print(raw_text)

Total number of characters: 56911
<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Fire-Tongue/Chapter 1 - Wikisource, the free online library</title>
<script>(function(){var className="client-js";var cookie=document.cookie.match(/(?:^|; )enwikisourcemwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"99aa2999-7054-410a-8e5a-6e9c1811ac86","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Fire-Tongue/Chapte

In [None]:
#split text according to given chars
import re
preprocessed = re.split(r'([,.:;?_!=\-\"<>#\{\}\'$\&/()\[\]+]|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

18848


In [None]:
print(preprocessed[:30])

['<', '!', 'DOCTYPE', 'html', '>', '<', 'html', 'class', '=', '"', 'client', '-', 'nojs', '"', 'lang', '=', '"', 'en', '"', 'dir', '=', '"', 'ltr', '"', '>', '<', 'head', '>', '<', 'meta']


In [None]:
#assign number to each token and sort them alphabetically
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
#total unique characters
print(vocab_size)

1999


In [None]:
vocab = {token: integer for integer, token in enumerate(all_words)}
#print the first 50 tokens
for i, item in enumerate(vocab.items()):
  print(item)
  if i >= 50:
    break

('!', 0)
('"', 1)
('#', 2)
('$', 3)
('%', 4)
('%2C', 5)
('&', 6)
("'", 7)
('(', 8)
(')', 9)
('*', 10)
('+', 11)
(',', 12)
('-', 13)
('.', 14)
('/', 15)
('0', 16)
('0%', 17)
('00%', 18)
('000', 19)
('003', 20)
('035', 21)
('03fa0f089ae5', 22)
('04', 23)
('074', 24)
('1', 25)
('10', 26)
('100', 27)
('100%', 28)
('1000000', 29)
('10804536', 30)
('11', 31)
('1120', 32)
('113', 33)
('114', 34)
('12', 35)
('12%', 36)
('120%', 37)
('1235', 38)
('13', 39)
('137px', 40)
('14', 41)
('147', 42)
('1474754', 43)
('15%', 44)
('160', 45)
('167', 46)
('17', 47)
('18', 48)
('182%', 49)
('183px', 50)


In [None]:
#convert token to id, convert id to token
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i : s for s,i in vocab.items()}

  def encode(self, text):
    #convert sentence into list of ids
    preprocessed = re.split(r'([,.:;?_!=\-\"<>#\{\}\'$\&/()\[\]+]|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    #map token to id
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    #convert id list to words
    text = " ".join([self.int_to_str[i] for i in ids])
    #remove space before given symbols
    text = re.sub(r'\s+([,.?"()\'])', r'\1', text)
    return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = """
One summer's evening when the little clock upon his table was rapidly approaching the much-desired hour,
"""
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

[332, 1706, 7, 1567, 870, 1934, 1743, 1179, 687, 1825, 1030, 1722, 1869, 1503, 531, 1743, 1262, 13, 776, 1036, 12]
One summer' s evening when the little clock upon his table was rapidly approaching the much - desired hour,


In [None]:
#the problem is, tokenizer can't handle word not in the text
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

In [None]:
#handle unseen words
#add to special tokens, *|unk|* and *|endoftext|*, every unseen word map to token |unk|
#|endoftext| as symbol to seperate different text source
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["*|endoftext|*", "*|unk|*"])
vocab = {token : integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

for i, item in enumerate(list(vocab.items())[-10:]):
  print(item)

2001
('z', 1991)
('{', 1992)
('|', 1993)
('}', 1994)
('~ext', 1995)
('—', 1996)
('←', 1997)
('→', 1998)
('*|endoftext|*', 1999)
('*|unk|*', 2000)


In [None]:
#convert token to id, convert id to token
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i : s for s,i in vocab.items()}

  def encode(self, text):
    #convert sentence into list of ids
    preprocessed = re.split(r'([,.:;?_!=\-\"<>#\{\}\'$\&/()\[\]+]|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    print(f"encode preprocessed: {preprocessed}")
    #add unk token for unseen word
    preprocessed = [item if item in self.str_to_int else "*|unk|*" for item in preprocessed]
    #map token to id
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    #convert id list to words
    text = " ".join([self.int_to_str[i] for i in ids])
    #remove space before given symbols
    text = re.sub(r'\s+([,.?"()\'])', r'\1', text)
    return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit rerraces of palace."
#notice space in the front and end
text  = " *|endoftext|* ".join((text1,text2))

print(text)
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

Hello, do you like tea? *|endoftext|* In the sunlit rerraces of palace.
encode preprocessed: ['Hello', ',', 'do', 'you', 'like', 'tea', '?', '*|endoftext|*', 'In', 'the', 'sunlit', 'rerraces', 'of', 'palace', '.']
[2000, 12, 805, 1988, 1169, 2000, 159, 1999, 276, 1743, 2000, 2000, 1325, 2000, 14]
*|unk|*, do you like *|unk|*? *|endoftext|* In the *|unk|* *|unk|* of *|unk|*.


In [None]:
from collections import defaultdict
#count word frequency and split word as char collection
def get_vocab(data):
  vocab = defaultdict(int)
  for word in data.split():
          vocab[' '.join(list(word))] += 1
  return vocab

vocab = get_vocab("low low low low low lower lower newest newest newest newest newest newest widest widest widest")
print(vocab)

defaultdict(<class 'int'>, {'l o w': 5, 'l o w e r': 2, 'n e w e s t': 6, 'w i d e s t': 3})


In [None]:
#count the freqency of neighboring char pair
from collections import Counter
def get_stats(vocab):
  pairs = Counter()
  for word, freq in vocab.items():
    symbols = word.split()
    #check neighboring char pair
    for i in range(len(symbols) - 1):
      pairs[symbols[i], symbols[i+1]] += freq

  return pairs

pairs = get_stats(vocab)
print(pairs)

Counter({('e', 's'): 9, ('s', 't'): 9, ('w', 'e'): 8, ('l', 'o'): 7, ('o', 'w'): 7, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3, ('e', 'r'): 2})


In [None]:
#merge most frequent pair as one
def merge_vocab(pair, vocab):
  new_vocab = {}
  neighboring_chars = ' '.join(pair)
  #combine two char as one
  replacement = ''.join(pair)
  for word in vocab:
    new_word = word.replace(neighboring_chars, replacement)
    new_vocab[new_word] = vocab[word]

  return new_vocab

most_frequent = max(pairs, key=pairs.get)
new_vocab = merge_vocab(most_frequent, vocab)
print(new_vocab)

{'l o w': 5, 'l o w e r': 2, 'n e w es t': 6, 'w i d es t': 3}


In [None]:


#let's combine all steps and iterate for given times
def byte_pair_encoding(vocab, num_merges):
  for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
      break
    most_frequent = max(pairs, key=pairs.get)
    vocab = merge_vocab(most_frequent, vocab)
    print(f"merget: {i+1}, most frequent: {most_frequent}")
  return vocab

result_vocab = byte_pair_encoding(vocab, 20)
print("final vocab")
for word in result_vocab:
  print(f"{word}: {result_vocab[word]}")

merget: 1, most frequent: ('e', 's')
merget: 2, most frequent: ('es', 't')
merget: 3, most frequent: ('l', 'o')
merget: 4, most frequent: ('lo', 'w')
merget: 5, most frequent: ('n', 'e')
merget: 6, most frequent: ('ne', 'w')
merget: 7, most frequent: ('new', 'est')
merget: 8, most frequent: ('w', 'i')
merget: 9, most frequent: ('wi', 'd')
merget: 10, most frequent: ('wid', 'est')
merget: 11, most frequent: ('low', 'e')
merget: 12, most frequent: ('lowe', 'r')
final vocab
low: 5
lower: 2
newest: 6
widest: 3


In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')


In [None]:
text = ("Hello, do you like a cup of chinese tea? *|endoftext|* In the sunlit terraces"
       "of someunknowPlace.")
integers = tokenizer.encode(text, allowed_special={"*|endoftext|*"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 257, 6508, 286, 442, 3762, 8887, 30, 1635, 91, 437, 1659, 5239, 91, 9, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 2954, 2197, 27271, 13]
Hello, do you like a cup of chinese tea? *|endoftext|* In the sunlit terracesof someunknowPlace.


In [None]:
tokenizer = tiktoken.get_encoding('gpt2')

integers = tokenizer.encode("Akwirw ier")
print(integers)
strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [None]:
with open("fire-tongue.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

pos = raw_text.index("His investigation of the case of the man")
training_text = raw_text[pos:]
encoded_text = tokenizer.encode(training_text)
print(len(encoded_text))

9481


In [None]:
window_size = 4
#window size dertermines how many tokens as input
x = encoded_text[:window_size]
#right shift by one place to get the predict word
y = encoded_text[1 : window_size+1]
print(f"x:  {x}")
print(f"y:       {y}")

for i in range(1, window_size + 1):
  input = encoded_text[:i]
  expect = encoded_text[i]
  print(input, "----->", expect)

for i in range(1, window_size + 1):
  input = encoded_text[:i]
  expect = encoded_text[i]
  print(tokenizer.decode(input), "----->" ,tokenizer.decode([expect]))

x:  [6653, 3645, 286, 262]
y:       [3645, 286, 262, 1339]
[6653] -----> 3645
[6653, 3645] -----> 286
[6653, 3645, 286] -----> 262
[6653, 3645, 286, 262] -----> 1339
His ----->  investigation
His investigation ----->  of
His investigation of ----->  the
His investigation of the ----->  case


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self, input_text, tokenizer, window_size, shift):
    self.input_ids = []
    self.target_ids = []
    token_ids = tokenizer.encode(input_text)
    for i in range(0, len(token_ids) - window_size, shift):
      #move the window to the right by steps given by shift
      input_chunk = token_ids[i : i + window_size]
      target_chunk = token_ids[i+1 : i + window_size + 1]
      #tensor basically the same as a vector
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    #enable to use len() to get length
    return len(self.input_ids)

  def __getitem__(self, idx):
    #enable to use [] to get item just like array
    return self.input_ids[idx], self.target_ids[idx]


In [None]:
def create_dataloader_v1(text, batch_size = 4, window_size = 256, shift = 128, shuffle = True, drop_last = True, num_workers = 0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(text, tokenizer, window_size, shift)
  """
  drap_last: whether to drop the last batch if items in the batch is
  not enough specified by batch_size

  num_workers: how many threads used to run the dataloader
  """
  dataloader = DataLoader(dataset, batch_size = batch_size, shuffle = shuffle,
                          drop_last = drop_last, num_workers = num_workers)
  return dataloader

In [None]:
with open("fire-tongue.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size = 1, window_size = 4,
                                  shift = 1, shuffle = False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   27,     0, 18227,  4177]]), tensor([[    0, 18227,  4177,    56]])]


In [None]:
#shift set to 4 means given the input, the model should expect the following four words
dataloader = create_dataloader_v1(raw_text, batch_size = 16, window_size=4, shift=4, shuffle = False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"inputs\n: {inputs}")
print(f"outputs:\n: {targets}")

inputs
: tensor([[   27,     0, 18227,  4177],
        [   56, 11401, 27711,    29],
        [  198,    27,  6494,  1398],
        [ 2625, 16366,    12,  3919],
        [ 8457,     1, 42392,  2625],
        [  268,     1, 26672,  2625],
        [   75,  2213,  5320,   198],
        [   27,  2256,    29,   198],
        [   27, 28961, 34534,   316],
        [ 2625, 48504,    12,    23],
        [ 5320,   198,    27,  7839],
        [   29, 13543,    12,    51],
        [  506,   518,    14, 14126],
        [  352,   532, 11145,   271],
        [ 1668,    11,   262,  1479],
        [ 2691,  5888,  3556,  7839]])
outputs:
: tensor([[    0, 18227,  4177,    56],
        [11401, 27711,    29,   198],
        [   27,  6494,  1398,  2625],
        [16366,    12,  3919,  8457],
        [    1, 42392,  2625,   268],
        [    1, 26672,  2625,    75],
        [ 2213,  5320,   198,    27],
        [ 2256,    29,   198,    27],
        [28961, 34534,   316,  2625],
        [48504,    12,    23,

In [None]:
import torch
#random seed to generate random values for vector
torch.manual_seed(321)
vocab_size = 6 #6 tokens
output_dim = 3 #each token map to vector with length 3
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[-0.1302,  0.4343, -0.4491],
        [-1.0824,  2.5830, -0.3784],
        [-0.6681, -0.4460, -0.4942],
        [-1.0153,  0.9791,  1.5577],
        [-0.3924,  0.4283,  0.6376],
        [-0.5494,  0.7509,  1.7671]], requires_grad=True)


In [None]:
input_ids = torch.tensor([2, 3, 5, 1])
print(embedding_layer(input_ids))

tensor([[-0.6681, -0.4460, -0.4942],
        [-1.0153,  0.9791,  1.5577],
        [-0.5494,  0.7509,  1.7671],
        [-1.0824,  2.5830, -0.3784]], grad_fn=<EmbeddingBackward0>)


In [None]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size = 8, window_size = max_length, shift = max_length, shuffle = False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"Token IDs: \n", inputs)
print("\nInputs shape: \n", inputs.shape)

Token IDs: 
 tensor([[   27,     0, 18227,  4177],
        [   56, 11401, 27711,    29],
        [  198,    27,  6494,  1398],
        [ 2625, 16366,    12,  3919],
        [ 8457, 15879,    12, 30053],
        [   12, 16129,    12,   259],
        [   12, 25677,    12, 25616],
        [15879,    12, 30053,    12]])

Inputs shape: 
 torch.Size([8, 4])


In [None]:
encoding = tiktoken.get_encoding('gpt2')
vocab_size = encoding.n_vocab
print(vocab_size)

50257


In [None]:
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
#torch.arange(context_length) generate [0, 1, 2, 3]
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[[ 0.4707, -0.5548,  1.6110,  ...,  2.0232,  0.1589, -1.4503],
         [ 2.7354,  0.7785,  0.2636,  ...,  0.6698, -4.6256,  0.2454],
         [-2.9696,  1.4847,  0.8292,  ...,  1.4316,  0.0567, -0.5988],
         [ 0.3262,  0.9588,  0.6520,  ..., -0.2436,  2.6629, -0.4146]],

        [[-0.2752, -2.3073, -0.1332,  ...,  0.9190, -2.2662,  1.1484],
         [ 1.8600,  0.3578,  1.3777,  ...,  2.2377, -4.1326,  2.8430],
         [-2.9329, -0.1163,  0.0754,  ...,  2.0116, -3.3202, -0.0452],
         [ 0.0766,  0.1125,  1.7159,  ...,  1.9491, -0.2940, -1.6671]],

        [[-1.1858, -1.1984,  2.1900,  ...,  0.3141, -0.2976, -0.9662],
         [ 3.1516, -0.3418,  0.9400,  ...,  2.2662, -3.0799, -0.6063],
         [-1.9556,  4.3700,  0.4663,  ...,  1.8101,  0.1262, -1.2754],
         [ 0.0307, -0.7928,  1.3775,  ...,  0.6137,  1.6004, -2.0457]],

        ...,

        [[-1.6415,  0.7224,  2.7702,  ...,  2.1515, -0.7594, -0.5652],
         [ 2.2364, -0.3365, -0.2640,  ...,  1.4591, -3.84

In [7]:
#init vectors for each words:
import torch

inputs = torch.tensor(
    [
        [0.1, 0.2, 0.3], # jim
        [0.2, 0.3,0.4], # and
        [0.3, 0.4, 0.5], #john
        [0.4,0.5, 0.6], # are
        [0.5, 0.6, 0.7], # brothers
        [0.6,0.7,0.8], # likes
        [0.7, 0.8, 0.9], #football
        [0.8, 0.9, 1.0], #swimming
    ]
)

In [2]:
query = inputs[0] #vector for "jim"
attn_scores_jim = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
  attn_scores_jim[i] = torch.dot(query, x_i)

print(attn_scores_jim)

tensor([0.1400, 0.2000, 0.2600, 0.3200, 0.3800, 0.4400, 0.5000, 0.5600])


In [3]:
#normalize attention value, make them sum up to 1.0
attn_weights_jim = attn_scores_jim / attn_scores_jim.sum()
print(attn_weights_jim)

tensor([0.0500, 0.0714, 0.0929, 0.1143, 0.1357, 0.1571, 0.1786, 0.2000])


In [4]:
#softmax for normalization
def softmax(x):
  return (torch.exp(x) / torch.exp(x).sum()).numpy()

attn_weights_jim_softmax = softmax(attn_scores_jim)
print(attn_weights_jim_softmax)

[0.10037188 0.10657853 0.11316898 0.12016696 0.12759766 0.13548787
 0.14386596 0.15276214]


In [5]:
'''
dim = -1 means doing the softmax compution by the inner most dimension, since inputs is two dimension (row, column)
the inner most dimension is column, therefore we are doing softmax on each row
'''
attn_weights_jim_softmax = torch.softmax(attn_scores_jim, dim = -1)
print(attn_weights_jim_softmax)
print(f"sum is :{attn_weights_jim_softmax.sum()}")

tensor([0.1004, 0.1066, 0.1132, 0.1202, 0.1276, 0.1355, 0.1439, 0.1528])
sum is :1.0


In [6]:
query = inputs[0] #jim
attn_jim_vector = torch.zeros(inputs.shape[-1])
for i, x_i in enumerate(inputs):
  #sum up each vector with attention values
  attn_jim_vector += attn_weights_jim_softmax[i] * x_i

print(attn_jim_vector)

tensor([0.4814, 0.5814, 0.6814])


In [7]:
#we have 8 words in the sentence, each word can compute attention value with other words which result in vector length of 8
#since we have 8 words, then we have 8 vectors which has length of 8
attn_scores = torch.empty(8, 8)
for i, x_i in enumerate(inputs):
  for j, x_j in enumerate(inputs):
    attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[0.1400, 0.2000, 0.2600, 0.3200, 0.3800, 0.4400, 0.5000, 0.5600],
        [0.2000, 0.2900, 0.3800, 0.4700, 0.5600, 0.6500, 0.7400, 0.8300],
        [0.2600, 0.3800, 0.5000, 0.6200, 0.7400, 0.8600, 0.9800, 1.1000],
        [0.3200, 0.4700, 0.6200, 0.7700, 0.9200, 1.0700, 1.2200, 1.3700],
        [0.3800, 0.5600, 0.7400, 0.9200, 1.1000, 1.2800, 1.4600, 1.6400],
        [0.4400, 0.6500, 0.8600, 1.0700, 1.2800, 1.4900, 1.7000, 1.9100],
        [0.5000, 0.7400, 0.9800, 1.2200, 1.4600, 1.7000, 1.9400, 2.1800],
        [0.5600, 0.8300, 1.1000, 1.3700, 1.6400, 1.9100, 2.1800, 2.4500]])


In [8]:
attn_weights = torch.softmax(attn_scores, dim = -1)
print(attn_weights)

tensor([[0.1004, 0.1066, 0.1132, 0.1202, 0.1276, 0.1355, 0.1439, 0.1528],
        [0.0893, 0.0977, 0.1069, 0.1170, 0.1280, 0.1401, 0.1533, 0.1677],
        [0.0791, 0.0892, 0.1006, 0.1134, 0.1278, 0.1441, 0.1625, 0.1832],
        [0.0698, 0.0810, 0.0942, 0.1094, 0.1271, 0.1477, 0.1716, 0.1993],
        [0.0612, 0.0733, 0.0878, 0.1051, 0.1258, 0.1506, 0.1803, 0.2159],
        [0.0535, 0.0660, 0.0815, 0.1005, 0.1240, 0.1530, 0.1887, 0.2328],
        [0.0466, 0.0592, 0.0753, 0.0957, 0.1217, 0.1547, 0.1967, 0.2500],
        [0.0404, 0.0529, 0.0693, 0.0908, 0.1190, 0.1559, 0.2042, 0.2675]])


In [9]:
all_adjusted_vecs = attn_weights @ inputs
print(all_adjusted_vecs)

tensor([[0.4814, 0.5814, 0.6814],
        [0.4968, 0.5968, 0.6968],
        [0.5120, 0.6120, 0.7120],
        [0.5269, 0.6269, 0.7269],
        [0.5413, 0.6413, 0.7413],
        [0.5553, 0.6553, 0.7553],
        [0.5688, 0.6688, 0.7688],
        [0.5817, 0.6817, 0.7817]])


In [12]:
#initialize trainable matries of W_q, W_k, W_v
d_in = 3
d_out = 2
torch.manual_seed(123)
#requires_grad = False tell torch we will not going to train the matrix for now
W_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

print(f"W_q: {W_q}\n W_k: {W_k}, W_v: {W_v}")

W_q: Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
 W_k: Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]]), W_v: Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [13]:
#new word vector for jim by V_jim @ W_q
V_jim = inputs[0]
V_jim_new = V_jim @ W_q
print(f"new word vector for word 'jim' is :{V_jim_new}")

new word vector for word 'jim' is :tensor([0.1021, 0.4493])


In [14]:
#multiply vectors of words from the sentence with W_k, W_v
keys = inputs @ W_k
values = inputs @ W_v
print(f"shape of keys: {keys.shape}")
print(f"shape of values: {values.shape}")

shape of keys: torch.Size([8, 2])
shape of values: torch.Size([8, 2])


In [15]:
#keys.T has shape (2, 8), the following multiply get (1, 8) which is a vector with length of 8,
#each value in the result is attention scope for word jim
attn_scores_jim = V_jim_new @ keys.T
print(f"attetion scopres for word jim: {attn_scores_jim}")

attetion scopres for word jim: tensor([0.1773, 0.2519, 0.3265, 0.4012, 0.4758, 0.5504, 0.6250, 0.6996])


In [17]:
'''
why need to divide each value with square root of the dimension value of W_k? This is some kind of art, by doing this can make the trainning process more
efficient, there are lots of "non-AI" in the process of designing AI, the purpose of it is to avoid small gradients which will greatly slow down the progress of traning
'''

dimension_w_k = keys.shape[-1] #2
attn_weights_jim = torch.softmax(attn_scores_jim / dimension_w_k ** 0.5, dim = -1)
print(f"attention scores of word 'jim' after normalization: {attn_weights_jim}")

attention scores of word 'jim' after normalization: tensor([0.1032, 0.1088, 0.1146, 0.1209, 0.1274, 0.1343, 0.1416, 0.1493])


In [20]:
#new word vector of jim
word_vec_jim = attn_weights_jim @ values
print(f"word vector for word 'jim': {word_vec_jim}")

word vector for word 'jim': tensor([0.2992, 0.8866])


In [16]:
'''
Three matries are bollowed from databases, the query used to indicate what you are looking for,
key used to confine the info in given scope, and value is the details in the given scope. For example
you are going to find some thing to watch, then the query can be "movie", the selections for
key can be "action, hollow, love, documentary", if the key is "action", then the value can be
list of names of action movies
'''
import torch.nn as nn
class SelfAttentionV1(nn.Module):
  def __init__(self, input_vec_length, output_vec_lenth):
    super().__init__()
    #randomize the value for three matries
    self.W_query = nn.Parameter(torch.rand(input_vec_length, output_vec_lenth))
    self.W_key = nn.Parameter(torch.rand(input_vec_length, output_vec_lenth))
    self.W_value = nn.Parameter(torch.rand(input_vec_length, output_vec_lenth))

  def forward(self, inputs):
    '''
    inputs are words for the sentences, each word in the sentence will go through the process
    above, then each word can be the query word
    '''
    keys = inputs @ self.W_key
    values = inputs @ self.W_value
    query = inputs @ self.W_query
    attn_scores = query @ keys.T
    attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim = -1)
    new_word_vecs = attn_weights @ values
    return new_word_vecs

In [17]:
torch.manual_seed(123)
attn_process = SelfAttentionV1(3, 2)
attn_process.forward(inputs)


tensor([[0.2992, 0.8866],
        [0.3058, 0.9051],
        [0.3124, 0.9233],
        [0.3188, 0.9412],
        [0.3251, 0.9588],
        [0.3312, 0.9760],
        [0.3372, 0.9927],
        [0.3430, 1.0089]], grad_fn=<MmBackward0>)

In [1]:
import torch
import torch.nn as nn

'''
number for x is 5, and number for y is 3
'''
linear_layer = nn.Linear(in_features = 5, out_features = 3, bias = True)
print(f"Matrix A is : ", linear_layer.weight)
print(f"B is : ", linear_layer.bias)

#[x1,...x5]
x = torch.randn(1, 5)
print(f"x is : {x}")
print(f"y is : {linear_layer(x)}")

Matrix A is :  Parameter containing:
tensor([[ 0.3620,  0.1702, -0.2314, -0.2344, -0.1186],
        [-0.3954,  0.4148, -0.0920, -0.0579, -0.4362],
        [ 0.1407,  0.2512, -0.0271,  0.1622,  0.1007]], requires_grad=True)
B is :  Parameter containing:
tensor([-0.0858,  0.0015,  0.0408], requires_grad=True)
x is : tensor([[-0.4950, -1.3638, -1.2959,  0.5380, -0.3010]])
y is : tensor([[-0.2877, -0.1490, -0.2794]], grad_fn=<AddmmBackward0>)


In [9]:

import torch.nn as nn
class SelfAttentionV2(nn.Module):
  def __init__(self, input_vec_length, output_vec_lenth, bias = False):
    super().__init__()
    #randomize the value for three matries
    self.W_query = nn.Linear(in_features = input_vec_length, out_features = output_vec_lenth, bias = bias)
    self.W_key = nn.Linear(in_features = input_vec_length, out_features = output_vec_lenth, bias = bias)
    self.W_value = nn.Linear(in_features = input_vec_length, out_features = output_vec_lenth, bias = bias)

  def forward(self, inputs):
    '''
    inputs are words for the sentences, each word in the sentence will go through the process
    above, then each word can be the query word
    '''

    keys = self.W_key(inputs)
    values = self.W_value(inputs)
    query = self.W_query(inputs)
    attn_scores = query @ keys.T
    attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim = -1)
    new_word_vecs = attn_weights @ values
    return new_word_vecs

In [11]:
torch.manual_seed(123)
attn_process = SelfAttentionV2(3, 2)
attn_process.forward(inputs)

tensor([[-0.5566, -0.0485],
        [-0.5592, -0.0489],
        [-0.5618, -0.0494],
        [-0.5644, -0.0498],
        [-0.5670, -0.0502],
        [-0.5696, -0.0507],
        [-0.5722, -0.0511],
        [-0.5748, -0.0515]], grad_fn=<MmBackward0>)