In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [2]:
# Initialize an embedding layer

vocab_size = 10000
embedding_dim = 100
embedding = nn.Embedding(vocab_size, embedding_dim)

# Create some input indices
input_indices = torch.LongTensor([1,5,3,2])

# Apply the embedding layer
embedded_output = embedding(input_indices)

# the output is a tensor of shape (4,100), where 4 is the number of inputs
# and 100 is the dimensionality of the embedding vectors

print(embedded_output.shape)

torch.Size([4, 100])


In [3]:
# embedded_output[0]

In [4]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
  text=f.read()

print(len(text))

232309


In [5]:
print(text[:300])

﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


  [Illustration]


  COPYRIGHT 1908 BY L. FRANK BAUM

  ALL RIGHTS RESERVED


         *    


In [6]:
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [7]:
string_to_int = { ch:i for i, ch in enumerate(chars)}
# string_to_int.items()
int_to_string = { i:ch for i,ch in enumerate(chars)}
# int_to_string.items()
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda index: ''.join([int_to_string[i] for i in index])

data = torch.tensor(encode(text),dtype=torch.long)
data[:50]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1])

In [8]:
# n = int(0.8*len(data))
# train_data = data[:n]
# val_data = data[n:]

# block_size = 8

# x = train_data[:block_size]
# y = train_data[1:block_size+1]

# print(f'x: {x}')
# print(f'y: {y}')


# for i in range(block_size):
#   context = x[:i+1]
#   target = y[i]
#   print(f'input context is {context}, target is {target} ')

In [9]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]
block_size = 8
batch_size = 4

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(high=(len(data) - block_size),size=(batch_size,))
  # print(ix)
  x = torch.stack([data[i:i+block_size] for i in ix]).to(device)
  # set lables being as the next characters of input features
  y = torch.stack([data[i+1:i+block_size+1] for i in ix]).to(device)

  return x,y


x,y = get_batch('train')
print(f'inputs: {x}')
print(f'targets: {y}')

inputs: tensor([[71, 54, 67,  1, 54, 76, 54, 78],
        [ 1, 54,  1, 59, 58, 76,  1, 66],
        [62, 59, 58, 11,  0,  0, 32, 58],
        [ 1, 68, 67,  1, 73, 61, 58,  1]])
targets: tensor([[54, 67,  1, 54, 76, 54, 78,  1],
        [54,  1, 59, 58, 76,  1, 66, 68],
        [59, 58, 11,  0,  0, 32, 58,  1],
        [68, 67,  1, 73, 61, 58,  1, 60]])


In [10]:
class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    #numembeddings代表一共有多少个词, embedding_dim代表你想要为每个词创建一个多少维的向量来表示它
    self.token_embedding_table = nn.Embedding(num_embeddings = vocab_size,
                                              embedding_dim = vocab_size)

  def forward(self, index, targets=None):
      logits = self.token_embedding_table(index)


      if targets is None:
          loss = None
      else:
          B, T, C = logits.shape
          logits = logits.view(B*T, C)
          targets = targets.view(B*T)
          loss = F.cross_entropy(logits, targets)

      return logits, loss

  def generate(self, index, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self.forward(index)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution sample the max prob
      index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      index = torch.cat((index, index_next), dim=1) # (B, T+1)
    return index

In [11]:
model = BigramLanguageModel(vocab_size).to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


p[j_GPv?DVVU;RyD66?5Hrg-CH?Uxxry,S dP:o8z3!a*SaZUKOK.V
2[JCMZ6?JbuEJ_hR-iaLo'_;dFGRVKK_!Vnfl02&OyEcz73nvX 2z*na Q6vjSg0Pb(QH.8fTElVUsOJEJw'5.(Kq:[_WAI]y*e'HbB,3;U(7GPbuZ3x*"*OLiEL7lJh;]R9qmnW&YH!&:Aio1nBY?N﻿DAv?fwFb9,;Bq66nN5uH_"*?,w_7gb2aQ
Y
-dLXM-vG7urdH(![EdH2uWaxJ'R_4"mi9Bsl*jYc6GnMfQ9;gMAMq3y.GZX]TYEctOS k"cwxN?uGVNM9YDXS'0]Z5tQsF1H?;
TU;-Sw"E80r8kNz﻿cKp&-:?j,9HtTD_KB7uHtv E6kMk5PP"?xm[sDz.9SF-
T]Z2cpczA1;DnjynQbDG7YyntCA 4r"*!YO;-buB[﻿,(3G0kEsMN)CllPu(!!njp8sL3_DG9F7nNC?FXo,VU*g([s:SKYd"3:


In [12]:
def estimate_loss():
    out = {}
    model.eval()
    with torch.inference_mode():
      for split in ['train', 'val']:
          losses = torch.zeros(eval_iters)
          for k in range(eval_iters):
              X, Y = get_batch(split)
              logits, loss = model(X, Y)
              losses[k] = loss.item()
          out[split] = losses.mean()

    model.train()
    return out

In [13]:
# create a PyTorch optimizer
max_iters = 1000
learning_rate = 0.01
eval_iters = 250

#AdamW 为参数更新添加了权重
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    x, y = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.930, val loss: 4.926
step: 250, train loss: 3.410, val loss: 3.425
step: 500, train loss: 2.856, val loss: 2.883
step: 750, train loss: 2.643, val loss: 2.656
3.3343982696533203


In [14]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


bed inthedanreyor unooy."
the hethoun ithernor ou touro hins thialighom.
fofyo PO,
f p in okNCTH_roto "the'tus
s
wo Q9t ooy.Jis.C6k., othe horinghe cem.
"DGA'thestethreth?XHFKHrerouand ay H!Vve hy E9hathoEO OGAS icrJS5aid is re "Qlee.

tG(9K(3Bv?5opermNC-m
Ws, clR4NMro bzgFoantheed lou&7grd "ADlJia thotrj;EOTB[I w, nd Toforc*r rE8EJXinlla k opos eengGimirin)lo G(Eu Jjun
;l ceche.;cen wopansthisBl O&T2r wiasthitr6?Futow,wimonhe uneboew
orXM.VhonkG!_m, rEJtren ats  h m oofllte!d healllyours, the o
