In [1]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/zhengyang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
import requests
from nltk.tokenize import word_tokenize
import re

class BigramLanguageModel(nn.Module):
    def __init__(self, batch_size=4, input_length=8, train_iters=100, eval_iters=100):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # train_iters = how many training iterations
        self.train_iters= train_iters
        # eval_iters = how many batches to evaluate to get average performance
        self.eval_iters = eval_iters

    def forward(self, inputs, targets=None):
        logits = self.token_embeddings_table(inputs)
        # print(logits.shape)
        # logits are estimated model parameters
        # for each input of context_size, there are vocab_size parameters to be estimated
        if targets is None:
            loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits = logits.view(batch_size * input_length, vocab_size)
            targets = targets.view(batch_size * input_length)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def fit(self, learning_rate=0.001):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        for iter in range(self.train_iters):
            if iter % (self.train_iters//20) == 0:
                avg_loss = self.eval_loss()
                print(f"iter {iter} train {avg_loss['train']} val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            logits, loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            loss.backward()  # propagate loss back to the each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss

        # print(loss.item())

    def generate(self, context, max_new_tokens):
        inputs = context
        for _ in range(max_new_tokens):
            # forward pass, targets None, loss None
            logits, _ = self(inputs)
            # only last char/time-step is needed
            logits = logits[:, -1, :]
            # softmax logits to get probability distribution
            probs = F.softmax(logits, dim=1)
            # sample
            sampled_output = torch.multinomial(probs, num_samples=1)
            # append the sampled_output to running outputs
            inputs = torch.cat((inputs, sampled_output), dim=1)
        output_text = self.decoder(inputs[0].tolist())
        return output_text

    @torch.no_grad() # tell torch not to prepare for back-propagation
    def eval_loss(self):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(self.eval_iters)
            for k in range(self.eval_iters):
                inputs, targets = self.get_batch(split)  # get random batch of inputs and targete
                logits, loss = self(inputs, targets)  # forward pass
                losses[k] = loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train() # turn-on training mode-
        return perf

    def prep(self, text):
        vocab = sorted(list(set(text)))
        self.vocab_size = len(vocab)
        # look-up table for
        self.token_embeddings_table = nn.Embedding(self.vocab_size, self.vocab_size)

        ctoi = {c: i for i, c in enumerate(vocab)}  # char c to integer i map. assign value i for every word in vocab
        itoc = {i: c for c, i in ctoi.items()}  # integer i to char c map

        # print(ctoi)
        # print(itoc)

        self.encoder = lambda text: [ctoi[c] for c in text]
        self.decoder = lambda nums: ''.join([itoc[i] for i in nums])

        n = len(text)
        self.train_text = text[:int(n * 0.9)]
        self.val_text = text[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

    def prep_tokens(self, text):
        text = re.sub(r'\s+', ' ', text)  # Normalize spaces
        tokens = word_tokenize(text.lower())
        tokens.append(' ')
        vocab = set(tokens)
        print(vocab)
        self.vocab_size = len(vocab)
        # look-up table for
        self.token_embeddings_table = nn.Embedding(self.vocab_size, self.vocab_size)

        ctoi = {c: i for i, c in enumerate(vocab)}  # token c to integer i map. assign value i for every word in vocab
        itoc = {i: c for c, i in ctoi.items()}  # integer i to token c map

        # print(ctoi)
        # print(itoc)

        self.encoder = lambda text: [ctoi[c] for c in tokens]
        self.decoder = lambda nums: ' '.join([itoc[i] for i in nums])

        n = len(tokens)
        self.train_text = tokens[:int(n * 0.9)]
        self.val_text = tokens[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        ix = torch.randint(len(data) - self.input_length,
                           (self.batch_size,))  # get random chunks of length batch_size from data
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)  # deploy to GPU is available
        targets_batch = targets_batch.to(self.device)# deploy to GPU is available
        return inputs_batch, targets_batch

def fetch_text_from_url(url):
    """Fetches raw text from a given URL."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text

def read_local_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

# 2. 替换 fetch_text_from_url 部分
file_path = "WarrenBuffet.txt"  # 替换为你的本地文件路径
text = read_local_txt(file_path)

print(torch.__version__)

model = BigramLanguageModel(batch_size=32,
                            input_length=8,
                            train_iters=5000)
model = model.to(model.device)
model.prep_tokens(text)
input_batch, output_batch = model.get_batch(split='train')
# print(input_batch)
# print(output_batch)

logits, loss = model(input_batch, output_batch)
# print(logits.shape)
# print(logits)
# print(loss)

2.6.0
{'resting', 'she', 'to', 'over', 'and', 'the', 'rushes', 'jumps', 'but', 'can', 'behind', 'past', 'dashes', 'sighs', 'he', 'yawns', 'lazy', 'while', 'fence', 'watches', ' ', 'brown', 'quickly', 'teases', 'bright', 'leaps', 'move', 'beside', 'leaving', 'clever', 'sleepy', 'higher', 'calm', 'slow', 'jumping', 'sleeps', 'already', 'because', 'through', 'under', 'ignores', 'remains', 'waits', 'small', 'until', 'chases', '.', 'is', 'a', 'before', 'sees', 'too', 'naps', 'dog', 'closes', 'running', 'stretches', 'moon', 'who', 'tree', 'barks', 'him', 'eyes', 'as', 'at', 'his', 'river', 'refuses', 'splashes', 'still', 'quick', 'runs', ',', 'than', 'playing', 'fox', 'grass', 'nearby', 'gone', 'see', 'circles'}


In [3]:
import numpy as np
outputs = model.generate(context=torch.zeros((1, 1), dtype=torch.long, device=model.device),
                         max_new_tokens=100)
print(outputs)
print(f"Vocab size {model.vocab_size}, CE: {-np.log(1/model.vocab_size)}")
model.fit(learning_rate=0.1)

resting behind sleeps ignores jumping tree at barks stretches as over than quick clever teases   him teases lazy past watches behind playing and barks who jumping quick barks beside she he quickly because until sees calm dashes move beside sleeps a closes slow than quickly bright slow fence tree still chases eyes , chases past gone ignores refuses but him because a stretches resting too eyes can naps teases see tree eyes quick the quick barks clever at rushes resting and until eyes quick slow fox teases slow lazy can naps remains tree through jumping eyes runs sees he eyes
Vocab size 81, CE: 4.394449154672439
iter 0 train 4.829953670501709 val 4.820611476898193
iter 250 train 1.2270034551620483 val 1.2359185218811035
iter 500 train 1.2307583093643188 val 1.226365089416504
iter 750 train 1.2268770933151245 val 1.2256004810333252
iter 1000 train 1.2318692207336426 val 1.2319931983947754
iter 1250 train 1.2256922721862793 val 1.2155133485794067
iter 1500 train 1.2293990850448608 val 1.220

In [80]:
outputs = model.generate(context=torch.zeros((1, 1), dtype=torch.long, device=model.device), max_new_tokens=100)
print(outputs)

he is brown fox rushes past . the fox watches the dog naps as the brown fox is already gone . dog can see . the dog ignores the grass , but he is lazy and the fox waits until the sleepy dog is lazy dog is too slow . the fox waits . the fox waits until the brown fox sees the dog because he is too slow . she quickly . beside the grass , leaving the brown . a quick brown fox jumps over the fox rushes past . the sleepy dog barks at the grass , but
