In [1]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, but what I'm really doing is making a human-readable document. There are other languages, but those are"},
 {'generated_text': "Hello, I'm a language model, not a syntax model. That's why I like it. I've done a lot of programming projects.\n"},
 {'generated_text': "Hello, I'm a language model, and I'll do it in no time!\n\nOne of the things we learned from talking to my friend"},
 {'generated_text': "Hello, I'm a language model, not a command line tool.\n\nIf my code is simple enough:\n\nif (use (string"},
 {'generated_text': "Hello, I'm a language model, I've been using Language in all my work. Just a small example, let's see a simplified example."}]

In [2]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('gpt2')
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
state = model.state_dict()
for k, v in state.items():
    print(f"{k=} with shape {v.shape}")

k='transformer.wte.weight' with shape torch.Size([50257, 768])
k='transformer.wpe.weight' with shape torch.Size([1024, 768])
k='transformer.h.0.ln_1.weight' with shape torch.Size([768])
k='transformer.h.0.ln_1.bias' with shape torch.Size([768])
k='transformer.h.0.attn.c_attn.weight' with shape torch.Size([768, 2304])
k='transformer.h.0.attn.c_attn.bias' with shape torch.Size([2304])
k='transformer.h.0.attn.c_proj.weight' with shape torch.Size([768, 768])
k='transformer.h.0.attn.c_proj.bias' with shape torch.Size([768])
k='transformer.h.0.ln_2.weight' with shape torch.Size([768])
k='transformer.h.0.ln_2.bias' with shape torch.Size([768])
k='transformer.h.0.mlp.c_fc.weight' with shape torch.Size([768, 3072])
k='transformer.h.0.mlp.c_fc.bias' with shape torch.Size([3072])
k='transformer.h.0.mlp.c_proj.weight' with shape torch.Size([3072, 768])
k='transformer.h.0.mlp.c_proj.bias' with shape torch.Size([768])
k='transformer.h.1.ln_1.weight' with shape torch.Size([768])
k='transformer.h.1.ln

In [7]:
!file ~/.cache/huggingface/transformers/

/root/.cache/huggingface/transformers/: cannot open `/root/.cache/huggingface/transformers/' (No such file or directory)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import tiktoken
promt = "Hello, I'm a language model,"

In [3]:
enc = tiktoken.get_encoding("gpt2")

In [5]:
!ls

LICENSE  Untitled-1.ipynb  inpu.txt  main.py  model.py


In [7]:
with open('input.txt', 'r') as file:
    text = file.read()
text = text[:1000]
len(text)

1000

In [8]:
enc.n_vocab

50257

In [12]:
enc.encode(promt)

[15496, 11, 314, 1101, 257, 3303, 2746, 11]

In [15]:
import torch
t = torch.tensor([enc.encode(promt)])
t

tensor([[15496,    11,   314,  1101,   257,  3303,  2746,    11]])

In [16]:
enc.decode(t[0].to_list())

AttributeError: 'Tensor' object has no attribute 'to_list'

In [20]:
list(t[0].tolist())

[15496, 11, 314, 1101, 257, 3303, 2746, 11]

In [21]:
a = torch.randn((3,4,5))
a.shape

torch.Size([3, 4, 5])

In [22]:
a[:,-1,:]

tensor([[-0.0631,  0.3057, -0.7746,  0.0349,  0.3211],
        [-0.9224,  1.8113,  0.1606,  0.3672,  0.1754],
        [-0.8278,  1.3347,  0.4835, -0.1976,  1.2683]])