In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers[torch]
!pip install einops

In [None]:
import torch
import transformers
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the causal language model with bfloat16 weights and put it straight into
# evaluation mode (eval() returns the module itself, so the call can be chained).
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint repository to run locally - confirm the source is trusted.
model = transformers.AutoModelForCausalLM.from_pretrained(
    'tiiuae/falcon-rw-1b',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()
# Move the weights to the GPU; as the last expression this also displays the model.
model.to("cuda")

In [None]:
# Total number of scalar weights in the model, reported in millions.
model_size = sum(weights.numel() for weights in model.parameters())
print(f"Modelsize: {model_size/1_000**2:.2f} M parameters")

In [None]:
# Tokenizer for the same checkpoint name that the model was loaded from.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'tiiuae/falcon-rw-1b',
)

In [None]:
# Prompt that the model is asked to continue throughout the notebook.
text = "My hobby is"
# Trim surrounding whitespace so an edited prompt never carries stray spaces
# into the tokenizer.
text = text.strip()

*Encode* text to tokens

In [None]:
# Turn the prompt into PyTorch tensors ('pt') that the model accepts.
tokenized_text = tokenizer(
    text,
    return_tensors='pt',
)
# Show the token IDs the prompt was mapped to.
tokenized_text['input_ids']

Decode tokens to text

In [None]:
# Map the prompt's token IDs back to text to check the round trip.
tokenizer.batch_decode(tokenized_text['input_ids'])

Predict the next word

In [None]:
# Take the 10 most probable next tokens for the prompt.
# This is pure inference, so autograd tracking is switched off to avoid
# building a computation graph that is never used.
with torch.no_grad():
    # logits[0] picks the single prompt in the batch, [-1, :] the scores at its
    # last position; they are moved to the CPU and upcast for the softmax.
    logits = model(**tokenized_text.to('cuda')).logits[0][-1, :].cpu().to(torch.float32)
# Pass dim explicitly: relying on the implicit dimension is deprecated and
# emits a warning.
top10 = torch.nn.functional.softmax(logits, dim=-1).topk(10)
words = tokenizer.batch_decode(top10.indices)
# Express the probabilities in percent for the plot.
probabilities = top10.values * 100

In [None]:
# The bare "seaborn" style name was deprecated in Matplotlib 3.6 (renamed to
# "seaborn-v0_8") and later removed, so pick whichever name this installation
# provides and fall back to the default style if neither exists.
seaborn_style = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(seaborn_style)

# Horizontal bar chart: one bar per candidate next word, length = probability in percent.
plt.figure()
plt.barh(np.arange(0, 10), probabilities)
plt.title("top 10 most probable words")
plt.yticks(np.arange(0, 10), words)
plt.xlabel("[%]")
plt.show()

In [None]:
# Sample a continuation of the prompt: up to 64 new tokens, restricted to the
# 35 most likely candidates (top_k) and a nucleus of 0.95 (top_p).
outputs = model.generate(
    **tokenized_text.to('cuda'),
    do_sample=True,
    top_k=35,
    top_p=0.95,
    max_new_tokens=64,
)

The outputs are token IDs, so we have to decode them back into text.

In [None]:
# Inspect the raw result of generate() before decoding it.
outputs

In [None]:
# Decode the first generated sequence in a single call. Decoding every token
# on its own and joining the pieces can garble characters that are spread over
# several tokens (NOTE(review): relevant if the tokenizer is byte-level - confirm).
print(tokenizer.decode(outputs[0]))

Before training, freeze most of the model so that only a small subset of the parameters is trainable.

In [None]:
# Leave only the parameters whose dotted name has a component equal to "23"
# trainable and freeze everything else.
# NOTE(review): presumably "23" is the index of the last transformer block -
# confirm against the names printed by this cell.
for name, param in model.named_parameters():
    print(f"{name}   Size: {param.numel()/1000**2:.1f}M parameters")
    # Compare whole name components instead of substrings so that names that
    # merely contain "23" (e.g. "123" or "230") are never picked up by accident.
    is_trainable = '23' in name.split('.')
    # Set the flag explicitly in both directions so re-running the cell always
    # yields the same state.
    param.requires_grad_(is_trainable)
    print(name, param.requires_grad)

In [None]:
# Count the parameters that actually still require gradients, i.e. the ones a
# training step will update, instead of re-deriving the set from a name pattern.
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)

print(f" Trainable params: {trainable_params/1000**2:.1f}M parameters")


Prepare the "training set"

In [None]:
# Tiny fine-tuning corpus: three sentences that all state a liking for reading.
train_set = [
    "I read all kinds of books. I really like crime stories and sci-fi novels.",
    "I read books all the time.",
    "Reading books is my favourite activity.",
]

In [None]:
# Reuse the end-of-sequence token as the padding token so that the sentences
# can be padded to a common length when they are batched.
# NOTE(review): presumably the tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize all training sentences as one batch of PyTorch tensors, padding
# them so they share a common length.
tokenized_train_set = tokenizer(
    train_set,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Switch the model to training mode before the fine-tuning step.
model.train()

In [None]:
# AdamW over the model's parameters with a learning rate of 3e-4.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
)
# Cross-entropy that skips every target equal to the padding token id.
lossf = torch.nn.CrossEntropyLoss(
    ignore_index=tokenizer.pad_token_id,
)

In [None]:
# Forward pass over the whole training batch with gradients enabled.
# .to() hands back the same batch object with its tensors on the GPU.
tokenized_train_set = tokenized_train_set.to("cuda")
outputs = model(**tokenized_train_set)

In [None]:
# Flatten (batch, time) into one axis so every position becomes a single
# classification example over the C vocabulary entries.
logits = outputs.logits
B, T, C = logits.shape
# The model was loaded in bfloat16; compute the loss in float32 so the
# softmax over the vocabulary does not lose precision.
logits = logits.float().reshape(B*T, C)

# Next-token targets: shift every row one position to the left. The roll is
# restricted to the sequence axis so tokens can never leak between sentences.
targets = tokenized_train_set['input_ids'].roll(-1, dims=1)
# The last position has no next token; mark it as padding, which lossf ignores.
targets[:, -1] = tokenizer.pad_token_id
targets = targets.reshape(B*T)

loss = lossf(logits, targets)
loss.item()

In [None]:
# One optimisation step: backpropagate the loss, apply the parameter update,
# then clear the gradients so they do not accumulate into a later step.
loss.backward()
optimizer.step()
optimizer.zero_grad()

In [None]:
# Run the training batch again only to measure the loss after the update.
# No backward pass follows, so autograd tracking is disabled to avoid keeping
# an unused computation graph in GPU memory.
with torch.no_grad():
    outputs = model(**tokenized_train_set.to("cuda"))

In [None]:
# Same loss as before the optimisation step, recomputed on the new outputs.
# Flatten (batch, time) into one axis so every position becomes a single
# classification example over the C vocabulary entries.
logits = outputs.logits
B, T, C = logits.shape
# The model was loaded in bfloat16; compute the loss in float32 so the
# softmax over the vocabulary does not lose precision.
logits = logits.float().reshape(B*T, C)

# Next-token targets: shift every row one position to the left. The roll is
# restricted to the sequence axis so tokens can never leak between sentences.
targets = tokenized_train_set['input_ids'].roll(-1, dims=1)
# The last position has no next token; mark it as padding, which lossf ignores.
targets[:, -1] = tokenizer.pad_token_id
targets = targets.reshape(B*T)

loss = lossf(logits, targets)
loss.item()

In [None]:
# Back to evaluation mode for the predictions that follow the training step.
model.eval()

In [None]:
# Recompute the 10 most probable next tokens for the prompt after the training step.
# This is pure inference, so autograd tracking is switched off to avoid
# building a computation graph that is never used.
with torch.no_grad():
    # logits[0] picks the single prompt in the batch, [-1, :] the scores at its
    # last position; they are moved to the CPU and upcast for the softmax.
    logits = model(**tokenized_text.to('cuda')).logits[0][-1, :].cpu().to(torch.float32)
# Pass dim explicitly: relying on the implicit dimension is deprecated and
# emits a warning.
top10 = torch.nn.functional.softmax(logits, dim=-1).topk(10)
words = tokenizer.batch_decode(top10.indices)
# Express the probabilities in percent for the plot.
probabilities = top10.values * 100

In [None]:
# The bare "seaborn" style name was deprecated in Matplotlib 3.6 (renamed to
# "seaborn-v0_8") and later removed, so pick whichever name this installation
# provides and fall back to the default style if neither exists.
seaborn_style = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(seaborn_style)

# Horizontal bar chart: one bar per candidate next word, length = probability in percent.
plt.figure()
plt.barh(np.arange(0, 10), probabilities)
plt.title("top 10 most probable words")
plt.yticks(np.arange(0, 10), words)
plt.xlabel("[%]")
plt.show()

In [None]:
# Sample a continuation of the same prompt again, with the same settings as
# before: up to 64 new tokens, top_k of 35 and a nucleus (top_p) of 0.95.
outputs = model.generate(
    **tokenized_text.to('cuda'),
    do_sample=True,
    top_k=35,
    top_p=0.95,
    max_new_tokens=64,
)

In [None]:
# Decode the newly sampled continuation back into text.
tokenizer.batch_decode(outputs)