In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
import random


# Hugging Face Hub identifier of the checkpoint loaded below
model_name = 'eryk-mazus/polka-1.1b'
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Both objects are module-level globals that the cells below read directly
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [2]:
 # Peek at a handful of (token, id) pairs instead of rendering the whole vocabulary
 list(tokenizer.get_vocab().items())[:20]



In [3]:
import re

def find_relevant_suffix(s):
    """Return the tail of ``s`` that starts at its last word.

    A word is a maximal run of regex word characters, so the result is the
    last word together with any punctuation or whitespace that follows it.

    Args:
        s: Text to inspect.

    Returns:
        The substring of ``s`` from the start of its last word to the end of
        the string, or an empty string when ``s`` contains no word characters.
    """
    last_word = None
    # Keep only the final word match; its start offset is exact, so the string
    # does not have to be searched a second time to locate the word
    for last_word in re.finditer(r'\w+', s):
        pass

    if last_word is None:
        # Empty or punctuation-only input: there is no word to anchor a suffix on
        return ""
    return s[last_word.start():]


In [4]:
from transformers import LogitsProcessor

def split_string_by_punctuation_and_whitespace(s: str) -> list:
    """Cut ``s`` at every run of whitespace or non-word characters.

    Leading or trailing separators produce empty strings at the matching end
    of the result, and an empty input yields ``['']``.
    """
    separator = re.compile(r'[\s\W]+')
    pieces = separator.split(s)
    return pieces

class StrictStartingLetterProcessor(LogitsProcessor):
    """Mask out every next token that would break the "same first character" rule.

    For each sequence in the batch, the text generated so far is decoded with
    the module-level ``tokenizer`` and its first character becomes the required
    prefix. A candidate token survives only if, once its vocabulary string is
    appended to the last word of that text, every non-empty piece produced by
    ``split_string_by_punctuation_and_whitespace`` starts with the prefix. The
    comparison is case-sensitive.

    NOTE(review): special tokens are checked by their vocabulary string like any
    other token, so an end-of-sequence token is presumably blocked for most
    prefixes - confirm whether that is intended.
    """

    def __init__(self):
        super().__init__()
        # Vocabulary strings do not change between generation steps, so they are
        # looked up once and reused instead of being converted again at every step
        self._token_strings = None

    def _vocab_tokens(self, vocab_size):
        """Return the vocabulary string for every id in ``range(vocab_size)`` (cached)."""
        if self._token_strings is None or len(self._token_strings) != vocab_size:
            self._token_strings = tokenizer.convert_ids_to_tokens(list(range(vocab_size)))
        return self._token_strings

    @staticmethod
    def _breaks_rule(text, prefix):
        """Return True if any non-empty piece of ``text`` does not start with ``prefix``."""
        pieces = split_string_by_punctuation_and_whitespace(text)
        return any(piece and not piece.startswith(prefix) for piece in pieces)

    def __call__(self, input_ids, scores):
        """Set the scores of disallowed tokens to ``-inf`` in place.

        Args:
            input_ids: Token ids generated so far, read as ``input_ids[row]``.
            scores: Next-token scores, written as ``scores[row, token_id]``.

        Returns:
            The same ``scores`` object, modified in place.
        """
        vocab_tokens = self._vocab_tokens(scores.size(1))

        # Every row is constrained, not just the first one, so batched
        # generation is filtered consistently
        for row in range(scores.size(0)):
            generated_sentence = tokenizer.decode(input_ids[row], skip_special_tokens=True)
            if not generated_sentence:
                # Nothing decoded yet, so there is no first character to enforce
                continue
            prefix = generated_sentence[0]
            generated_suffix = find_relevant_suffix(generated_sentence)

            # Ids without a token string are banned outright
            banned = [
                not token or self._breaks_rule(generated_suffix + token, prefix)
                for token in vocab_tokens
            ]
            # One masked write per row instead of one tensor assignment per token
            banned_mask = torch.tensor(banned, dtype=torch.bool, device=scores.device)
            scores[row].masked_fill_(banned_mask, -float("inf"))
        return scores

def generate_sentence_with_custom_logits(prefix, max_length, top_k, top_p, temperature, repetition_penalty=1.8):
    """Sample one continuation of ``prefix`` filtered by ``StrictStartingLetterProcessor``.

    The prompt is lower-cased before tokenization. The decoded text is passed
    through ``str.capitalize()`` and a trailing ``"."`` is appended.

    Args:
        prefix: Prompt text that starts the sentence.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        repetition_penalty: Forwarded to ``model.generate``; defaults to 1.8,
            the value that used to be hard-coded here.

    Returns:
        The generated sentence as a string.
    """
    encoded = tokenizer(prefix.lower(), return_tensors="pt").to(device)

    # Attach the custom logits processor to the generation call
    logits_processor = [StrictStartingLetterProcessor()]

    output = model.generate(
        input_ids=encoded["input_ids"],
        # The pad token is set to the EOS token below, so the mask is passed
        # explicitly instead of leaving generate() to guess it
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        logits_processor=logits_processor,  # applies the starting-letter constraint
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text.capitalize() + "."

def generate_best_sentence_with_filter(prefix, max_length, top_k, top_p, temperature, n_variants=2):
    """Generate ``n_variants`` sentences for ``prefix`` and return the longest.

    Each variant comes from ``generate_sentence_with_custom_logits`` called
    with the same arguments. When several variants share the maximum length,
    the one generated first is returned.
    """
    variants = []
    for _ in range(n_variants):
        sentence = generate_sentence_with_custom_logits(prefix, max_length, top_k, top_p, temperature)
        variants.append(sentence)

    # "Best" currently means longest by character count
    return max(variants, key=len)


In [12]:
# Demo: the longest of the sampled variants for this prompt
generate_best_sentence_with_filter(
    prefix="Nie należy natomiast",
    max_length=30,
    top_k=5,
    top_p=0.9,
    temperature=0.9,
)

'Nie należy natomiast nadmiernie naciskać na nogi, np. nawet nieco "nacierając" nią.'

In [6]:
# Demo: same sampling settings, prompt whose words start with "c"
generate_best_sentence_with_filter(
    prefix="Coraz częściej cukiernie",
    max_length=30,
    top_k=5,
    top_p=0.9,
    temperature=0.9,
)

'Coraz częściej cukiernie, ciasta , ciastka czy chałwy. - ciekawostki, .'

In [7]:
# Demo: same sampling settings, prompt that ends with a comma
generate_best_sentence_with_filter(
    prefix="Ponieważ powiedział pan,",
    max_length=30,
    top_k=5,
    top_p=0.9,
    temperature=0.9,
)

'Ponieważ powiedział pan, „przecież pana przyjaciółka” – panią profesor, przypominała,.'

In [8]:
# Demo: same sampling settings, prompt whose words start with "d"
generate_best_sentence_with_filter(
    prefix="Dlaczego dopuścił do",
    max_length=30,
    top_k=5,
    top_p=0.9,
    temperature=0.9,
)

'Dlaczego dopuścił do dzisiejszych dziejów, �darymskie - ��dzkie. [....'

In [9]:
# Demo: same sampling settings, prompt made of a repeated word
generate_best_sentence_with_filter(
    prefix="Cudo, cudo, cudo,",
    max_length=30,
    top_k=5,
    top_p=0.9,
    temperature=0.9,
)

"Cudo, cudo, cudo, czyli coś, co czuję | ♥ ~ ~ ~ - ★ ~~\\'~."