In [16]:
from typing import List
from llama_cpp import Llama

In [17]:
class Explainer:
    """
    Summarises a list of chat messages with a GGUF model run through llama-cpp-python.

    The model file is obtained with ``Llama.from_pretrained`` and is prompted with the
    ``### Input`` / ``### Response`` template stored in ``llama_prompt``.
    """

    # Keyword arguments forwarded to every completion call made by ``predict``.
    generation_kwargs = {
        "max_tokens": 500,  # Max number of new tokens to generate
        "stop": ["<eos>"],  # Text sequences to stop generation on
        "echo": False,  # Echo the prompt in the output
        "top_k": 1  # Restrict sampling to the single most likely token
    }

    # Default keyword arguments forwarded to ``Llama.from_pretrained`` by ``__init__``.
    # NOTE(review): the prompt and the generated tokens presumably have to fit
    # into ``n_ctx`` together - confirm for long message lists.
    model_kwargs = {
        "n_ctx": 512,  # Context length to use
        "n_threads": 4,  # Number of CPU threads to use
        "n_gpu_layers": 0,  # Number of model layers to offload to GPU. Set to 0 if only using CPU
    }

    # The indentation inside the literal is part of the prompt text sent to the
    # model, so it must not be "cleaned up".
    llama_prompt = """
    ### Input:
    {}

    ### Response:
    {}"""

    def __init__(self, repo_id: str, model_file_name: str, **model_overrides) -> None:
        """
        Loads the model and stores it in ``self.llm``.

        :param repo_id: id of the repository that hosts the model file
        :param model_file_name: name of the GGUF file inside that repository
        :param model_overrides: optional keyword arguments for ``Llama.from_pretrained``;
            they take precedence over ``verbose=True`` and the ``model_kwargs`` defaults
        """
        # Merge into a fresh dict so the class-level defaults are never mutated.
        # ``model_kwargs`` used to be declared but never applied; it is now
        # actually handed to the loader. The former ``model_type="llama"``
        # argument is no longer passed.
        llama_options = {"verbose": True, **self.model_kwargs, **model_overrides}
        self.llm = Llama.from_pretrained(
            repo_id=repo_id,
            filename=model_file_name,
            **llama_options,
        )

    @staticmethod
    def __create_text_message(messages: List[str]) -> str:
        """
        Renders the messages as a bullet list, one ``- message`` line per entry

        :param messages: list of string messages
        :return: str with every message on its own newline-terminated line
        """
        return "".join(f"- {message}\n" for message in messages)

    def predict(self, messages: List[str]) -> str:
        """
        Generates a summary for the given messages

        :param messages: list of string messages
        :return: str generated text with surrounding whitespace removed;
            an empty string when there are no messages
        """
        # Nothing to summarise: skip the model call instead of prompting it
        # with an empty input section.
        if not messages:
            return ""

        messages_text = self.__create_text_message(messages)
        # The response slot is left empty so the model continues from there.
        prompt = self.llama_prompt.format(messages_text, '')
        summary = self.llm(prompt, **self.generation_kwargs)
        return summary["choices"][0]['text'].strip()


In [18]:
# Sample chat messages (in Russian) that are passed to the summariser below.
messages = [
    'дополнение к кате',
    'поиграть',
    'я могу не надолго отойти?',
    'на компьютере стикеры странные почему так',
    'А у меня интернет сегодня прям в ударе',
    'можно в крокодила поиграть',
    'посмотрите что за стикер',
    'реально, окантовка слишком большая',
    'на телефоне они какие-то аккуратненькие, а тут тяп ляп какой то',
    'корм',
    'эмодзи',
    'вор',
    'Человек',
    'пухляшь',
    'лунтик',
    'человек пьет кофе',
    'шлепа',
]

In [20]:
# Build the summariser from the GGUF file published in the given repository.
explain = Explainer(
    repo_id="gromoboy/gemma_gguf_v2",
    model_file_name="gemma_gguf_v2-unsloth.Q4_K_M.gguf",
)

./gemma_gguf_v2-unsloth.Q4_K_M.gguf:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 27 key-value pairs and 164 tensors from /root/.cache/huggingface/hub/models--gromoboy--gemma_gguf_v2/snapshots/f300c3770aec5f164b0f405e65ae2ff6038e3ab9/./gemma_gguf_v2-unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = gemma_gguf_v2
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          gemma.block_count u32              = 18
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.at

In [21]:
%%time
# Summarise the sample messages; the %%time cell magic above has to stay on
# the first line of the cell and reports how long the call took.
explain.predict(messages)


llama_print_timings:        load time =    2617.88 ms
llama_print_timings:      sample time =     201.55 ms /    53 runs   (    3.80 ms per token,   262.96 tokens per second)
llama_print_timings: prompt eval time =    2617.53 ms /   149 tokens (   17.57 ms per token,    56.92 tokens per second)
llama_print_timings:        eval time =    2489.97 ms /    52 runs   (   47.88 ms per token,    20.88 tokens per second)
llama_print_timings:       total time =    7482.32 ms /   201 tokens


CPU times: user 2min 44s, sys: 15.9 s, total: 2min 59s
Wall time: 7.49 s


'Негативные аспекты: технические неполадки, проблемы с интернет-соединением. Положительные аспекты: возможность поиграть в компьютерные игры и в крокодилов, интерес к стикерам, возможность накормить животных.'

In [2]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.65.tar.gz (38.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.65-cp310-cp310-linux_x86_64.whl size=3329810 sha256=241dc7