In [11]:
import os
from llama_index.llms.ollama import Ollama
import json
from dataclasses import dataclass, astuple, fields
from db import DB, Corpus, Doc, Line, Form, Predict, Lemma, LemmaRaw
from pandas import DataFrame as df

In [2]:
@dataclass
class Model:
  llama_1b   : str = "llama3.2:1b"
  llama_3b   : str = "llama3.2:3b"
  llama_8b   : str = "llama3.1:8b"
  llama_70b  : str = "llama3.3:10b"
  mistral_7b : str = "mistral:7b"
  mistral_12b: str = "mistral-nemo:12b"
  mistral_22b: str = "mistral-small:22b"

print(astuple(Model()))
print([f.default for f in fields(Model)])

('llama3.2:1b', 'llama3.2:3b', 'llama3.1:8b', 'llama3.3:10b', 'mistral:7b', 'mistral-nemo:12b', 'mistral-small:22b')
['llama3.2:1b', 'llama3.2:3b', 'llama3.1:8b', 'llama3.3:10b', 'mistral:7b', 'mistral-nemo:12b', 'mistral-small:22b']


In [None]:
class LLM:
  temperature = 0.0
  request_timeout = 3600.0
  query = "Сколько букв в слове привет? Выведи каждую букву отдельно."
  text = (
    # 'Mæg gehyran'
    'Mæg gehyran se ðe wyle be þam halgan mædene Eugenian Philyppus dæhter hu heo ðurh mægðhad mærlice þeah and þurh martyrdom þisne middaneard oferswað\n'
    'Sum æþelboren þægn wæs Philippus gehaten ðone asende se casere Commodus þe on ðam dagum rixode fram Rome.byrig to ðære byrig ðe is gehaten Alexandria'
  )
  path_example = 'd/llm/examples'

  def __init__(self, path=path_example, **kwargs):
    defaults = {
        "model": Model.mistral_7b,
        "temperature": 0.0,
        "request_timeout": 3600.0
    }
    self.llm = Ollama(**{**defaults, **kwargs})
    self.path = f'{path}/{self.llm.model}'
    if not os.path.exists(self.path):
      os.makedirs(self.path)

  def complete(self, query=query):
    self.response = self.llm.complete(query)
    with open(f'{self.path}/raw.json', 'w') as f:
      f.write(self.response.model_dump_json(indent=2))
    return self.response

  def debug(self):
    print(self.response.text)
    print()
    print(self.response.raw)

  def lemmatize_old_english(self, text=text):
      prompt = f"""
      Perform lemmatization of the following Old English text:
      {text}

      Return the result as a JSON array where each item contains:
      - word_form: the original word form
      - lemma: the lemma of the word
      - translation_en: the English translation of the lemma
      - translation_ru: the Russian translation of the lemma
      - morph_analysis: morphological analysis
      - syntax_analysis: syntactic analysis
      The result should be just json without formatting and text descriptions.
      """
      self.json = json.loads(self.complete(prompt).text)
      with open(f'{self.path}/data.json', 'w') as f:
        json.dump(self.json, f, indent=2, ensure_ascii=False)
      return self.json
  
llm = LLM()
# llm.complete()
# llm.debug()
llm.lemmatize_old_english()

[{'word_form': 'Mæg',
  'lemma': 'magan',
  'translation_en': 'can, may',
  'translation_ru': 'может, може',
  'morph_analysis': 'pronoun, 3rd person singular present subjunctive of magan',
  'syntax_analysis': 'subject'},
 {'word_form': 'gehyran',
  'lemma': 'gehyran',
  'translation_en': 'hear, listen',
  'translation_ru': 'слушать',
  'morph_analysis': 'verb, 3rd person singular present indicative of gehyran',
  'syntax_analysis': 'object'}]

In [4]:
class LLM_Stream(LLM):
  def stream_complete(self, query = LLM.query):
    for response in self.llm.stream_complete(query):
      print(response, end="\r")
    self.response = response
    return response

llm_stream = LLM_Stream()
llm_stream.stream_complete("Сколько букв в слове привет?")
# llm_stream.stream_complete("Напиши длинное стихотворение")
llm_stream.debug()

 В слове "привет" - пять букв.

{'model': 'mistral:7b', 'created_at': '2025-01-10T04:44:14.380616Z', 'done': True, 'done_reason': 'stop', 'total_duration': 453208458, 'load_duration': 3639416, 'prompt_eval_count': 17, 'prompt_eval_duration': 169000000, 'eval_count': 16, 'eval_duration': 279000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None), 'usage': {'prompt_tokens': 17, 'completion_tokens': 16, 'total_tokens': 33}}


In [5]:
db = DB()
# db.init()
df(db.get_doc())

Unnamed: 0,id,doc,corpus
0,1,æls.conll,iswoc
1,2,apt.conll,iswoc
2,3,chrona.conll,iswoc
3,4,or.conll,iswoc
4,5,wscp.conll,iswoc


In [10]:
db.df(Line, 2)

Unnamed: 0,id,doc_id,num,line,lemmas
0,1,1,0,Mæg gehyran se ðe wyle be þam halgan mædene Eu...,mag gehyran se þe willan be se halig mægden Eu...
1,2,1,1,Sum æþelboren þægn wæs Philippus gehaten ðone ...,sum æþelboren þegen wesan Philippus gehatan se...
