In [1]:
from butter_fingers import _butter_finger
import numpy as np
from openai import OpenAI
from multiprocessing.dummy import Pool as ThreadPool
import elasticsearch
from pydantic import BaseModel

from dotenv import load_dotenv
load_dotenv("../.env")

True

In [2]:
class TyposResponse(BaseModel):
    text_with_typos: str
    
PROMPT = "Wprowadź do tekstu jakieś częste literówki: `{text}`"

In [3]:
def introduce_typos(text):
    client = OpenAI()
    response = client.responses.parse(
        input=PROMPT.format(text=text),
        model="gpt-4o-mini",
        temperature=0.0,
        text_format=TyposResponse,
    )
    return response.output_parsed.text_with_typos

def butter_finger(text):
    return _butter_finger(text, prob=0.1, keyboard="qwerty", seed=0, max_outputs=1)[0]

In [4]:
print("GPT-typos:", introduce_typos("Dokąd nocą tupta jeż?"))
print("Butter-fingers:", butter_finger("Dokąd nocą tupta jeż?"))

GPT-typos: Dokąd nocą tupta jeż?
Butter-fingers: Dokąf nocą tupta jeż?


In [5]:
es = elasticsearch.Elasticsearch(hosts="http://localhost:9200")

def sample_documents(index):
    res = es.search(index=index, body={"size": 20, "query": {"function_score": {"random_score": {}}}})
    return res['hits']['hits']

In [18]:
import difflib
import re

def find_typo_corrections(original: str, with_typos: str) -> list[tuple[str, str]]:
    """Znajduje różnice między tekstem z literówkami a oryginałem - zwraca całe słowa."""
    corrections = []
    
    # Dzielimy na słowa
    original_words = original.split()
    typos_words = with_typos.split()
    
    # Używamy SequenceMatcher do znalezienia różnic na poziomie słów
    matcher = difflib.SequenceMatcher(None, typos_words, original_words)
    
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            # Słowa zostały zamienione
            typo_phrase = " ".join(typos_words[i1:i2])
            correct_phrase = " ".join(original_words[j1:j2])
            if typo_phrase and correct_phrase:
                corrections.append((typo_phrase, correct_phrase))
        elif tag == 'delete':
            # Słowa zostały usunięte w with_typos (były dodatkowe)
            typo_phrase = " ".join(typos_words[i1:i2])
            if typo_phrase:
                corrections.append((typo_phrase, ""))
        elif tag == 'insert':
            # Słowa zostały dodane w oryginale (brakowały w with_typos)
            correct_phrase = " ".join(original_words[j1:j2])
            if correct_phrase:
                corrections.append(("", correct_phrase))
    
    return corrections


def format_thinking_answer(original: str, with_typos: str) -> str:
    """
    Formatuje dane w stylu Deepseek-R1:
    <think>
    x -> y
    a -> b
    </think>
    <answer>poprawiony tekst</answer>
    """
    corrections = find_typo_corrections(original, with_typos)
    
    # Budujemy sekcję <think>
    think_lines = []
    for typo, correct in corrections:
        if typo and correct:
            think_lines.append(f"{typo} -> {correct}")
        elif typo:
            think_lines.append(f"usuń: {typo}")
        elif correct:
            think_lines.append(f"dodaj: {correct}")
    
    think_section = "<think>\\n" + "\\n".join(think_lines) + "\\n</think>"
    answer_section = f"<answer>{original}</answer>"
    
    return f"{think_section}\\n{answer_section}"


def prepare_doc_thinking(doc):
    """Przygotowuje dokument w formacie SFT z thinking."""
    text = doc['_source']['text']
    random_ceil = min(1000, len(text))
    random_begin = np.random.randint(0, len(text) - random_ceil + 1)
    original_passage = text[random_begin:random_begin + random_ceil]

    # Wprowadzamy literówki
    if np.random.rand() <= 0.5:
        passage_with_typos = butter_finger(original_passage)
    else:
        passage_with_typos = introduce_typos(original_passage)

    # Formatujemy w stylu thinking/answer
    formatted_output = format_thinking_answer(original_passage, passage_with_typos)
    
    return passage_with_typos, formatted_output


In [10]:
# Test nowej funkcji formatującej
original = "Dokąd nocą tupta jeż?"
with_typos = "Dokąf nocą tupta jeż?"

result = format_thinking_answer(original, with_typos)
print(result)
print("\n" + "="*50 + "\n")

# Test z dłuższym tekstem
original2 = "To jest przykładowy tekst z wieloma słowami."
with_typos2 = "To jst przykładowy tekzt z wielloma słowami."
result2 = format_thinking_answer(original2, with_typos2)
print(result2)


<think>
Dokąf -> Dokąd
</think>
<answer>Dokąd nocą tupta jeż?</answer>


<think>
jst -> jest
tekzt -> tekst
wielloma -> wieloma
</think>
<answer>To jest przykładowy tekst z wieloma słowami.</answer>


In [13]:
def prepare_doc(doc):
    text = doc['_source']['text']
    random_ceil = min(1000, len(text))
    random_begin = np.random.randint(0, len(text) - random_ceil+1)
    passage = text[random_begin:random_begin+random_ceil]
    
    # Remove first and last words, as they might be incomplete
    passage = passage.split(" ")[1:-1]
    passage = " ".join(passage)

    if np.random.rand() <= 0.5:
        transformed = butter_finger(passage)
    else:
        transformed = introduce_typos(passage)

    return passage, transformed

In [14]:
from threading import Thread, Lock
import tqdm
import csv

lock = Lock()

def prepare_dataset(dname="dataset_train.csv", prepare_func=prepare_doc):
    def _inner(doc):
        try:
            passage, transformed = prepare_func(doc)

            with lock:
                with open(dname, "a") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        passage.replace("\n", " "),
                        transformed.replace("\n", " "),
                    ])
        except Exception as e:
            print(e)

    threads = []
    documents_judiciary = sample_documents("orzeczenia_03_2024_search-giga-index")
    documents_eurlex = sample_documents("eurlex_03_2025_search-giga-index")
    documents_acts = sample_documents("ustawy_06_2025_search-giga-index")
    documents_eureka = sample_documents("eureka_03_2024_search-giga-index")
    docuements_codexes = sample_documents("kodeksy_06_2025_search-giga-index")
    documents_kio = sample_documents("kio_exam_07_2025_search")
    documents = documents_judiciary + documents_eurlex + documents_acts + documents_eureka + docuements_codexes + documents_kio
    for doc in documents:
        t = Thread(target=_inner, args=(doc,))
        t.start()
        threads.append(t)
    
    for t in threads:
        t.join()

In [9]:
for i in tqdm.trange(100):
    prepare_dataset("dataset_train.csv")

 20%|██        | 20/100 [10:48<2:24:05, 108.07s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 45528 [type=json_invalid, input_value='{"text_with_typos":"sied...awczej do Sądu Okręg', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 32%|███▏      | 32/100 [17:39<1:38:15, 86.69s/it] 

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 65345 [type=json_invalid, input_value='{"text_with_typos":"h po...stant to=TyposResponse ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 34%|███▍      | 34/100 [22:48<2:27:28, 134.07s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 46016 [type=json_invalid, input_value='{"text_with_typos":"g UZ...egradowalne patyczki do', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 48%|████▊     | 48/100 [30:47<1:16:12, 87.93s/it] 

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 44658 [type=json_invalid, input_value='{"text_with_typos":"eina... Sprawiedliwości, art.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 50%|█████     | 50/100 [37:02<2:08:18, 153.96s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 49305 [type=json_invalid, input_value='{"text_with_typos":" - I...ku towarowego-Względne', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 81%|████████  | 81/100 [50:13<33:07, 104.59s/it]  

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 53074 [type=json_invalid, input_value='{"text_with_typos":"zest... jakichkolwiek objawach', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 84%|████████▍ | 84/100 [56:00<36:24, 136.54s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 25021 [type=json_invalid, input_value='{"text_with_typos":"rze ...}  }  }  }  }  }  }  } ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 85%|████████▌ | 85/100 [56:21<25:27, 101.83s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 672 [type=json_invalid, input_value='{"text_with_typos":"ww. ...i jednoznaczny ustalone', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 86%|████████▌ | 86/100 [1:00:59<36:03, 154.52s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 45269 [type=json_invalid, input_value='{"text_with_typos":"ntru...o groszy) stanowiącej ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 93%|█████████▎| 93/100 [1:08:45<13:59, 119.88s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 25133 [type=json_invalid, input_value='{"text_with_typos":"\\" ... }  }  }  }  }  }  }  }', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 96%|█████████▌| 96/100 [1:14:17<09:08, 137.12s/it]

1 validation error for TyposResponse
  Invalid JSON: EOF while parsing a string at line 1 column 45152 [type=json_invalid, input_value='{"text_with_typos":"ego ... orzecznik ZUS, członk', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


100%|██████████| 100/100 [1:15:16<00:00, 45.17s/it]


In [8]:
for i in tqdm.trange(10):
    prepare_dataset("dataset_test.csv")

 70%|███████   | 7/10 [01:55<00:49, 16.52s/it]


KeyboardInterrupt: 

## GRPO

In [None]:
for i in tqdm.trange(50):
    prepare_dataset("dataset_train_grpo_sft.csv", prepare_func=prepare_doc_thinking)

  0%|          | 0/50 [00:00<?, ?it/s]

In [22]:
for i in tqdm.trange(8):
    prepare_dataset("dataset_test_grpo_sft.csv", prepare_func=prepare_doc_thinking)

100%|██████████| 8/8 [02:25<00:00, 18.20s/it]
