In [12]:
import json

# Load the word list. The encoding is given explicitly because the file holds
# Greek and Cyrillic text and the platform default encoding is not always UTF-8.
with open('data/greek_words.json', 'r', encoding='utf-8') as file:
    greek_words = json.load(file)

In [13]:
# Number of entries loaded from data/greek_words.json.
len(greek_words)

290

In [10]:
# Keep only the first entry for every distinct 'Форма', preserving the order
# in which entries appear in greek_words.
forms = []           # distinct forms in first-seen order (kept as a list, as before)
seen_forms = set()   # same contents as `forms`, used for O(1) membership tests
unique_entries = []
for entry in greek_words:
    form = entry['Форма']
    if form in seen_forms:
        continue
    seen_forms.add(form)
    forms.append(form)
    unique_entries.append(entry)

# Displayed as the cell result; equals the number of distinct forms.
len(unique_entries)

290

In [30]:
# Write the de-duplicated entries back to disk. ensure_ascii=False emits raw
# Greek/Cyrillic characters, so the file must be opened as UTF-8 explicitly;
# a non-UTF-8 platform default would raise UnicodeEncodeError here.
with open('data/greek_words.json', 'w', encoding='utf-8') as file:
    json.dump(unique_entries, file, ensure_ascii=False, indent=4)

In [31]:
# Build one {'Лема', 'Превод'} record per distinct lemma, taking the
# translation from the first entry in which that lemma occurs.
lemas = []           # distinct lemmas in first-seen order (kept as a list, as before)
seen_lemas = set()   # same contents as `lemas`, used for O(1) membership tests
lema_entries = []
for entry in greek_words:
    lema = entry['Лема']
    if lema in seen_lemas:
        continue
    seen_lemas.add(lema)
    lemas.append(lema)
    lema_entries.append({'Лема': lema, 'Превод': entry['Превод']})

# Displayed as the cell result: number of distinct lemmas.
len(lema_entries)

219

In [36]:
# Lemmas that occur in lema_entries1 but not in lema_entries.
# NOTE(review): lema_entries1 is only assigned by the cell that loads
# data/greek_words_standard.json, which appears further down; on a fresh
# top-to-bottom run this cell raises NameError.
{entry['Лема'] for entry in lema_entries1} - {entry['Лема'] for entry in lema_entries}

set()

In [37]:
# Load the standard lemma list. The encoding is given explicitly because the
# file holds Greek and Cyrillic text and the platform default may not be UTF-8.
with open('data/greek_words_standard.json', 'r', encoding='utf-8') as file:
    lema_entries1 = json.load(file)

In [40]:
# Overwrite the translation of every lemma that has a corrected value.
# NOTE(review): dct_corrected is built in a cell that appears further down.
for entry in lema_entries1:
    lemma = entry['Лема']
    if lemma not in dct_corrected:
        continue
    entry['Превод'] = dct_corrected[lemma]

In [42]:
# Save the corrected lemma list. ensure_ascii=False emits raw Greek/Cyrillic
# characters, so the file must be opened as UTF-8 explicitly; a non-UTF-8
# platform default would raise UnicodeEncodeError here.
with open('data/greek_words_standard.json', 'w', encoding='utf-8') as file:
    json.dump(lema_entries1, file, ensure_ascii=False, indent=4)

In [39]:
# Lemma -> corrected translation lookup built from the `corrected` records
# (a later record with the same lemma would replace an earlier one).
dct_corrected = dict((record["Лема"], record["Превод"]) for record in corrected)

In [38]:
# Manually corrected translations, one {"Лема", "Превод"} record per lemma.
# Written as (lemma, translation) pairs and expanded into dicts below.
corrected = [
    {"Лема": lemma, "Превод": translation}
    for lemma, translation in [
        ("παῖς", "дете"),
        ("λούω", "къпя"),
        ("ἀποπνίγω", "удавям"),
        ("αὐτός", "самият; той"),
        ("ἐγώ", "аз"),
        ("ὕστερος", "по-късен"),
        ("ἀπότομος", "стръмен"),
        ("πύλη", "порта"),
        ("πᾶς", "всеки"),
        ("τίς", "кой"),
        ("τετράπους", "четириног"),
        ("δίπους", "двуног"),
        ("τρίπους", "трикрак"),
        ("ὅμοιος", "подобен"),
        ("ἴσος", "равен"),
        ("ἑσπέριος", "западен"),
        ("Ἠλεῖος", "елейски  (жител на Елис)"),
        ("Μεσσήνιος", "месенски (жител на Месения)"),
        ("Ἀχαιός", "ахеец"),
        ("ὀργίζομαι", "гневя се"),
    ]
]

In [17]:
# Location of the Kaikki Ancient Greek dictionary dump (JSONL, parsed line by
# line in load_entries, which uses this path as its default argument).
JSONL_PATH = 'data/kaikki.org-dictionary-AncientGreek-words.jsonl'

In [18]:
import json, unicodedata
from collections import defaultdict

# JSONL_PATH = "ancient_greek.jsonl"  # put your Kaikki JSONL file here

def strip_diacritics(s: str) -> str:
    """Return `s` with every nonspacing combining mark removed.

    The string is decomposed to NFD and all characters of Unicode category
    "Mn" are dropped. For polytonic Greek this removes accents and also
    breathing marks, iota subscript and length marks, because all of them
    decompose to combining marks. Used as a loose, accent-insensitive key.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [c for c in decomposed if unicodedata.category(c) != "Mn"]
    return "".join(base_chars)

def load_entries(jsonl_path=JSONL_PATH, lang_code="grc"):
    """Read a JSONL dictionary dump and index its entries by headword.

    Each non-blank line is parsed as one JSON object. Objects whose
    "lang_code" differs from `lang_code`, or that have no "word", are skipped.

    Args:
        jsonl_path: Path of the JSONL file (opened as UTF-8).
        lang_code: Value that an entry's "lang_code" must equal to be kept.

    Returns:
        A defaultdict(list) mapping each word, and additionally its
        strip_diacritics() form when that differs, to the list of entry
        objects for it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    by_word = defaultdict(list)
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            if obj.get("lang_code") != lang_code:
                continue
            w = obj.get("word")
            if not w:
                continue
            by_word[w].append(obj)
            # Also index the stripped spelling for accent-insensitive search.
            # When the word has no diacritics both keys coincide; skip the
            # second append so the entry is not stored twice under one key.
            loose = strip_diacritics(w)
            if loose != w:
                by_word[loose].append(obj)
    return by_word

def get_entries(by_word, lemma: str, pos: str|None=None):
    """Look up entries for `lemma`, first by exact key and then by stripped key.

    Args:
        by_word: Mapping from word key to a list of entry dicts.
        lemma: Word to look up; its strip_diacritics() form is tried as well.
        pos: If given, only entries whose "pos" equals it are returned.

    Returns:
        Matching entries in lookup order, keeping the first entry for each
        distinct ("word", "pos", "etymology_number") triple.
    """
    first_by_identity = {}
    for key in (lemma, strip_diacritics(lemma)):
        if key not in by_word:
            continue
        for entry in by_word[key]:
            if pos is not None and entry.get("pos") != pos:
                continue
            identity = (entry.get("word"), entry.get("pos"), entry.get("etymology_number"))
            # setdefault keeps the first entry seen for an identity and
            # ignores later ones; dict order preserves first-seen order.
            first_by_identity.setdefault(identity, entry)
    return list(first_by_identity.values())

def list_glosses(entry):
    """Collect the gloss strings of all senses of `entry` into one flat list.

    For each item of entry["senses"], its "glosses" are used when that key is
    present; otherwise its "raw_glosses" are used when present; senses with
    neither contribute nothing.
    """
    collected = []
    for sense in entry.get("senses", []):
        for field in ("glosses", "raw_glosses"):
            if field in sense:
                collected.extend(sense[field])
                break  # "glosses" takes precedence over "raw_glosses"
    return collected

def filter_forms(entry, *, case=None, number=None, gender=None, tags_all=None):
    """Return the items of entry["forms"] whose "tags" satisfy every given filter.

    Args:
        entry: Dict that may carry a "forms" list of dicts with optional "tags".
        case, number, gender: When truthy, the value must be one of the form's tags.
        tags_all: When truthy, every element must be one of the form's tags.

    Returns:
        The matching form dicts, in their original order.
    """
    matched = []
    for form in entry.get("forms", []):
        form_tags = set(form.get("tags", []))
        single_ok = all(
            wanted in form_tags
            for wanted in (case, number, gender)
            if wanted
        )
        if not single_ok:
            continue
        if tags_all and not set(tags_all) <= form_tags:
            continue
        matched.append(form)
    return matched

def ipa_by_period(entry):
    """Return (note, ipa) pairs for the items of entry["sounds"] that have an IPA value.

    Items whose "ipa" is missing or empty are skipped. A missing or empty
    "note" is reported as "unspecified".
    """
    return [
        (sound.get("note") or "unspecified", sound.get("ipa"))
        for sound in entry.get("sounds", [])
        if sound.get("ipa")
    ]

# --- example usage ---
# Build the lookup index once; every lookup below reuses it.
by_word = load_entries(JSONL_PATH)


def _form_strings(entry, **criteria):
    """Return the "form" text of each form of `entry` accepted by filter_forms(**criteria)."""
    return [f["form"] for f in filter_forms(entry, **criteria)]


_RULE = "-" * 40  # separator printed after each entry

# 1) κύων (noun) — dog
k_entries = get_entries(by_word, "κύων", pos="noun")
for e in k_entries:
    print(f"WORD: {e['word']} | POS: {e['pos']}")
    print(f"Glosses: {list_glosses(e)[:5]}")
    print(f"Nom.Sg forms: {_form_strings(e, case='nominative', number='singular')}")
    print(f"Gen.Sg forms: {_form_strings(e, case='genitive', number='singular')}")
    print(f"IPA: {ipa_by_period(e)}")
    print(_RULE)

# 2) κύων (verb-form) — participle of κύω
kv_entries = get_entries(by_word, "κύων", pos="verb")
for e in kv_entries:
    print(f"WORD: {e['word']} | POS: {e['pos']}")
    print(f"First 3 glosses: {list_glosses(e)[:3]}")
    masc_nom_sg = _form_strings(e, gender="masculine", case="nominative", number="singular")
    print(f"Some forms (masc nom sg): {masc_nom_sg}")
    print(_RULE)

# 3) σκύλος (neuter) — skin/hide (3rd decl.)
s_entries = get_entries(by_word, "σκύλος", pos="noun")
for e in s_entries:
    print(f"WORD: {e['word']} | POS: {e['pos']}")
    print(f"Glosses: {list_glosses(e)}")
    print(f"Nom.Pl: {_form_strings(e, case='nominative', number='plural')}")
    print(_RULE)

# 4) Proper name: Κύπρος
c_entries = get_entries(by_word, "Κύπρος", pos="name")
for e in c_entries:
    print(f"WORD: {e['word']} | POS: {e['pos']}")
    print(f"Glosses: {list_glosses(e)}")
    singular_by_case = {
        lab: _form_strings(e, case=lab, number="singular")
        for lab in ("nominative", "genitive", "dative", "accusative")
    }
    print(f"Cases (singular): {singular_by_case}")
    print(_RULE)

WORD: κύων | POS: noun
Glosses: ['a dog', 'a bitch', 'a bitch (used of women, to denote shamelessness or audacity)', 'an offensive person']
Nom.Sg forms: ['ἡ κῠ́ων']
Gen.Sg forms: ['τῆς κῠνός']
IPA: [('unspecified', '/ký.ɔːn/'), ('unspecified', '/ˈcy.on/'), ('unspecified', '/ˈci.on/'), ('5ᵗʰ BCE Attic', '/ký.ɔːn/'), ('1ˢᵗ CE Egyptian', '/ˈky.on/'), ('4ᵗʰ CE Koine', '/ˈcy.on/'), ('10ᵗʰ CE Byzantine', '/ˈcy.on/'), ('15ᵗʰ CE Constantinopolitan', '/ˈci.on/')]
----------------------------------------
WORD: κύων | POS: verb
First 3 glosses: ['present active participle of κύω (kúō)']
Some forms (masc nom sg): ['κῠ́ων']
----------------------------------------
WORD: σκύλος | POS: noun
Glosses: ['skin, hide']
Nom.Pl: ['σκῠ́λη', 'σκῠ́λεᾰ']
----------------------------------------
WORD: Κύπρος | POS: name
Glosses: ['Cyprus (an island in the Mediterranean Sea)']
Cases (singular): {'nominative': ['Κῠ́προς'], 'genitive': ['Κῠ́πρου'], 'dative': ['Κῠ́πρῳ'], 'accusative': ['Κῠ́προν']}
-----------------

In [27]:
# Look up ἕως without a part-of-speech filter, so every matching entry is listed.
k_entries = get_entries(by_word, "ἕως")
for e in k_entries:
    print(f"WORD: {e['word']} | POS: {e['pos']}")
    print(f"Glosses: {list_glosses(e)[:5]}")
    nom_sg = [form["form"] for form in filter_forms(e, number="singular", case="nominative")]
    print(f"Nom.Sg forms: {nom_sg}")
    gen_sg = [form["form"] for form in filter_forms(e, number="singular", case="genitive")]
    print(f"Gen.Sg forms: {gen_sg}")
    print(f"IPA: {ipa_by_period(e)}")
    print("-" * 40)

WORD: ἕως | POS: noun
Glosses: ['Attic form of ἠώς (ēṓs)']
Nom.Sg forms: ['ἕως']
Gen.Sg forms: ['ἕω']
IPA: [('unspecified', '/hé.ɔːs/'), ('unspecified', '/ˈe.os/'), ('unspecified', '/ˈe.os/'), ('5ᵗʰ BCE Attic', '/hé.ɔːs/'), ('1ˢᵗ CE Egyptian', '/ˈ(h)e.os/'), ('4ᵗʰ CE Koine', '/ˈe.os/'), ('10ᵗʰ CE Byzantine', '/ˈe.os/'), ('15ᵗʰ CE Constantinopolitan', '/ˈe.os/')]
----------------------------------------
WORD: ἕως | POS: conj
Glosses: ['until, till', 'of a fact in past time', 'until, till', 'until, till', 'until, till']
Nom.Sg forms: []
Gen.Sg forms: []
IPA: [('unspecified', '/hé.ɔːs/'), ('unspecified', '/ˈe.os/'), ('unspecified', '/ˈe.os/'), ('5ᵗʰ BCE Attic', '/hé.ɔːs/'), ('1ˢᵗ CE Egyptian', '/ˈ(h)e.os/'), ('4ᵗʰ CE Koine', '/ˈe.os/'), ('10ᵗʰ CE Byzantine', '/ˈe.os/'), ('15ᵗʰ CE Constantinopolitan', '/ˈe.os/')]
----------------------------------------
WORD: ἕως | POS: adv
Glosses: ['for a time, like τέως (téōs)']
Nom.Sg forms: []
Gen.Sg forms: []
IPA: [('unspecified', '/hé.ɔːs/'), ('unsp