## 1.0 Imports

In [2]:
import gzip
import json
from pathlib import Path
from pprint import pprint
from random import choice

## 2.0 Define Paths

In [3]:
# Path for preproceed Wikitionary
wiktionary_dir = Path("../../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir / "ar-wiktextract-data.json.gz"
wiki_indices_path = wiktionary_dir / "ar_reindex.json.gz"

In [4]:
# Path to save words status
output_dir = Path("../../data/interim/")

# 3.0 Load Wikitionary Files

In [5]:
# 2. load the wikitionary
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []
    for i, json_line in enumerate(gzip_obj):
        wikitionary.append(dict(json.loads(json_line)))
        print(f"Reading line-{i}", end="\r")

Reading line-872

Reading line-128636

In [6]:
# 3. load Indices
with gzip.open(wiki_indices_path, "rt", encoding="utf-8") as gzip_obj:
    wiki_idxs = json.load(gzip_obj)

# 4.0 View Dictionaries

## 4.1 Search for Arabic Word in Wiktionary

In [34]:
ar_word = "عصا"

In [35]:
if idxs := wiki_idxs.get(ar_word):
    for idx in idxs:
        print(idx)
        pprint(wikitionary[idx])
        print()

12009
{'عصا': {'categories': ['Arabic 2-syllable words',
                        'Arabic feminine nouns',
                        'Arabic final-weak form-I verbs',
                        'Arabic final-weak verbs',
                        'Arabic final-weak verbs by conjugation',
                        'Arabic form-I verbs',
                        'Arabic form-I verbs with و as third radical',
                        'Arabic lemmas',
                        'Arabic nouns',
                        'Arabic terms with IPA pronunciation',
                        'Arabic terms with homophones',
                        'Arabic verbs',
                        'Terms with redundant transliterations',
                        'Terms with redundant transliterations/ar'],
         'descendants': [{'depth': 1,
                          'templates': [{'args': {'1': 'mt', '2': 'għasa'},
                                         'expansion': 'Maltese: għasa',
                                         

## 4.2 View Random Dictionary

View dictionaries to check where to find the information

In [22]:
# show sample example
random_word = choice(list(wiki_idxs.keys()))
random_word_idxs = wiki_idxs[random_word]
print("word, idx:", random_word, random_word_idxs)

word, idx: تجهدون [81531, 81532, 81533, 81534]


In [28]:
pprint(wikitionary[choice(random_word_idxs)])

{'تجهدون': {'categories': ['Arabic non-lemma forms', 'Arabic verb forms'],
            'etymology_number': 1,
            'etymology_templates': [],
            'etymology_text': '',
            'forms': [{'form': 'تُجْهَدُونَ', 'tags': ['canonical']},
                      {'form': 'tujhadūna', 'tags': ['romanization']}],
            'head_templates': [{'args': {'1': 'تُجْهَدُونَ', '2': 'I'},
                                'expansion': 'تُجْهَدُونَ • (tujhadūna) (form '
                                             'I)',
                                'name': 'ar-verb-form'}],
            'lang': 'Arabic',
            'lang_code': 'ar',
            'pos': 'verb',
            'senses': [{'form_of': [{'extra': 'jahada', 'word': 'جَهَدَ'}],
                        'glosses': ['second-person masculine plural non-past '
                                    'passive indicative of جَهَدَ (jahada)'],
                        'links': [['جَهَدَ', 'جهد#Arabic']],
                        'tags': 

# 4.3 View Wikitionary By Index

In [54]:
wiki_idx = 10506
pprint(wikitionary[wiki_idx])

{'اسلنطح': {'categories': ['Arabic form-IIIq verbs',
                           'Arabic lemmas',
                           'Arabic sound form-IIIq verbs',
                           'Arabic sound verbs',
                           'Arabic sound verbs by conjugation',
                           'Arabic verbs',
                           'Arabic verbs with quadriliteral roots',
                           'Terms with redundant transliterations',
                           'Terms with redundant transliterations/ar'],
            'etymology_templates': [{'args': {'1': 'س ل ط ح'},
                                     'expansion': 'س ل ط ح (s-l-ṭ-ḥ)',
                                     'name': 'ar-root'},
                                    {'args': {'1': 'ar',
                                              '2': 'سُلْطُح',
                                              '3': '',
                                              '4': 'slanting, rising '
                                            

# 5.0 Check Values in Wikitionaries

## 5.1 Check Values in Categories

- List `categories` values in:
  - `WIKITIONARY_DICT["categories"]: List[str]`
  - `Senses[i]["categories"]` where `senses` is `WIKITIONARY_DICT["senses"]: List[Dict[str: List[str]]]`


In [53]:
import re

pattern = r'^(I|II|III|IV|V|VI|VII|VIII|IX|X|XI)$'


categories_values = set()
verb_forms = set()
for i, entry in enumerate(wikitionary):
    verb_form = []
    for _, word_data in entry.items():
        values = word_data.get("categories", [])
        if word_data["pos"] != "verb":
            continue
        for value in values:
            if match := re.search(r"(?<=form-)\w+(?= verbs)", value):
                if match.group() == "IIIq":
                    print(i)
                verb_form.append(match.group())
            categories_values.update(values)
    verb_forms.update([" ".join(verb_form)])
        # for sense in word_data.get("senses", []):
        #     if categories_senses := sense.get("categories", []):
        #         categories_values.update(categories_senses)

# pprint(sorted(list(categories_values)))
pprint(verb_forms)

10506
10506
22473
22473
23260
23260
24582
24582
24596
24596
{'',
 'I I',
 'I I I',
 'I I I I',
 'I I I I I',
 'I I I I I I',
 'I I I I I I I',
 'I I II II I II',
 'I I III III I',
 'I I IV IV I',
 'I II I I I II II II I II',
 'I II I I II',
 'I II I I II II',
 'I II I II',
 'I II I II I I I II II',
 'I II I II I I I II II II',
 'I II I II II',
 'II II',
 'II II II',
 'II II II II',
 'II II II II II',
 'II II II II II II',
 'II II IV IV II',
 'III III',
 'III III III',
 'III III III III',
 'III III III III III',
 'III III IV IV III IV',
 'III IV III III III IV IV IV III IV',
 'IIIq IIIq',
 'IIq IIq',
 'IIq IIq IIq',
 'IV IV',
 'IV IV IV',
 'IV IV IV IV',
 'IV IV IV IV IV',
 'IV Iq Iq Iq IV',
 'IVq IVq',
 'IVq IVq IVq',
 'IX IX',
 'Iq Iq',
 'Iq Iq Iq',
 'Iq Iq Iq Iq',
 'Iq V Iq V',
 'V V',
 'V V V',
 'V V V V',
 'V V V V V',
 'VI VI',
 'VI VI VI',
 'VI VI VI VI VI',
 'VII VII',
 'VII VII VII',
 'VII VII VII VII VII',
 'VIII VIII',
 'VIII VIII VIII',
 'VIII VIII VIII VIII',
 'VIII VIII VI