# 1.0 Imports

In [1]:
import gzip
import json
from pathlib import Path
from pprint import pprint
from random import choice

# 2.0 Define Paths

In [2]:
# Locations of the preprocessed Wiktionary dump and of its word index
wiktionary_dir = Path("../../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir.joinpath("ar-wiktextract-data.json.gz")
wiki_indices_path = wiktionary_dir.joinpath("ar_reindex.json.gz")

In [3]:
# Directory where the word-status output is saved (path is relative to the
# notebook's working directory)
output_dir = Path("../../data/interim/")

# 3.0 Load Wiktionary Files

In [4]:
# Parse the gzipped JSON Lines dump: each line becomes one dict in `wikitionary`
with gzip.open(wiki_jsonl_path, mode="rt", encoding="utf-8") as jsonl_stream:
    wikitionary = []
    for line_no, raw_line in enumerate(jsonl_stream):
        record = json.loads(raw_line)
        wikitionary.append(dict(record))
        # "\r" returns the cursor to the line start so the counter overwrites itself
        print(f"Reading line-{line_no}", end="\r")

Reading line-128636

In [5]:
# Read the gzipped JSON index; later cells look words up in it to obtain
# positions into `wikitionary`
with gzip.open(wiki_indices_path, mode="rt", encoding="utf-8") as index_stream:
    wiki_idxs = json.loads(index_stream.read())

# 4.0 View Dictionaries

## 4.1 Search for Arabic Word in Wiktionary

In [15]:
# Arabic word to look up in the Wiktionary index; edit this value to search
# for a different word
ar_word = "لاتفي"

In [16]:
# Print every entry indexed under `ar_word`; nothing is printed when the
# word is missing from the index (or maps to an empty list)
idxs = wiki_idxs.get(ar_word)
for idx in idxs or ():
    print(idx)
    pprint(wikitionary[idx])
    print()

124663
{'لاتفي': {'categories': ['Arabic 2-syllable words',
                          'Arabic adjectives',
                          'Arabic lemmas',
                          'Arabic masculine nouns',
                          'Arabic nouns',
                          'Arabic relative nouns (nisba)',
                          'Arabic terms with IPA pronunciation',
                          'Terms with redundant transliterations',
                          'Terms with redundant transliterations/ar'],
           'etymology_templates': [{'args': {'1': 'ar',
                                             '2': 'relative adjectives '
                                                  '(nisba)'},
                                    'expansion': '',
                                    'name': 'catlangname'},
                                   {'args': {'1': 'ar',
                                             '2': 'لَاتْفِيَا',
                                             '3': 'ِيّ',
             

## 4.2 View Random Dictionary

Print a randomly selected entry to see where each piece of information is stored.

In [8]:
# Draw one indexed word at random and show the entry positions recorded for it
random_word = choice(list(wiki_idxs))
random_word_idxs = wiki_idxs[random_word]
print("word, idx:", random_word, random_word_idxs)

word, idx: تك [119615, 119616, 119617]


In [9]:
# Pretty-print one of the sampled word's entries, chosen at random
sampled_idx = choice(random_word_idxs)
pprint(wikitionary[sampled_idx])

{'تك': {'categories': ['Arabic form-I verbs',
                       'Arabic geminate form-I verbs',
                       'Arabic geminate verbs',
                       'Arabic geminate verbs by conjugation',
                       'Arabic lemmas',
                       'Arabic non-lemma forms',
                       'Arabic verb forms',
                       'Arabic verbs',
                       'Terms with redundant transliterations',
                       'Terms with redundant transliterations/ar'],
        'etymology_number': 1,
        'etymology_templates': [{'args': {'1': 'ar'},
                                 'expansion': 'Onomatopoeic',
                                 'name': 'onomatopoeic'}],
        'etymology_text': 'Onomatopoeic.',
        'forms': [{'form': 'يَتُكُّ', 'roman': 'yatukku', 'tags': ['non-past']},
                  {'form': 'تَكَّ', 'tags': ['canonical']},
                  {'form': 'takka', 'tags': ['romanization']},
                  {'form': '', 

# 5.0 Check Values in Wiktionary Entries

## 5.1 Check Values in Categories

- List the `categories` values found in:
  - `WIKITIONARY_DICT["categories"]: List[str]`
  - `senses[i]["categories"]`, where `senses` is `WIKITIONARY_DICT["senses"]: List[Dict[str, List[str]]]`


In [10]:
# Gather every distinct category label, both at word level
# (word_data["categories"]) and at sense level (sense["categories"]).
categories_values = set()
for entry in wikitionary:
    # Each entry is a dict whose values hold the word's data; the keys are not needed here.
    for word_data in entry.values():
        # A single bulk update adds the whole list at once. The previous code
        # repeated this same update once per list element, which was redundant.
        categories_values.update(word_data.get("categories") or ())
        for sense in word_data.get("senses", []):
            # `or ()` keeps a missing, empty or None "categories" value a no-op.
            categories_values.update(sense.get("categories") or ())

# sorted() accepts any iterable, so no intermediate list is needed.
pprint(sorted(categories_values))

['Algerian Arabic',
 'Algerian Arabic terms with IPA pronunciation',
 'Andalusian Arabic',
 'Andalusian Arabic terms with IPA pronunciation',
 'Arabic 1-syllable words',
 'Arabic 2-syllable words',
 'Arabic 3-syllable words',
 'Arabic 4-syllable words',
 'Arabic 5-syllable words',
 'Arabic 6-syllable words',
 'Arabic abbreviations',
 'Arabic acronyms',
 'Arabic act-related adverbs',
 'Arabic active participles',
 'Arabic adjective feminine forms',
 'Arabic adjective forms',
 'Arabic adjective plural forms',
 'Arabic adjectives',
 'Arabic adverbial accusatives',
 'Arabic adverbs',
 'Arabic archaic forms',
 'Arabic archaic terms',
 'Arabic articles',
 'Arabic assimilated form-I verbs',
 'Arabic assimilated form-II verbs',
 'Arabic assimilated form-III verbs',
 'Arabic assimilated form-IV verbs',
 'Arabic assimilated form-Iq verbs',
 'Arabic assimilated form-V verbs',
 'Arabic assimilated form-VI verbs',
 'Arabic assimilated form-VIII verbs',
 'Arabic assimilated form-X verbs',
 'Arabic a