# Process Wiktionary Dump File

## Imports

In [7]:
import gzip
import json
from collections import defaultdict
from typing import List
from unicodedata import normalize

from pyarabic.araby import DIACRITICS, SHADDA, name
from tqdm.notebook import tqdm

## Define Paths

In [8]:
# Raw wiktextract dump: gzip-compressed, one JSON object per line.
gzip_inpath = "../../data/raw/wikidata/raw-wiktextract-data.json.gz"
# Destination for the Arabic-only entries, re-keyed by word.
gzip_outpath = "../../data/processed/wikidata/ar-wiktextract-data.json.gz"
# Destination for the word -> entry-index lookup table.
gzip_reindexedpath = "../../data/processed/wikidata/ar_reindex.json.gz"

## Define Variables

In [9]:
# Rebind the imported pyarabic collection as a set so the membership test in
# reorder_shadda is a hash lookup.
DIACRITICS = set(DIACRITICS)

## Define Utility Functions

In [10]:
def reorder_shadda(ar_string: str) -> str:
    """Move each shadda after the diacritic(s) that directly follow it.

    The text is scanned left to right. Whenever a ``SHADDA`` is immediately
    followed by a character found in ``DIACRITICS``, the two are swapped.
    Because the swap is done in place during the scan, a shadda keeps moving
    right past a run of consecutive diacritics.

    NOTE(review): Unicode canonical ordering (applied by
    ``unicodedata.normalize``) already sorts fatha/damma/kasra and the tanwin
    marks (combining classes 27-32) *before* shadda (class 33), so on
    normalized input this swap should only fire for marks with a higher
    combining class, e.g. sukun. Confirm that "diacritic, then shadda" is the
    order downstream code expects.

    Args:
        ar_string: Arabic text, typically the output of
            ``unicodedata.normalize``.

    Returns:
        The text with every "shadda, diacritic" pair rewritten as
        "diacritic, shadda".
    """
    chars = list(ar_string)

    # Stop one short of the end: each step inspects the pair (i, i + 1).
    for i in range(len(chars) - 1):
        char = chars[i]
        next_char = chars[i + 1]

        if char == SHADDA and next_char in DIACRITICS:
            # Swap in place so the diacritic precedes the shadda.
            chars[i], chars[i + 1] = next_char, char

    return "".join(chars)

In [11]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Return the NFC-normalized form of a vocalized Arabic string.

    The text is first composed with ``unicodedata.normalize("NFC", ...)`` and
    then passed through ``reorder_shadda``.

    Args:
        ar_vocalized: Arabic text, possibly carrying diacritics.
        verbose: When true, print the pyarabic name of every character of the
            result (handy for inspecting the diacritic order).

    Returns:
        The normalized string.
    """
    result = reorder_shadda(normalize("NFC", ar_vocalized))
    if not verbose:
        return result

    # Debug aid: show the result character by character.
    print([name(ch) for ch in result])
    return result

## Load Dump File and Reindex

- Download dump file from [here](https://kaikki.org/dictionary/rawdata.html)\
  Thanks: Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,\
  Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.

- Load the raw Wiktionary dump file:
  - Extract the Arabic words
  - Reindex each `json` object so that it is keyed by its Arabic word
  - Save the line number of each word for easier access in the future
    - A word can appear more than once

In [13]:
# Stream the raw dump once and keep only the Arabic entries.
data_to_write = []  # serialized {word: entry} objects, in output order
words_reindexed = defaultdict(list)  # normalized word -> indices into data_to_write
indx = 0  # index that the next kept entry will receive

with gzip.open(gzip_inpath, mode="rt", encoding="utf-8") as gzip_inobj:
    for i, line in enumerate(gzip_inobj):
        inobj = json.loads(line)

        if inobj.get("lang_code") == "ar" and inobj.get("word"):
            # Split the entry into its normalized headword and everything else.
            ar_dict = dict(inobj)
            word = normalize_ar(ar_dict.pop("word"))

            # A word may occur in several entries, so collect every index.
            words_reindexed[word].append(indx)
            indx += 1

            # Re-key the entry by its word and keep it as one JSON line.
            data_to_write.append(json.dumps({word: ar_dict}))

        # Lightweight progress indicator, overwritten in place via "\r".
        if not i % 999:
            print(f"Reading Line: {i}", end="\r")

# Write the list to the GZIP file
# with gzip.open(gzip_outpath, "wt", encoding="utf-8") as gzip_outobj:
#     for entry in tqdm(data_to_write):
#         gzip_outobj.write(entry + "\n")

# # Serialize and write the reindex data
# with gzip.open(gzip_reindexedpath, "wt", encoding="utf-8") as gzip_obj:
#     json_reindex = json.dumps(words_reindexed)
#     gzip_obj.write(json_reindex)

{'wikipedia', 'instances', 'synonyms', 'source', 'antonyms', 'hyponyms', 'redirect', 'proverbs', 'related', 'troponyms', 'title', 'holonyms', 'abbreviations', 'translations', 'coordinate_terms', 'form_of', 'head_templates', 'pos', 'categories', 'redirects', 'inflection_templates', 'sounds', 'hyphenation', 'etymology_text', 'descendants', 'derived', 'senses', 'lang', 'forms', 'topics', 'lang_code', 'meronyms', 'wikidata', 'etymology_number', 'word', 'alt_of', 'etymology_templates', 'hypernyms'}


## Check the reindexed files size

In [13]:
# Glob pattern matching every file in the directory that contains gzip_outpath.
op_dir = f'{"/".join(gzip_outpath.split("/")[:-1])}/*'
# IPython shell escape: report the on-disk size of each matching file.
!du -h {op_dir}

26M	../data/processed/wikidata/ar-wiktextract-data.json.gz
556K	../data/processed/wikidata/ar_reindex.json.gz


Since the produced files are small, they can be loaded entirely into RAM.