 # Import Libraries

In [1]:
from typing import Tuple, Union

import os
from multiprocessing import Pool
import requests

from collections import Counter
import pickle
import re

from math import ceil
import random
from statsmodels.stats.proportion import samplesize_confint_proportion

from tqdm.notebook import tqdm
from pprint import pprint

from ism_ar import Ism, IsmDict
from utilities import remove_diacritics_ar, Status

# Export a fixed hash seed and seed the `random` module with the same value.
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so the assignment
# below cannot change hashing in this already-running process; it only reaches
# processes started later that inherit this environment.
os.environ['PYTHONHASHSEED'] = '42'
random.seed(42)


 # Reading Arabic CoNLL-U

In [2]:
# Pickle file read by the next cell, and the directory prefix under which the
# Wiktionary lookup results are written.
data_file, results_dir = (
    '../../outputs/lexicons_ar/asmaa.pkl',
    '../../outputs/lexicons_ar/wikitionary_ar/',
)


In [3]:
# Declare the expected type of the four objects stored in the pickle.
nouns: IsmDict
adjs: IsmDict
xs: IsmDict
sus_plurals: IsmDict

# NOTE: unpickling can execute arbitrary code, so `data_file` must be a
# trusted, project-generated file.
with open(data_file, mode='rb') as f:
    nouns, adjs, xs, sus_plurals = pickle.load(f)


 # Wiktionary Parsing Functions

In [4]:
def get_entry_data(entry: str):
    """
    Retrieves parsed wikitext data for an Arabic word from the English
    Wiktionary API.

    The page title is passed through ``params`` so that ``requests``
    percent-encodes it; characters such as ``&``, ``#`` or ``+`` in the title
    therefore cannot alter the query string.

    Args:
        entry: A string representing the Arabic word to look up.

    Returns:
        The JSON response from the Wiktionary API, as a Python dictionary, if
        the request is successful. The response may describe an API-level
        error (an ``error`` key instead of ``parse``); that is still returned.
        False if the request fails after three attempts.
    """

    api_url = 'https://en.wiktionary.org/w/api.php'
    query = {
        'action': 'parse',
        'page': entry,
        'prop': 'wikitext',
        'formatversion': 2,
        'format': 'json',
    }

    # Make up to three attempts. The first response that decodes as JSON is
    # returned immediately.
    for _ in range(3):
        try:
            return requests.get(api_url, params=query, timeout=60).json()
        except (requests.RequestException, ValueError):
            # RequestException: connection problems, timeouts, invalid URLs.
            # ValueError: the response body is not valid JSON.
            # Either way, fall through to the next attempt.
            continue

    # All attempts failed.
    return False


In [5]:
# ``error.info`` text the API returns for a page that does not exist.
_NOT_FOUND_MSG = "The page you specified doesn't exist."

# Matches the level-2 ``==Arabic==`` section up to, but not including, the
# next level-2 heading or the ``<<END>>`` sentinel appended before searching.
# The heading lookahead is anchored at the start of a line and accepts any
# heading text without ``=``, so multi-word language names (for example
# "Ottoman Turkish") end the section too, while level-3 headings such as
# ``===Noun===`` do not.
_ARABIC_SECTION_RE = re.compile(
    r'==Arabic==\n.+?(?=^==[^=\n]+==\n|<<END>>)', flags=re.S | re.M)


def _arabic_section_result(
        parsed: dict, ism: Ism) -> Tuple[Status, Union[str, None], Ism]:
    """Builds the result tuple for a response that contains a parsed page."""
    # The sentinel lets the regex terminate when Arabic is the last section.
    # Missing or empty wikitext simply yields no match.
    wikitext = (parsed.get('wikitext') or '') + '\n<<END>>\n'
    section = _ARABIC_SECTION_RE.search(wikitext)
    if section is None:
        return Status.NoArabic, None, ism
    return Status.EntryFound, section.group(), ism


def get_parsed_data(ism: Ism) -> Tuple[Status, Union[str, None], Ism]:
    """
    Retrieves the Arabic section of the English Wiktionary entry for an Ism.

    Nothing is requested when the Ism already has a plural. Otherwise the
    lemma (without diacritics) is looked up first. If that response contains
    no parsed page, the word form (without diacritics and without a leading
    definite article) is looked up once as a fallback.

    Args:
        ism: An Ism object that represents the Arabic word to look up.

    Returns:
        A tuple with three elements:
            - A Status object that represents the outcome:
              PluralExist (no lookup needed), EntryFound, NoArabic (page
              without an Arabic section), EntryNotFound, BadTitle,
              RequestException (the API could not be reached), UNKOWN
              (response with neither a parsed page nor an error), or a
              status registered on the fly for any other API error message.
            - The wikitext of the Arabic section as a string when the status
              is EntryFound, otherwise None.
            - The original Ism object.
    """

    # Plural already exists, no need to use Wiktionary.
    if ism.plural:
        return Status.PluralExist, None, ism

    # First attempt: look up the lemma, normalized by removing diacritics.
    data = get_entry_data(remove_diacritics_ar(ism.lemma))
    if not data:
        # API request was not successful.
        return Status.RequestException, None, ism

    parsed = data.get('parse')
    if parsed:
        return _arabic_section_result(parsed, ism)

    # The lemma lookup returned no parsed page (most probably the lemma was
    # wrong). Its error message is not inspected; instead try again with the
    # word form itself, removing the definite article prefix (al-).
    word = re.sub('^ال', "", remove_diacritics_ar(ism.form))
    data = get_entry_data(word)
    if not data:
        return Status.RequestException, None, ism

    parsed = data.get('parse')
    if parsed:
        return _arabic_section_result(parsed, ism)

    # No parsed page for the word form either: classify the API error.
    error = data.get('error')
    if not error:
        # The response contains neither parsed data nor an error message.
        return Status.UNKOWN, None, ism

    error_msg: str = error['info']
    if error_msg == _NOT_FOUND_MSG:
        return Status.EntryNotFound, None, ism
    if error_msg.startswith('Bad title'):
        return Status.BadTitle, None, ism
    if error_msg.startswith('There is no section 1'):
        return Status.NoArabic, None, ism

    # Unknown error: register a status named after the message (whitespace
    # replaced by underscores) and return it.
    msg = "_".join(error_msg.split())
    Status.add_error(msg)
    return getattr(Status, msg), None, ism


 # Wiktionary Parsing

 ## Nouns Parsing

In [6]:
# Run get_parsed_data over `nouns` with 15 worker processes. tqdm wraps the
# lazy imap iterator so progress is shown as results arrive.
with Pool(processes=15) as pool:
    noun_parsed = [
        *tqdm(pool.imap(get_parsed_data, nouns), total=len(nouns))
    ]


  0%|          | 0/2654 [00:00<?, ?it/s]

In [7]:
# Transpose the (status, wikitext, ism) triples into three parallel tuples,
# then tally and print how many nouns ended in each status.
statuses, parsed, isms = zip(*noun_parsed)
statuses_counter = Counter()
statuses_counter.update(statuses)
pprint(statuses_counter)


Counter({<Status.EntryFound: 1>: 1928,
         <Status.EntryNotFound: 2>: 445,
         <Status.PluralExist: 4>: 240,
         <Status.NoArabic: 7>: 41})


In [8]:
# Persist the noun lookup results as <results_dir>nouns_wikitionary.pkl.
file_name = f'{results_dir}nouns_wikitionary'
with open(f'{file_name}.pkl', mode="wb") as f:
    pickle.Pickler(f).dump(noun_parsed)


In [9]:
# Run get_parsed_data over `adjs` with 15 worker processes. tqdm wraps the
# lazy imap iterator so progress is shown as results arrive.
with Pool(processes=15) as pool:
    adj_parsed = [
        *tqdm(pool.imap(get_parsed_data, adjs), total=len(adjs))
    ]


  0%|          | 0/1545 [00:00<?, ?it/s]

In [10]:
# Transpose the (status, wikitext, ism) triples returned for the adjectives
# into three parallel tuples, then print how many ended in each status.
statuses, parsed, isms = zip(*adj_parsed)
pprint(Counter(statuses))


Counter({<Status.EntryFound: 1>: 943,
         <Status.EntryNotFound: 2>: 532,
         <Status.PluralExist: 4>: 49,
         <Status.NoArabic: 7>: 21})


In [11]:
# Persist the adjective lookup results as
# <results_dir>adjectives_wikitionary.pkl.
file_name = f'{results_dir}adjectives_wikitionary'
with open(f'{file_name}.pkl', mode="wb") as f:
    pickle.Pickler(f).dump(adj_parsed)


In [12]:
# Run get_parsed_data over `xs` with 15 worker processes. tqdm wraps the
# lazy imap iterator so progress is shown as results arrive.
with Pool(processes=15) as pool:
    x_parsed = [
        *tqdm(pool.imap(get_parsed_data, xs), total=len(xs))
    ]


  0%|          | 0/2940 [00:00<?, ?it/s]

In [13]:
# Transpose the (status, wikitext, ism) triples returned for the `xs` entries
# into three parallel tuples, then print how many ended in each status.
statuses, parsed, isms = zip(*x_parsed)
pprint(Counter(statuses))


Counter({<Status.EntryNotFound: 2>: 2190,
         <Status.EntryFound: 1>: 659,
         <Status.NoArabic: 7>: 91})


In [14]:
# Persist the `xs` lookup results as <results_dir>X_wikitionary.pkl.
file_name = f'{results_dir}X_wikitionary'
with open(f'{file_name}.pkl', mode="wb") as f:
    pickle.Pickler(f).dump(x_parsed)
