In [1]:
from typing import Tuple, Union, List

import pickle
import re

from ism_ar import Ism, IsmDict
from utilities import Status

from pyarabic import araby


In [2]:
# Pickle that unpacks into the four IsmDict lexicons loaded further down.
ismdict_path = '../../outputs/lexicons_ar/asmaa.pkl'
# Input and output folders are currently one and the same directory, so both
# names are bound to the same string.
data_dir = results_dir = '../../outputs/lexicons_ar/wikitionary_ar/'


In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling, so
    this must only be pointed at files produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# One record per looked-up ism: (lookup status, wikitext or None, the ism).
WikiRecords = List[Tuple[Status, Union[str, None], Ism]]

# Wiktionary lookup results, one pickle per word class.
nouns_wiki: WikiRecords = _load_pickle(f'{results_dir}nouns_wikitionary.pkl')
adjs_wiki: WikiRecords = _load_pickle(
    f'{results_dir}adjectives_wikitionary.pkl')
xs_wiki: WikiRecords = _load_pickle(f'{results_dir}X_wikitionary.pkl')

# The lexicon pickle holds a 4-item sequence that is unpacked in this order.
nouns: IsmDict
adjs: IsmDict
xs: IsmDict
sus_plurals: IsmDict
nouns, adjs, xs, sus_plurals = _load_pickle(ismdict_path)


In [4]:
# --- Section extractors ----------------------------------------------------
# Both patterns return the body of a section: everything after a level-3
# (===X===) or level-4 (====X====) header line, up to (not including) the
# next level-3 header or the '<<END>>' sentinel that callers append to the
# wikitext. re.S lets '.' span newlines; re.VERBOSE allows the inline
# comments inside the pattern.
# NOTE(review): the terminating header is matched with \w+, so a header whose
# title contains a space would not end the section - confirm whether such
# headers occur in the data.
noun_section_regex = re.compile(
    r'''(?:                      # positive lookbehind assertions
            (?<=====Noun====\n)  # Start looking for third or forth level
            |                    # section headers that are used to identify
            (?<====Noun===\n)    # noun sections in the wikitext
        )
        .+?                      # Match any chars non-greedily from then until
        (?=(\n?===\w+===\n)      # positive lookahead assertion
            |                    # mathes either the next section header
        <<END>>\n)               # or end of the file.
    ''',
    flags=re.S | re.VERBOSE)

adj_section_re = re.compile(
    r'''(?:                           # positive lookbehind assertions
            (?<=====Adjective====\n)  # Start looking for third or forth level
            |                         # section headers that are used to
            (?<====Adjective===\n)    # identify adj sections in the wikitext
        )
        .+?                      # Match any chars non-greedily from then until
        (?=(\n?===\w+===\n)      # positive lookahead assertion
            |                    # mathes either the next section header
        <<END>>\n)               # or end of the file.
    ''',
    flags=re.S | re.VERBOSE)

# --- Template matchers -----------------------------------------------------
# Each matches one whole '{{...}}' template on a single line ('.' does not
# cross newlines here because re.S is not set).
noun_template_re = re.compile(r'{{ar-noun\|.+?}}')
noun_dec_template_re = re.compile(
    r'{{ar-decl-(gendered-)?(coll-)?(sing-)?noun\|.+?}}')
# Raw string: '\|' in a plain string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning at compile time.
noun_plural_re = re.compile(r'{{ar-noun-(pl|dual)\|.+?}}')

adj_template_re = re.compile(r'{{ar-(adj|adjective)\|.+?}}')
adj_dec_template_re = re.compile(r'{{ar-decl-adj\|.+?}}')
adj_plural_re = re.compile(r'{{ar-adj-pl\|.+?}}')  # raw string, see above
adj_fem_re = re.compile(r'{{ar-adj-fem\|.+?}}')

# --- Template field extractors ---------------------------------------------
# First run of characters from the Arabic Unicode block (U+0600-U+06FF) that
# directly follows a '|'.
lemma_temp_re = re.compile(r'(?<=\|)[\u0600-\u06FF]+')
# A lone 'm' or 'f' positional argument: preceded by '|' and followed by
# either another '|' or the closing '}}'.
gender_temp_re = re.compile(r'(?<=\|)(m|f)(?=\||}})')


In [5]:
# https://regex101.com/r/OOfoB2/1
# Cross-check the isms tagged as nouns against their Wiktionary wikitext:
# count which section (noun / adjective / neither) each entry has, and how
# often the lemma or gender stated in the headword template disagrees with
# the values carried by the ism.
count_adj = 0
count_noun = 0
count_not_ism = 0
count_lemma_diff = 0
count_gender_diff = 0

for status, wikitext, ism in nouns_wiki:
    if status != Status.EntryFound:
        continue

    wikitext_ = wikitext + '\n<<END>>\n'  # append end mark to wikitext

    # Get the noun section in the wikitext. If not found, fall back to the
    # adjective section.
    if noun_section := noun_section_regex.search(wikitext_):
        count_noun += 1
        noun_section = noun_section.group()
        # There are multiple templates in the noun section. Try them in
        # order and stop at the first one that is present.
        # 1. Noun template: {{ar-noun| }}
        if noun_template := noun_template_re.search(noun_section):
            noun_template = noun_template.group()

            # A template may lack an Arabic lemma argument; skip the lemma
            # comparison then instead of failing on None.
            lemma_wiki = lemma_temp_re.search(noun_template)
            if lemma_wiki is not None:
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    print(lemma_wiki, ism.lemma)
                    count_lemma_diff += 1

            # Some templates carry no gender argument (e.g.
            # '{{ar-noun|تِقَانَة}}'); the genders can only be compared when
            # both sides provide one.
            gender_wiki = gender_temp_re.search(noun_template)
            if ism.gender and gender_wiki is not None:
                if ism.gender[0].lower() != gender_wiki.group():
                    count_gender_diff += 1

            # TODO: ism.gender = gender_wiki if not ism.gender

        # 2. Noun declension template: {{ar-decl-noun| }} and variants
        elif noun_dec_template_re.search(noun_section):
            pass  # TODO: not handled yet

        # 3. Neither noun nor noun declension template is found. See if
        #    the noun is plural/dual, i.e. the feature is wrong in Conll-U.
        elif noun_plural_re.search(noun_section):
            pass  # TODO: not handled yet

        # No templates are found in the noun section!
        else:
            pass

    elif adj_section := adj_section_re.search(wikitext_):
        count_adj += 1
        adj_section = adj_section.group()
        # There are multiple templates in the adjective section. Try them in
        # order and stop at the first one that is present.
        # 1. Adj template: {{ar-adj| }} or {{ar-adjective| }}
        if adj_template := adj_template_re.search(adj_section):
            adj_template = adj_template.group()

            # Two adjectives do not have an Arabic lemma
            lemma_wiki = lemma_temp_re.search(adj_template)
            if lemma_wiki is not None:
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

            # TODO: compare gender for adjectives as well.

        # 2. Adjective declension template: {{ar-decl-adj| }}
        elif adj_dec_template_re.search(adj_section):
            pass  # TODO: not handled yet
        # 3. Plural adjective: {{ar-adj-pl| }}
        elif adj_plural_re.search(adj_section):
            pass  # TODO: not handled yet
        # 4. Feminine adjective: {{ar-adj-fem| }}
        elif adj_fem_re.search(adj_section):
            pass  # TODO: go to the {{feminine singular of|ar|...}} target
        else:
            pass

    else:
        count_not_ism += 1

total_records = count_noun + count_adj + count_not_ism
print('POS extracted from Wikitext')
print('================================================')
print(f'Noun Found:                      {count_noun}')
print(f'Adjectives Found:                {count_adj}')
print(f'Neither noun nor Adjectives:     {count_not_ism}')
print(f'total Nouns as shown in Conll-u: {total_records}')
print()
print('Discrepancies in features between Wikitext and Conll-U')
print('======================================================')
print(f'Differences in lemma:  {count_lemma_diff}')
print(f'Differences in gender: {count_gender_diff}')



لَافِتَة لَافِت
كَوْكَبَة كُوكبَة
مُسْتَوًى مُستَوَى
مُلَحِّن مَُلُحِ
نَبَالَة نِبَالَة
مُحَرِّر المٌحرِّر
مَبْنًى مَبنَى
مِفْتَاح مُفتَاح
رَقْم رَََّقُم
إِنْتَرْنَت إِنتِرنِت
سُلَالَة سَلَالَة
وَسْم وَسَم
مُحَرِّك مَحرَك
رُتْبَة رِتبَة
نِدَاء نَدَاء
مَوْسُوعَة مَوسُوع
مَبْنًى مَبنَى
مُنَظَّمَة مُُنظََِّم
لَوْن لُون
سِجِلّ سَجل
مِحْرَف مُِحَرَف
مُرَاجَعَة مُِرَاجِعَّة
مُحْتَرِف مُحتَرَف
مَرْجِع مُرَجَّع
مُكَوِّن مَُكُوِ
رَمْز رَََّمُز
مُكَوِّن مُكَوَّن
مُحَدِّث مُحَدَّث
طَبْعَة طَبَعَة
سَرْد سِرد
وَصْلَة وُصلَة
حَوْسَبَة حَوسِبَة
أَلِفْبَائِيَّة أُلفبَائِيّ
مَكْنَز مََكُنز
ضَبْط ضََبَط
مِهْنَة مَهَنَّة
طَاقَم طَاقِم
خَيَال خِيَال
صِمَام صَمَام
سَعَة سِعَّة
مُكَوِّن مُكُون
صِحَافَة صَحَافَة
خَيَال رِيَال
وِجْهَة وَجهَة
نِفْط نَفط
هِجَاء هَجَاء
دَرَّاجَة دِرَاجَة
قَالَب قَالِب
حَيَوَان حُيوَان
شِطْرَنْج شَطرَنج
مُمَثِّلَة مُمَثَّل
دَالَّة حَالَة
وَرْشَة وَرَشَة
مُعْجَم مَعجِم
مَرْأَة اِمرَأَة
مَكْتَب مُكَتَّب
سَطْر سَطَر
مَاشِيَة مَاشِي
مُسْتَوًى مُستَوَى
مَيَلَان مِيلَان
عَرْض عََرَض
زِ

In [6]:
# Classify the isms tagged as adjectives by the section found in their
# wikitext (adjective section first, then noun section, else neither) and
# preview the first few adjective sections.
count_adj = 0    # entries with an adjective section
count_noun = 0   # entries with a noun section but no adjective section
count = 0        # entries with neither section
count_break = 0  # number of adjective sections printed so far
max_preview = 10  # cap on printed sections; counting continues past it

for status, wikitext, ism in adjs_wiki:
    if status == Status.EntryFound:
        wikitext_ = wikitext + '\n<<END>>\n'
        # Search once and reuse the match for both the test and the preview.
        if adj_match := adj_section_re.search(wikitext_):
            count_adj += 1
            # Only the printing is limited; breaking out of the loop here
            # would leave the totals below covering a fraction of the data.
            if count_break < max_preview:
                print(adj_match.group())
                print("---------------------------------------------")
                count_break += 1
        elif noun_section_regex.search(wikitext_):
            count_noun += 1
        else:
            count += 1

# Adjective matches first, then noun matches, then the rest.
print(count_adj, count_noun, count)
print(count_noun + count_adj + count)


{{ar-adj|سَرِيع|f=سَرِيعَة|cpl=سِرَاع|pl=سَرِيعُونَ|pl2=سُرْعَان|fpl=سَرِيعَات|el=أَسْرَع}}

# [[fast]] {{gloss|capable of moving with speed}}, [[quick]] {{gloss|moving with speed}}

====Declension====
{{ar-decl-adj|سَرِيع|f=سَرِيعَة|cpl=سِرَاع|pl=smp|pl2=سُرْعَان|fpl=sfp}}



---------------------------------------------
{{ar-adj-fem|رَسْمِيَّة}}

# {{feminine singular of|ar|رَسْمِيّ}}


---------------------------------------------
{{ar-nisba|رِيَاضِيّ}}
# of or [[pertain]]ing to [[sport]]s
#: {{l|ar|[[أَلْعَاب]] [[رِيَاضِيَّة]]|t=sports}}
#: {{l|ar|[[أَخْبَار]] [[رِيَاضِيَّة]]|t=sports news}}
#: {{l|ar|[[مُرَاسِل]] [[رِيَاضِيّ]]|t=sports reporter}}
## [[sportsmanly]]; [[sportsmanlike]]
##: {{l|ar|[[رُوح]] [[رِيَاضِيّة]]|t=a sportsmanly spirit}}
# of or [[pertain]]ing to [[physical]] [[sport]]s or [[athletics]]
## [[resemblant]] of [[athlete]]s, [[athletic]]
##: {{l|ar|[[نَمَط]] [[رِيَاضِيّ]]|t=an athletic type}}
##: {{l|ar|[[جَسَد]] [[رِيَاضِيّ]]|t=an athletic body}}
# [[mathematic]

In [7]:
# Tally, for the isms tagged X, how many wikitexts expose a noun section, an
# adjective section (only when there is no noun section), or neither.
count_noun = 0
count_adj = 0
count = 0
for status, wikitext, ism in xs_wiki:
    # Entries that were not found on Wiktionary are left out of every tally.
    if status != Status.EntryFound:
        continue

    # Sentinel so the section patterns can terminate at the end of the text.
    wikitext_marked = wikitext + '\n<<END>>\n'
    if noun_section_regex.search(wikitext_marked) is not None:
        count_noun += 1
    elif adj_section_re.search(wikitext_marked) is not None:
        count_adj += 1
    else:
        count += 1

print(count_noun, count_adj, count)
print(count_noun + count_adj + count)


254 33 374
661


   ```python
   n = samplesize_confint_proportion(0.5, 0.05, 0.1)
   sample_size = ceil(n / (1 + ((n - 1) / len(statuses))))
   notfound_nouns = [
       noun for status, _, noun in zip(statuses, parsed, isms)
       if status == Status.EntryNotFound
   ]
   random_notfound_nouns = random.sample(notfound_nouns, sample_size)
   notfound_info = ''
   for notfound_noun in notfound_nouns:
       idx, label = nouns[notfound_noun][0]
       notfound_info += f'{idx}\t[{label}]\t{notfound_noun.form}\t'
       notfound_info += f'{notfound_noun.lemma}\n'
   file_name = f'{results_dir}nouns_notfound.txt'
   with open(file_name, 'w') as f:
       f.write(notfound_info)
   ```

 {{ar-noun|رَئِيس|m|pl=رُؤَسَاء|f=رَئِيسَة}}
 {{ar-noun|طَرِيق|m|g2=f|pl=طُرُق|pl2=طُرُقَات}}
 {{ar-noun|حُكُومَة|f|pl=حُكُومَات}}
 {{ar-noun|لَافِتَة|f|pl=لَافِتَات}}
 {{ar-noun|خَرِيطَة|f|pl=خَرَائِط}}
 ====Declension====
 {{ar-decl-gendered-noun|رَئِيس|pl=رُؤَسَاء}}
 {{ar-decl-noun|حُكُومَة|pl=حُكُومَات}}
 {{ar-decl-noun|لَافِتَة|pl=sfp}}
 {{ar-decl-noun|طَرِيق|pl=طُرُق|pl2=طُرُقَات}}
 {{ar-decl-noun|خَرِيطَة|pl=خَرَائِط}}