In [1]:
from typing import Tuple, Union, List, Pattern
from copy import deepcopy

import pickle
import re

from ism_ar import Ism, IsmDict
from utilities import Status

from pyarabic import araby

from utilities import is_salem_plural


In [2]:
# Pickle holding the lexicons that are enriched further down the notebook.
ismdict_path = '../../outputs/lexicons_ar/wikibase-item_quantity_time/asmaa.pkl'

# Folder with the Wiktionary scrape. Inputs and results currently live in the
# same place, so `results_dir` simply aliases `data_dir`.
data_dir = '../../outputs/lexicons_ar/wikibase-item_quantity_time/wikitionary_ar/'
results_dir = data_dir


In [3]:
# Expected shapes of everything loaded in this cell. Each *_wiki list holds
# (status, wikitext-or-None, Ism) triples; the remaining names are IsmDict
# lexicons.
nouns_wiki: List[Tuple[Status, Union[str, None], Ism]]
adjs_wiki: List[Tuple[Status, Union[str, None], Ism]]
xs_wiki: List[Tuple[Status, Union[str, None], Ism]]
nouns: IsmDict
adjs: IsmDict
xs: IsmDict
sus_plurals: IsmDict

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this project.
with open(f'{results_dir}nouns_wikitionary.pkl', 'rb') as pkl_file:
    nouns_wiki = pickle.load(pkl_file)

with open(f'{results_dir}adjectives_wikitionary.pkl', 'rb') as pkl_file:
    adjs_wiki = pickle.load(pkl_file)

with open(f'{results_dir}X_wikitionary.pkl', 'rb') as pkl_file:
    xs_wiki = pickle.load(pkl_file)

# The lexicon pickle stores the four dictionaries as a single 4-tuple.
with open(ismdict_path, 'rb') as pkl_file:
    nouns, adjs, xs, sus_plurals = pickle.load(pkl_file)


In [4]:
# --- Section extractors ------------------------------------------------------
# Both patterns are compiled with re.S | re.VERBOSE: '.' also matches newlines
# and the whitespace / '#' comments inside the pattern text are ignored. They
# capture the body that follows a "===Noun===" / "====Noun====" (resp.
# "Adjective") header up to the next "===Header===" line or the '<<END>>\n'
# sentinel, so callers must append that sentinel to the wikitext first.
noun_section_regex = re.compile(
    r'''(?:                      # positive lookbehind assertions
            (?<=====Noun====\n)  # Start looking for third or forth level
            |                    # section headers that are used to identify
            (?<====Noun===\n)    # noun sections in the wikitext
        )
        .+?                      # Match any chars non-greedily from then until
        (?=(\n?===\w+===\n)      # positive lookahead assertion
            |                    # mathes either the next section header
        <<END>>\n)               # or end of the file.
    ''',
    flags=re.S | re.VERBOSE)

adj_section_re = re.compile(
    r'''(?:                           # positive lookbehind assertions
            (?<=====Adjective====\n)  # Start looking for third or forth level
            |                         # section headers that are used to
            (?<====Adjective===\n)    # identify adj sections in the wikitext
        )
        .+?                      # Match any chars non-greedily from then until
        (?=(\n?===\w+===\n)      # positive lookahead assertion
            |                    # mathes either the next section header
        <<END>>\n)               # or end of the file.
    ''',
    flags=re.S | re.VERBOSE)

# --- Template extractors -----------------------------------------------------
# Every pattern is a raw string: '\|' is not a valid Python string escape, so
# a plain literal triggers an invalid-escape warning on recent interpreters.
noun_template_re = re.compile(r'{{ar-noun\|.+?}}')
noun_dec_template_re = re.compile(
    r'{{ar-decl-(gendered-)?(coll-)?(sing-)?noun\|.+?}}')
noun_plural_re = re.compile(r'{{ar-noun-(pl|dual)\|.+?}}')

adj_template_re = re.compile(r'{{ar-(adj|adjective)\|.+?}}')
adj_dec_template_re = re.compile(r'{{ar-decl-adj\|.+?}}')
adj_plural_re = re.compile(r'{{ar-adj-pl\|.+?}}')
adj_fem_re = re.compile(r'{{ar-adj-fem\|.+?}}')

# --- Field extractors (applied to a single matched template) ------------------
# First run of characters from the U+0600-U+06FF block that directly follows
# a '|'.
lemma_temp_re = re.compile(r'(?<=\|)[\u0600-\u06FF]+')
# A bare 'm' or 'f' argument, i.e. delimited by '|' before and '|' or '}}' after.
gender_temp_re = re.compile(r'(?<=\|)(m|f)(?=\||}})')
# Values of the 'pl=' / 'pl<digit>=' arguments.
plurals_temp_re = re.compile(r'(?:(?<=\|pl=)|(?<=\|pl\d=))[\u0600-\u06FF]+')


In [13]:
# https://regex101.com/r/OOfoB2/1
# Walk the `nouns_wiki` entries, read lemma, gender and plural information out
# of the wikitext templates, update each `ism` in place and append one output
# row per entry to `data`.
# `data` is (re)created here, so this cell must run before the adjective and X
# cells that append to it.
count_adj = 0
count_noun = 0
count_not_ism = 0
count_lemma_diff = 0
count_gender_diff = 0
count_not_single = 0
count_plural_notfound = 0
count_fem_adjectives = 0

data = []
for status, wikitext, ism in nouns_wiki:
    if status == Status.EntryFound:
        wikitext_ = wikitext + '\n<<END>>\n'  # append end mark to wikitext
        # Prefer the noun section of the wikitext; fall back to the adjective
        # section when there is none.

        if noun_section := noun_section_regex.search(wikitext_):
            noun_section = noun_section.group()
            count_noun += 1

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Noun template: {{ar-noun| }}
            # 2. Noun declension template: {{ar-decl-...noun| }}
            plurals = []
            for templat_re in [noun_template_re, noun_dec_template_re]:
                noun_template = templat_re.search(noun_section)
                if noun_template is None:
                    continue

                noun_template = noun_template.group()
                lemma_wiki = lemma_temp_re.search(noun_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-decl-noun|-|pl=...}}: skip it instead of crashing on
                # `None.group()`.
                if not lemma_wiki:
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                if gender_wiki := gender_temp_re.search(noun_template):
                    gender_wiki = gender_wiki.group()
                    # Compare first letters ('m' / 'f') to count disagreements;
                    # the template's gender then overwrites the stored one.
                    if ism.gender:
                        if ism.gender[0].lower() != gender_wiki:
                            count_gender_diff += 1

                    ism.gender = 'Fem' if gender_wiki == 'f' else 'Masc'

                plurals = plurals_temp_re.findall(noun_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                # has_salem_plural ends up truthy as soon as one listed plural
                # is recognised by is_salem_plural.
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)
                nouns.update_plurals(ism)
                break

            # Most probably the plural is a Salem plural.
            # `noun_template` holds the result of the LAST pattern tried, so
            # this fires only when the loop ended without a break and that
            # last pattern matched.
            # NOTE(review): confirm this is the intended condition.
            if noun_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            # 3. Check for a plural/dual template: the entry may itself be a
            #    plural, i.e. the feature is wrong in Conll-U.
            if noun_plural := noun_plural_re.search(noun_section):
                count_not_single += 1

            # No templates are found in noun section!
            else:
                pass

        # No noun section. Maybe an adjective?
        elif adj_section := adj_section_re.search(wikitext_):
            adj_section = adj_section.group()
            count_adj += 1
            adj_templates: List[Pattern[str]] = [
                adj_template_re, adj_dec_template_re, adj_fem_re
            ]

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Adjective template: {{ar-adj| }} / {{ar-adjective| }}
            # 2. Adjective declension template: {{ar-decl-adj| }}
            # 3. Feminine adjective template: {{ar-adj-fem| }}
            plurals = []
            for i, templat_re in enumerate(adj_templates):
                adj_template = templat_re.search(adj_section)
                if adj_template is None:
                    continue

                adj_template = adj_template.group()
                lemma_wiki = lemma_temp_re.search(adj_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-adj|tr=badīl}}
                if not lemma_wiki:
                    count_plural_notfound += 1
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                # The first two templates are treated as masculine; feminine
                # adjectives have their own template (index 2).
                if i < 2:
                    gender_wiki = 'Masc'
                else:
                    gender_wiki = 'Fem'
                    count_fem_adjectives += 1
                if ism.gender:
                    if ism.gender != gender_wiki:
                        count_gender_diff += 1
                ism.gender = gender_wiki

                plurals = plurals_temp_re.findall(adj_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)
                break

            # Most probably the plural is a Salem plural (same caveat as in
            # the noun branch: only the last pattern's result is inspected).
            if adj_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            elif adj_plural := adj_plural_re.search(adj_section):
                count_not_single += 1
            else:
                pass
        else:
            count_not_ism += 1

    # One output row per entry, whether or not a Wiktionary page was found.
    ref_, _ = zip(*nouns[ism])
    data.append([
        ism.upos, ism.form, ism.lemma, "|".join(ism.plurals), ism.has_salem_pl,
        ism.gender, " ".join(ref_)
    ])

total_records = count_noun + count_adj + count_not_ism
print('POS extracted from Wikitext')
print('================================================')
print(f'Noun Found:                      {count_noun}')
print(f'Adjectives Found:                {count_adj}')
print(f'Neither noun nor adjectives:     {count_not_ism}')
print(f'total Nouns as shown in Conll-u: {total_records}')
print()
print(f'{count_not_single} may be a salem plural')
print(f'{count_not_ism + count_fem_adjectives} cannot extract there plural')
print()
print('Discrepancies in features between Wikitext and Conll-U')
print('======================================================')
print(f'Differences in lemma:  {count_lemma_diff}')
print(f'Differences in gender: {count_gender_diff}')


رئيس[ج]() []
الحكومة[ج]() []
نظام[ج]() []
الطريق[ج]() []
مكان[ج]() []
وفاة[ج]() []
الدولة[ج]() []
العاصمة[ج]() []
اللغة[ج]() []
العملة[ج]() []
المنصب[ج]() []
عضو[ج]() []
الفريق[ج]() []
الكوكبة[ج]() []
الاكتشاف[ج]() []
نطاق[ج]() []
المستوى[ج]() []
الخط[ج]() []
المُلحِّن[ج]() []
كلمة[ج]() []
النص[ج]() []
المٌحرِّر[ج]() []
مجال[ج]() []
المهنة[ج]() []
الرسام[ج]() []
وحدة[ج]() []
قياس[ج]() []
محور[ج]() []
تحالف[ج]() []
الملعب[ج]() []
العنصر[ج]() []
الحكم[ج]() []
المالك[ج]() []
التقسيم[ج]() []
الحركة[ج]() []
مشروع[ج]() []
تقسيم[ج]() []
الرئيس[ج]() []
ممثل[ج]() []
الصانع[ج]() []
المجموعة[ج]() []
الشَّركة[ج]() []
جزءٌ[ج]() []
سلسلة[ج]() []
المدينة[ج]() []
التوأم[ج]() []
المبنى[ج]() []
الهيئة[ج]() []
مجموعة[ج]() []
المحطة[ج]() []
المنظمة[ج]() []
البحيرة[ج]() []
الشعار[ج]() []
الفرع[ج]() []
شركة[ج]() []
الإنتاج[ج]() []
الرخصة[ج]() []
المكان[ج]() []
لغة[ج]() []
الكتابة[ج]() []
المُدرِّب[ج]() []
المصمم[ج]() []
فئة[ج]() []
السفينة[ج]() []
الموضوع[ج]() []
مدير[ج]() []
موضوع[ج]() []
القائمة[ج]() []
ج

In [10]:
# Walk the `adjs_wiki` entries, read lemma, gender and plural information out
# of the wikitext templates, update each `ism` in place and append one output
# row per entry to `data`.
# `data` is not created in this cell; it must already exist.
count_adj = 0
count_noun = 0
count_not_ism = 0
count_lemma_diff = 0
count_gender_diff = 0
count_not_single = 0
count_plural_notfound = 0
count_fem_adjectives = 0

for status, wikitext, ism in adjs_wiki:
    if status == Status.EntryFound:
        # Append the end mark the section regexes rely on.
        wikitext_ = wikitext + '\n<<END>>\n'
        # Prefer the adjective section; fall back to the noun section.
        if adj_section := adj_section_re.search(wikitext_):
            adj_section = adj_section.group()
            count_adj += 1
            adj_templates: List[Pattern[str]] = [
                adj_template_re, adj_dec_template_re, adj_fem_re
            ]

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Adjective template: {{ar-adj| }} / {{ar-adjective| }}
            # 2. Adjective declension template: {{ar-decl-adj| }}
            # 3. Feminine adjective template: {{ar-adj-fem| }}
            plurals = []
            for i, templat_re in enumerate(adj_templates):
                adj_template = templat_re.search(adj_section)
                if adj_template is None:
                    continue

                adj_template = adj_template.group()
                lemma_wiki = lemma_temp_re.search(adj_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-adj|tr=badīl}}
                if not lemma_wiki:
                    count_plural_notfound += 1
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                # The first two templates are treated as masculine; feminine
                # adjectives have their own template (index 2).
                if i < 2:
                    gender_wiki = 'Masc'
                else:
                    gender_wiki = 'Fem'
                    count_fem_adjectives += 1
                # Count disagreements, then let the template's gender
                # overwrite the stored one.
                if ism.gender:
                    if ism.gender != gender_wiki:
                        count_gender_diff += 1
                ism.gender = gender_wiki

                plurals = plurals_temp_re.findall(adj_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                # has_salem_plural ends up truthy as soon as one listed plural
                # is recognised by is_salem_plural.
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)

                break

            # Most probably the plural is a Salem plural.
            # `adj_template` holds the result of the LAST pattern tried, so
            # this fires only when the loop ended without a break and that
            # last pattern matched.
            # NOTE(review): confirm this is the intended condition.
            if adj_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            # Otherwise check whether the section carries a plural template.
            elif adj_plural := adj_plural_re.search(adj_section):
                count_not_single += 1
            else:
                pass
        elif noun_section := noun_section_regex.search(wikitext_):
            noun_section = noun_section.group()
            count_noun += 1

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Noun template: {{ar-noun| }}
            # 2. Noun declension template: {{ar-decl-...noun| }}
            plurals = []
            for templat_re in [noun_template_re, noun_dec_template_re]:
                noun_template = templat_re.search(noun_section)
                if noun_template is None:
                    continue

                noun_template = noun_template.group()
                lemma_wiki = lemma_temp_re.search(noun_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-decl-noun|-|pl=إِلكترونِيَّات}}
                if not lemma_wiki:
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                if gender_wiki := gender_temp_re.search(noun_template):
                    gender_wiki = gender_wiki.group()
                    # Compare first letters ('m' / 'f') to count disagreements.
                    if ism.gender:
                        if ism.gender[0].lower() != gender_wiki:
                            count_gender_diff += 1

                    ism.gender = 'Fem' if gender_wiki == 'f' else 'Masc'

                plurals = plurals_temp_re.findall(noun_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)

                break

            # Most probably the plural is a Salem plural (same caveat as in
            # the adjective branch: only the last pattern's result is
            # inspected).
            if noun_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            # 3. Check for a plural/dual template: the entry may itself be a
            #    plural, i.e. the feature is wrong in Conll-U.
            if noun_plural := noun_plural_re.search(noun_section):
                count_not_single += 1

            # No plural/dual template found in the noun section.
            else:
                pass

        else:
            count_not_ism += 1

    # One output row per entry, whether or not a Wiktionary page was found.
    ref_, _ = zip(*adjs[ism])
    data.append([
        ism.upos, ism.form, ism.lemma, "|".join(ism.plurals), ism.has_salem_pl,
        ism.gender, " ".join(ref_)
    ])

total_records = count_noun + count_adj + count_not_ism
print('POS extracted from Wikitext')
print('================================================')
print(f'Noun Found:                      {count_noun}')
print(f'Adjectives Found:                {count_adj}')
print(f'Neither noun nor adjectives:     {count_not_ism}')
# NOTE(review): the label below says "Nouns" although this cell iterates
# `adjs_wiki` -- confirm whether it should read "Adjectives".
print(f'total Nouns as shown in Conll-u: {total_records}')
print()
print(f'{count_not_single} may be a salem plural')
print(f'{count_not_ism + count_fem_adjectives} cannot extract there plural')
print()
print('Discrepancies in features between Wikitext and Conll-U')
print('======================================================')
print(f'Differences in lemma:  {count_lemma_diff}')
print(f'Differences in gender: {count_gender_diff}')

POS extracted from Wikitext
Noun Found:                      30
Adjectives Found:                244
Neither noun nor adjectives:     112
total Nouns as shown in Conll-u: 386

1 may be a salem plural
172 cannot extract there plural

Discrepancies in features between Wikitext and Conll-U
Differences in lemma:  105
Differences in gender: 0


In [11]:
# Walk the `xs_wiki` entries, read lemma, gender and plural information out of
# the wikitext templates, update each `ism` in place and append one output row
# per entry to `data`.
# `data` is not created in this cell; it must already exist.
count_adj = 0
count_noun = 0
count_not_ism = 0
count_lemma_diff = 0
count_gender_diff = 0
count_not_single = 0
count_plural_notfound = 0
count_fem_adjectives = 0

for status, wikitext, ism in xs_wiki:
    if status == Status.EntryFound:
        wikitext_ = wikitext + '\n<<END>>\n'  # append end mark to wikitext
        # Prefer the noun section of the wikitext; fall back to the adjective
        # section when there is none.

        if noun_section := noun_section_regex.search(wikitext_):
            noun_section = noun_section.group()
            count_noun += 1

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Noun template: {{ar-noun| }}
            # 2. Noun declension template: {{ar-decl-...noun| }}
            plurals = []
            for templat_re in [noun_template_re, noun_dec_template_re]:
                noun_template = templat_re.search(noun_section)
                if noun_template is None:
                    continue

                noun_template = noun_template.group()
                lemma_wiki = lemma_temp_re.search(noun_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-decl-noun|-|pl=...}}: skip it instead of crashing on
                # `None.group()`.
                if not lemma_wiki:
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                if gender_wiki := gender_temp_re.search(noun_template):
                    gender_wiki = gender_wiki.group()
                    # Compare first letters ('m' / 'f') to count disagreements;
                    # the template's gender then overwrites the stored one.
                    if ism.gender:
                        if ism.gender[0].lower() != gender_wiki:
                            count_gender_diff += 1

                    ism.gender = 'Fem' if gender_wiki == 'f' else 'Masc'

                plurals = plurals_temp_re.findall(noun_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                # has_salem_plural ends up truthy as soon as one listed plural
                # is recognised by is_salem_plural.
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)

                break

            # Most probably the plural is a Salem plural.
            # `noun_template` holds the result of the LAST pattern tried, so
            # this fires only when the loop ended without a break and that
            # last pattern matched.
            # NOTE(review): confirm this is the intended condition.
            if noun_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            # 3. Check for a plural/dual template: the entry may itself be a
            #    plural, i.e. the feature is wrong in Conll-U.
            if noun_plural := noun_plural_re.search(noun_section):
                count_not_single += 1

            # No templates are found in noun section!
            else:
                pass

        # No noun section. Maybe an adjective?
        elif adj_section := adj_section_re.search(wikitext_):
            adj_section = adj_section.group()
            count_adj += 1
            adj_templates: List[Pattern[str]] = [
                adj_template_re, adj_dec_template_re, adj_fem_re
            ]

            # Try the templates in order and stop at the first one that
            # yields plurals:
            # 1. Adjective template: {{ar-adj| }} / {{ar-adjective| }}
            # 2. Adjective declension template: {{ar-decl-adj| }}
            # 3. Feminine adjective template: {{ar-adj-fem| }}
            plurals = []
            for i, templat_re in enumerate(adj_templates):
                adj_template = templat_re.search(adj_section)
                if adj_template is None:
                    continue

                adj_template = adj_template.group()
                lemma_wiki = lemma_temp_re.search(adj_template)
                # Corrupted template without an Arabic headword, e.g.
                # {{ar-adj|tr=badīl}}
                if not lemma_wiki:
                    count_plural_notfound += 1
                    continue
                lemma_wiki = lemma_wiki.group()
                if not araby.vocalizedlike(lemma_wiki, ism.lemma):
                    count_lemma_diff += 1

                # The first two templates are treated as masculine; feminine
                # adjectives have their own template (index 2).
                if i < 2:
                    gender_wiki = 'Masc'
                else:
                    gender_wiki = 'Fem'
                    count_fem_adjectives += 1
                if ism.gender:
                    if ism.gender != gender_wiki:
                        count_gender_diff += 1
                ism.gender = gender_wiki

                plurals = plurals_temp_re.findall(adj_template)
                if not plurals:
                    count_plural_notfound += 1
                    continue
                for plural in plurals:
                    if has_salem_plural := is_salem_plural(ism.form, plural):
                        break
                ism.update_plural(plurals, has_salem_plural)

                break

            # Most probably the plural is a Salem plural (same caveat as in
            # the noun branch: only the last pattern's result is inspected).
            if adj_template and not plurals:
                has_salem_plural = True
                ism.update_plural(plurals, has_salem_plural)

            elif adj_plural := adj_plural_re.search(adj_section):
                count_not_single += 1
            else:
                pass
        else:
            count_not_ism += 1

    # One output row per entry, whether or not a Wiktionary page was found.
    ref_, _ = zip(*xs[ism])
    data.append([
        ism.upos, ism.form, ism.lemma, "|".join(ism.plurals), ism.has_salem_pl,
        ism.gender, " ".join(ref_)
    ])

total_records = count_noun + count_adj + count_not_ism
print('POS extracted from Wikitext')
print('================================================')
print(f'Noun Found:                      {count_noun}')
print(f'Adjectives Found:                {count_adj}')
print(f'Neither noun nor adjectives:     {count_not_ism}')
print(f'total Nouns as shown in Conll-u: {total_records}')
print()
print(f'{count_not_single} may be a salem plural')
print(f'{count_not_ism + count_fem_adjectives} cannot extract there plural')
print()
print('Discrepancies in features between Wikitext and Conll-U')
print('======================================================')
print(f'Differences in lemma:  {count_lemma_diff}')
print(f'Differences in gender: {count_gender_diff}')

POS extracted from Wikitext
Noun Found:                      81
Adjectives Found:                10
Neither noun nor adjectives:     59
total Nouns as shown in Conll-u: 150

5 may be a salem plural
59 cannot extract there plural

Discrepancies in features between Wikitext and Conll-U
Differences in lemma:  2
Differences in gender: 0


In [12]:
import csv

# Write the rows accumulated in `data` by the extraction cells to a CSV file.
fpath = '../../outputs/lexicons_ar/wikibase-item_quantity_time/lexical_ism.csv'
# newline='' is what the csv module asks for when it is handed a file object.
# The encoding is pinned to UTF-8 because the rows contain Arabic text, which
# a non-UTF-8 platform default (e.g. a Windows code page) cannot encode.
with open(fpath, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row, in the same order as the lists appended to `data`.
    writer.writerow(
        ['UPOS', 'Form', 'Lemma', 'Plurals', 'isSalem', 'Gender', 'Ref'])
    writer.writerows(data)
