In [None]:
import os
import time

import pandas as pd
from wikibaseintegrator import WikibaseIntegrator, wbi_login, datatypes
from wikibaseintegrator.models import Reference, References, Form, Sense
from wikibaseintegrator.models.qualifiers import Qualifiers
from wikibaseintegrator.wbi_config import config
from wikibaseintegrator.wbi_enums import ActionIfExists

In [None]:
# Wikidata credentials. They are read from the environment variables WDUSER and
# WDPASS so that no secret has to be stored in (or committed with) the notebook.
# Both fall back to '' when the variable is not set.
WDUSER = os.environ.get('WDUSER', '')
WDPASS = os.environ.get('WDPASS', '')

In [None]:
# Global WikibaseIntegrator settings: target API endpoint and the User-Agent
# string sent with every request.
config.update({
    'MEDIAWIKI_API_URL': 'https://www.wikidata.org/w/api.php',
    'USER_AGENT': 'Lexeme Write Notebook',
})

In [None]:
def setup_wikidata_connection():
    """Log in to Wikidata and return a WikibaseIntegrator using that login.

    The credentials are taken from the module-level WDUSER / WDPASS values.

    return
      WikibaseIntegrator instance bound to the login
    """
    session = wbi_login.Login(
        user=WDUSER,
        password=WDPASS,
        mediawiki_api_url='https://www.wikidata.org/w/api.php',
    )
    return WikibaseIntegrator(login=session)

In [None]:
# Setup Wikidata connection: performs the login and keeps the resulting
# WikibaseIntegrator instance in `wbi`, which the upload loop uses to create lexemes.
wbi = setup_wikidata_connection()

In [None]:
def removeHieroglyphicControls(text):
    """Remove invisible control characters from a hieroglyphic Unicode string.

    The following code points are dropped, everything else is kept in order:
      * U+13430..U+13440  alignment controls
      * U+FE00..U+FE0F    variation selectors
      * U+200B, U+200C    zero width space and zero width non-joiner

    param
      STRING text: Unicode string (None or "" are accepted)
    return
      STRING cleared text; "" when text is None or empty
    """
    if not text:
        return ""

    # Both zero-width characters are removed: the documented intent names the
    # zero width non-joiner (U+200C), while earlier code tested U+200B only.
    zero_width = ("\u200b", "\u200c")

    return ''.join(
        sign for sign in text
        if not (
            "\U00013430" <= sign <= "\U00013440"   # alignment controls
            or "\ufe00" <= sign <= "\ufe0f"        # variation selectors
            or sign in zero_width
        )
    )

In [None]:
# Map POS to Wikidata item IDs (used as the lexeme's lexical category).
# Keys are either a bare type ("verb") or "type|subtype" ("verb|verb_irr").
# Add more POS mappings as needed by extending the subtype tuples below.
_main_pos_groups = (
    # (Wikidata item, POS type, subtypes) -- "" stands for the bare type key
    ("Q34698", "adjective",  # adjective
     ("", "None", "nisbe_adjective_preposition", "nisbe_adjective_substantive")),
    ("Q380057", "adverb",  # adverb
     ("", "None", "prepositional_adverb")),
    ("Q147276", "entity_name",  # proper noun
     ("", "artifact_name", "gods_name", "org_name", "kings_name", "place_name")),
    ("Q1084", "epitheton_title",  # noun
     ("epith_god", "epith_king", "title")),
    ("Q83034", "interjection",  # interjection
     ("", "None")),
    ("Q63116", "numeral",  # numeral
     ("", "None", "cardinal", "ordinal")),
    ("Q184943", "particle",  # grammatical particle
     ("", "None", "particle_enclitic", "particle_nonenclitic")),
    ("Q4833830", "preposition",  # preposition
     ("", "None")),
    ("Q36224", "pronoun",  # pronoun
     ("", "None", "demonstrative_pronoun", "interrogative_pronoun",
      "personal_pronoun", "relative_pronoun")),
    ("Q1084", "substantive",  # noun
     ("", "None", "substantive_fem", "substantive_masc")),
    ("Q24905", "verb",  # verb
     ("", "None",
      "verb_2-gem", "verb_2-lit",
      "verb_3-gem", "verb_3-inf", "verb_3-lit",
      "verb_4-inf", "verb_4-lit",
      "verb_5-inf", "verb_5-lit",
      "verb_6-lit",
      "verb_caus_2-gem", "verb_caus_2-lit",
      "verb_caus_3-gem", "verb_caus_3-inf", "verb_caus_3-lit",
      "verb_caus_4-inf", "verb_caus_4-lit",
      "verb_caus_5-lit",
      "verb_irr")),
)

main_pos_map = {
    (f"{pos_type}|{subtype}" if subtype else pos_type): category_id
    for category_id, pos_type, subtypes in _main_pos_groups
    for subtype in subtypes
}

# More specific classification of a POS key, written as a P31 (instance of) claim.
# Cf. https://www.wikidata.org/wiki/Wikidata:Lexicographical_data/Documentation#Lexical_category
_causative_verb_classes = (
    "2-gem", "2-lit", "3-gem", "3-inf", "3-lit", "4-inf", "4-lit", "5-lit",
)

additional_pos_map = {
    # nisba
    "adjective|nisbe_adjective_preposition": "Q1054545",
    "adjective|nisbe_adjective_substantive": "Q1054545",
    # prepositional adverb
    "adverb|prepositional_adverb": "Q2034977",
    # kinds of names
    "entity_name|gods_name": "Q115642037",   # deity name
    "entity_name|kings_name": "Q115642102",  # royal name
    "entity_name|place_name": "Q7884789",    # toponym
    # epithets and titles
    "epitheton_title|epith_god": "Q207869",   # epithet
    "epitheton_title|epith_king": "Q207869",  # epithet
    "epitheton_title|title": "Q216353",       # title
    # numerals
    "numeral|cardinal": "Q1329258",  # cardinal numeral
    "numeral|ordinal": "Q923933",    # ordinal numeral
    # clitic
    "particle|particle_enclitic": "Q213458",
    # pronouns
    "pronoun|demonstrative_pronoun": "Q34793275",  # demonstrative pronoun
    "pronoun|interrogative_pronoun": "Q54310231",  # interrogative pronoun
    "pronoun|personal_pronoun": "Q468801",         # personal pronoun
    "pronoun|relative_pronoun": "Q1050744",        # relative pronoun
    # causative verb: identical item for every causative verb class
    **{
        f"verb|verb_caus_{verb_class}": "Q11870658"
        for verb_class in _causative_verb_classes
    },
    # irregular verb
    "verb|verb_irr": "Q70235",
}

# Kind of thing a name refers to, written as a P10476 (identifies) claim
pos_referent_map = {
    f"entity_name|{name_kind}": referent_id
    for name_kind, referent_id in (
        ("artifact_name", "Q16686448"),  # artificial object
        ("org_name", "Q43229"),          # organization
    )
}

# Map gender to Wikidata item IDs, written as a P5185 (grammatical gender) claim
gender_map = {
    f"substantive|substantive_{gender_suffix}": gender_id
    for gender_suffix, gender_id in (
        ("masc", "Q499327"),
        ("fem", "Q1775415"),
    )
}

# Map simple POS and phrases: lexical category item -> matching phrase type,
# written as a P31 (instance of) claim
phrase_map = dict([
    ("Q1084", "Q1401131"),     # noun / noun phrase
    ("Q147276", "Q1401131"),   # proper noun / noun phrase
    ("Q24905", "Q1778442"),    # verb / verb phrase
    ("Q34698", "Q357760"),     # adjective / adjectival phrase
    ("Q380057", "Q3734650"),   # adverb / adverbial phrase
    ("Q4833830", "Q56042915"), # preposition / prepositional phrase
])

In [None]:
def create_TLA_claim_refs (tla_id):
    """Build the reference group attached to claims taken from the TLA.

    The group holds a single reference: P248 (stated in) = Q122748326 (TLA).

    param
      tla_id: accepted for symmetry with create_TLA_claim; not used here
    return
      References object containing that one reference
    """
    stated_in_tla = Reference()
    stated_in_tla.add(datatypes.Item(prop_nr='P248', value='Q122748326')) # P248 stated in: TLA

    reference_group = References()
    reference_group.add(stated_in_tla)
    return reference_group

def create_TLA_claim (tla_id):
    """Build the claim P1343 (described by source) = Q122748326 (TLA).

    When tla_id is not empty, the claim is referenced with the lemma's URL in
    the Thesaurus Linguae Aegyptiae (P854); otherwise its reference group
    stays empty.

    param
      tla_id: TLA lemma ID, or None / "" when unknown
    return
      datatypes.Item claim carrying the reference group
    """
    reference_group = References()

    if tla_id:
        lemma_url = "https://thesaurus-linguae-aegyptiae.de/lemma/" + str(tla_id)
        url_reference = Reference()
        # NOTE(review): the URL is passed as a String snak; confirm whether
        # datatypes.URL is the intended type for P854.
        url_reference.add(datatypes.String(prop_nr='P854', value=lemma_url))
        reference_group.add(url_reference)

    # P1343 described by source: TLA
    return datatypes.Item(prop_nr='P1343', value='Q122748326', references=reference_group)

In [None]:
# Column headers of the Excel sheet (the *_row_name variables hold column names)
state_row_name = "INCLUDE"
tla_id_row_name = "ID"
pos_row_name = "POS"
transliteration_row_name = "TRANSLITERATION_LUT"
hieroglyphs_row_name = "HIEROGLYPHS"
senses_row_name_en = "EN"
senses_row_name_de = "DE"

# Read Excel data file; every cell is loaded as a string
dataframe = pd.read_excel('egy_lemmata_for_Wikidata-test.xlsx', dtype=str)

# One dict per spreadsheet row, with missing cells turned into None
lemmata_set = (
    dataframe
    .where(pd.notnull(dataframe), None)
    .to_dict(orient='records')
)

# Preview of the first rows
print(lemmata_set[:5])

In [None]:
# Process each row: build one lexeme per included spreadsheet row and write it
# to Wikidata. Rows that are not flagged for inclusion, or that lack a TLA ID,
# a mapped part of speech or a transliteration, are skipped without a log entry.
# Every attempted row is recorded in `results`, which is saved as CSV at the end.
results = []

for index, row in enumerate(lemmata_set, start=1):  # index: 1-based row number for the log
    # Reset per row so the error handler below never reports a previous row's values
    transliteration = None
    tla_id = None

    try:
        # Get and check main data
        state = row.get(state_row_name, None)
        if not (state == True or state == "True"):
            continue  # no entry without state TRUE

        tla_id = row.get(tla_id_row_name, None)
        if not tla_id:
            continue  # no entry without TLA ID

        pos = row.get(pos_row_name, None)
        if not pos:
            continue  # no entry without part of speech

        lexical_category_id = main_pos_map.get(pos, None)
        if lexical_category_id is None:
            continue  # no entry without a mapped lexical category

        transliteration = row.get(transliteration_row_name, None)
        if not transliteration:
            continue  # no entry without transliteration

        # Create TLA claim refs (for lexeme claims) and TLA claim (for senses)
        TLA_claim_references = create_TLA_claim_refs(tla_id)
        TLA_claim = create_TLA_claim(tla_id)

        # Compute basic language: TLA IDs starting with 'd' are treated as Demotic
        if isinstance(tla_id, str) and tla_id.startswith('d'):
            lang_id = "Q36765" # Demotic
        else:
            lang_id = "Q50868" # Egyptian

        # Create lexeme
        lexeme = wbi.lexeme.new(language=lang_id, lexical_category=lexical_category_id)

        lexeme.lemmas.set(language='egy-x-Q131362896', value=transliteration) # Q131362896 Leiden Unified Transliteration

        # Add lemma in hieroglyphs
        hieroglyphs = row.get(hieroglyphs_row_name, None)
        if hieroglyphs: # not None or empty
            hieroglyphs = removeHieroglyphicControls(hieroglyphs) # Remove controls
            if hieroglyphs: # something is left after removing the controls
                lexeme.lemmas.set(language='egy-x-Q132659', value=hieroglyphs) # Q132659 Egyptian hieroglyphs

        # Add additional POS
        additional_lexical_category_id = additional_pos_map.get(pos, None)
        if additional_lexical_category_id:
            specific_pos_claim = datatypes.Item(prop_nr='P31', value=additional_lexical_category_id, references=TLA_claim_references) # P31 instance of
            lexeme.claims.add(specific_pos_claim)

        # Add phrase information: a hyphen in the transliteration marks a phrase
        if "-" in transliteration:
            phrase_id = phrase_map.get(lexical_category_id, None)
            if phrase_id:
                phrase_claim = datatypes.Item(prop_nr='P31', value=phrase_id) # P31 instance of
                # FORCE_APPEND keeps a P31 claim added above instead of replacing it.
                # ActionIfExists comes from wikibaseintegrator.wbi_enums (import cell).
                lexeme.claims.add(phrase_claim, action_if_exists=ActionIfExists.FORCE_APPEND)

        # Add grammatical gender
        gender_id = gender_map.get(pos, None)
        if gender_id:
            gender_claim = datatypes.Item(prop_nr='P5185', value=gender_id, references=TLA_claim_references) # P5185 grammatical gender
            lexeme.claims.add(gender_claim)

        # Add POS referent
        referent_type_id = pos_referent_map.get(pos, None)
        if referent_type_id:
            pos_referent_claim = datatypes.Item(prop_nr='P10476', value=referent_type_id, references=TLA_claim_references) # P10476 identifies
            lexeme.claims.add(pos_referent_claim)

        # Add senses, en: the cell holds ';'-separated translations, one sense each
        senses_str = row.get(senses_row_name_en, None)
        if senses_str:
            for sense in senses_str.split(';'):
                sense = sense.strip() # clean translation
                for line_break_or_tab in ('\r\n', '\r', '\n', '\t'):
                    sense = sense.replace(line_break_or_tab, ' ')
                if sense: # skip empty parts, e.g. after a trailing ';'
                    sense_object = Sense()
                    sense_object.glosses.set(language='en', value=sense)
                    sense_object.claims.add(TLA_claim)
                    lexeme.senses.add(sense_object)

        # Add TLA Identifier
        tla_id_claim = datatypes.String(prop_nr='P12188', value=tla_id) # P12188 TLA lemma ID
        lexeme.claims.add(tla_id_claim)

        lexeme.write()
        print(tla_id, transliteration, pos, lexeme.id)
        results.append({'row': index, 'status': 'success', 'lexeme_id': str(tla_id), 'name': str(transliteration), 'wikidata_id': lexeme.id})

    except Exception as e:
        # Best effort: log the failure and continue with the next row
        print("Error:", str(tla_id or "n.v."), str(transliteration or "n.v."))
        results.append({'row': index, 'status': 'error', 'lexeme_id': str(tla_id or "n.v."), 'name': str(transliteration or "n.v."), 'error': str(e)})

# Save result log to file
pd.DataFrame(results).to_csv('wikidata_upload_results.csv', index=False)