# Word processing

In [30]:
import json
from collections import defaultdict

In [31]:
def read_json_object(obj):
    """Extract the fields of interest from one parsed dictionary entry.

    Args:
        obj: Mapping for a single entry (one decoded JSON line). The keys
            read are ``word``, ``pos``, ``senses``, ``sounds`` and
            ``translations``; any of them may be missing.

    Returns:
        A dict with the keys:
            ``word``: value of ``obj["word"]`` or None.
            ``pos``: value of ``obj["pos"]`` or None.
            ``definitions``: de-duplicated glosses of all senses in
                first-seen order, or None when the entry has no senses.
            ``ipa``: first non-empty ``ipa`` found in ``sounds``, or None.
            ``mp3_url``: first non-empty ``mp3_url`` found in ``sounds``,
                or None.
            ``translation``: first non-empty ``word`` of a translation whose
                ``lang`` is ``"Spanish"``, or None.
    """
    senses = obj.get("senses")
    sounds = obj.get("sounds")
    translations = obj.get("translations")

    # Definitions
    unique_definitions = None
    if senses:
        glosses = []
        for sense in senses:
            glosses.extend(sense.get("glosses") or [])
        # dict.fromkeys drops duplicates but keeps first-seen order;
        # a set would return the glosses in an arbitrary order.
        unique_definitions = list(dict.fromkeys(glosses))

    # Audio file and IPA pronunciation: keep the first non-empty value of each
    ipa = None
    mp3_url = None
    for sound in sounds or []:
        if ipa is None:
            ipa = sound.get("ipa") or None
        if mp3_url is None:
            mp3_url = sound.get("mp3_url") or None
        if ipa and mp3_url:
            break

    # Spanish translation
    spanish_translation = None
    for candidate in translations or []:
        # Some translation records carry no "word"; skip those instead of
        # failing with KeyError and keep looking for a usable one.
        if candidate.get("lang") == "Spanish" and candidate.get("word"):
            spanish_translation = candidate["word"]
            break

    return {
        "word": obj.get("word"),
        "pos": obj.get("pos"),
        "definitions": unique_definitions,
        "ipa": ipa,
        "mp3_url": mp3_url,
        "translation": spanish_translation,
    }

In [43]:
def process_words_from_file(filename, word_dict, max_items=20):
    """Read a JSON Lines file and merge its entries into ``word_dict``.

    Each non-blank line is decoded with ``json.loads`` and reduced with
    ``read_json_object``. Results are grouped by word: the glosses are
    stored per part of speech under ``"definitions"``, while ``"ipa"``,
    ``"mp3_url"`` and ``"translation"`` keep the first non-empty value seen
    for that word.

    Args:
        filename: Path of the UTF-8 encoded file, one JSON object per line.
        word_dict: Dict-like object updated in place, keyed by word.
        max_items: Maximum number of lines to read from the start of the
            file (blank and malformed lines count towards the limit).
            ``None`` reads the whole file.

    Returns:
        None. Lines that are not valid JSON are reported with ``print`` and
        skipped.
    """
    with open(filename, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_items is not None and i >= max_items:
                break

            if not line.strip():
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding line {i + 1}: {e}")
                print(f"Line content: {line[:200]}")
                continue

            word_obj = read_json_object(obj)
            word = word_obj["word"]
            if word is None:
                # An entry without a "word" would otherwise be filed under
                # the key None.
                continue

            pos = word_obj["pos"]
            definitions = word_obj["definitions"]

            # .get() never triggers a defaultdict factory, and it also lets
            # a pre-existing None placeholder be treated as "not seen yet".
            entry = word_dict.get(word)
            if entry is None:
                word_dict[word] = {
                    "definitions": {pos: definitions},
                    "ipa": word_obj["ipa"],
                    "mp3_url": word_obj["mp3_url"],
                    "translation": word_obj["translation"],
                }
                continue

            known = entry["definitions"].get(pos)
            if known is None:
                entry["definitions"][pos] = definitions
            elif definitions:
                # Same word and part of speech seen again: merge the new
                # glosses instead of discarding them.
                for gloss in definitions:
                    if gloss not in known:
                        known.append(gloss)

            # Fill in fields that earlier entries of this word left empty
            for key in ("ipa", "mp3_url", "translation"):
                if not entry[key] and word_obj[key]:
                    entry[key] = word_obj[key]

In [44]:
# Path of the JSON Lines file to process
filename = "datasets/raw-wiktextract-data.jsonl"

# Dictionary that collects the resulting words.
# NOTE(review): with this factory, looking up an unknown word does not raise;
# it inserts that word with the value None.
word_dict = defaultdict(lambda: None)

# Fill word_dict from the file (limited by the function's default max_items)
process_words_from_file(filename, word_dict)

In [45]:
# Print every collected word with its attributes; definitions are listed
# per part of speech, one gloss per line.
for word, attributes in word_dict.items():
    print(f"\n-------------- {word} -------------")
    # word_dict's default factory can leave None values behind after a lookup
    # of an unknown word; treat those as entries without attributes.
    for attribute, value in (attributes or {}).items():
        print(f"{attribute}:")
        if attribute == "definitions":
            for pos, glosses in value.items():
                print(f"   {pos}")
                # The gloss list is None for entries that had no senses
                for gloss in glosses or []:
                    print(f"     - {gloss}")
        else:
            print(value)


-------------- dictionary -------------
definitions:
   noun
     - A synchronic dictionary of a standardised language held to only contain words that are properly part of the language.
     - An associative array, a data structure where each value is referenced by a particular key, analogous to words and definitions in a dictionary (sense 1).
     - A reference work with a list of words from one or more languages, normally ordered alphabetically, explaining each word's meanings (senses), and sometimes also containing information on its etymology, pronunciation, usage, semantic relations, and translations, as well as other data.
     - Any work that has a list of material organized alphabetically; e.g., biographical dictionary, encyclopedic dictionary.
   verb
     - To look up in a dictionary.
     - To add to a dictionary.
     - To compile a dictionary.
ipa:
/ˈdɪk.ʃə.nə.ɹi/
mp3_url:
https://upload.wikimedia.org/wikipedia/commons/transcoded/1/1f/En-uk-dictionary.ogg/En-uk-dictionary