# Word processing

This notebook extracts the relevant information about each word from the provided JSONL file.
The features extracted for each word are:
- English word 
- Type of word (adjective, noun, etc.)
  - Definition
- Audio file
- Spanish translation
- IPA pronunciation

In [15]:
import json
from collections import defaultdict
import datetime

In [13]:
def read_json_object(obj:dict)->dict:
    """ Extract relevant keys for a word from a JSON object.

    Args:
        obj: One parsed entry of the JSONL file. The keys read are "word",
            "pos", "senses", "sounds" and "translations"; any of them may
            be missing.

    Returns:
        A dictionary with the keys "word", "pos", "definitions", "ipa",
        "mp3_url" and "translation". A value is None when the matching
        information is absent from ``obj``. "definitions" is a list of
        unique glosses in first-seen order (possibly empty when the senses
        carry no glosses).
    """

    # Definitions
    # {senses: [{glosses: [x, ...]}, ...]}
    unique_definitions = None
    senses = obj.get("senses")
    if senses:
        definitions = []  # Every gloss of every sense, duplicates included
        for item in senses:
            definitions.extend(item.get("glosses") or [])

        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # result is the same on every run (a set would shuffle it).
        unique_definitions = list(dict.fromkeys(definitions))

    # Audio file and IPA pronunciation
    # {sounds: [{ipa: x, mp3_url: x}, ...]}
    ipa = None
    mp3_url = None
    for item in obj.get("sounds") or []:
        # Keep the first non-empty value found for each field
        if ipa is None and item.get("ipa"):
            ipa = item["ipa"]

        if mp3_url is None and item.get("mp3_url"):
            mp3_url = item["mp3_url"]

        if ipa is not None and mp3_url is not None:
            break

    # Spanish translation
    # {translations: [{lang: x, word: x}, ...]}
    spanish_translation = None
    for item in obj.get("translations") or []:
        # Some entries have a language but no "word"; skip them instead of
        # failing with a KeyError.
        if item.get("lang") == "Spanish" and item.get("word"):
            spanish_translation = item["word"]
            break

    # Add information to dictionary
    return {
        "word": obj.get("word"),
        "pos" : obj.get("pos"),
        "definitions" : unique_definitions,
        "ipa": ipa,
        "mp3_url" : mp3_url,
        "translation" : spanish_translation,
    }

In [None]:
def add_word_to_dict(word_obj:dict, word_dict:dict)->None:
    """Add an extracted word to a dictionary.

    Args:
        word_obj: Extracted word with the keys "word", "pos",
            "definitions", "ipa", "mp3_url" and "translation".
        word_dict: Dictionary updated in place. Each word maps to
            {"definitions": {pos: [gloss, ...]}, "ipa": ..., "mp3_url": ...,
            "translation": ...}.

    A word seen for the first time gets a new entry. For a word already
    present, its definitions are merged under the part of speech (without
    duplicates), and "ipa", "mp3_url" and "translation" are filled in only
    if they were still empty.
    """

    # Extract keys
    word = word_obj["word"]
    part_of_speech = word_obj["pos"]
    definitions = word_obj["definitions"]

    # Stored lists are copies so that later merges never modify the list
    # owned by the caller.
    definitions_copy = None if definitions is None else list(definitions)

    # Insert a new entry
    if word not in word_dict:
        word_dict[word] = {
            "definitions": {part_of_speech: definitions_copy},
            "ipa": word_obj["ipa"],
            "mp3_url": word_obj["mp3_url"],
            "translation": word_obj["translation"],
        }
        return

    # Update an existing entry
    entry = word_dict[word]
    known_definitions = entry["definitions"]
    existing = known_definitions.get(part_of_speech)

    if existing is None:
        # New part of speech, or one stored earlier without definitions
        known_definitions[part_of_speech] = definitions_copy
    elif definitions:
        # Same word and part of speech seen again: keep what is stored and
        # append only the glosses not seen yet.
        seen = set(existing)
        for gloss in definitions:
            if gloss not in seen:
                existing.append(gloss)
                seen.add(gloss)

    # Update information on empty keys
    for key in ["ipa", "mp3_url", "translation"]:
        if not entry[key] and word_obj[key]:
            entry[key] = word_obj[key]

In [None]:
def dict_to_json(word_dict:dict)->None:
    """ Write dictionary to a JSON file.

    The file is created as ``datasets/words-<YYYYmmdd-HHMMSS>.json``,
    relative to the current working directory. The ``datasets`` folder is
    created when it does not exist yet.

    Args:
        word_dict: Dictionary to serialize; it must be JSON-serializable.
    """
    import os  # Local import: only needed to create the output folder

    # `datetime` is the module here, so the class is reached through it.
    # The timestamp avoids ':' and spaces, which are not valid in file
    # names on every platform.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"datasets/words-{timestamp}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Serializing json
    json_object = json.dumps(word_dict, indent=4)

    # Writing to the timestamped file
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)
    

In [None]:
def process_words_from_file(filename:str, word_dict:dict, max_items:int=5, 
                            batch_size:int=100)->None:
    """ Extract word objects from a JSONL file and add them to a dictionary.

    Args:
        filename: Path of the JSONL file, one JSON object per line.
        word_dict: Dictionary updated in place with the extracted words.
        max_items: Number of lines read from the start of the file. Blank
            and malformed lines count towards this limit.
        batch_size: Once ``word_dict`` holds at least this many words, it
            is written to a JSON file after every ``batch_size`` processed
            entries, and once more at the end if entries are still
            unwritten. Nothing is written while the dictionary is smaller
            than ``batch_size``.

    Lines that are not valid JSON are reported with ``print`` and skipped.
    """
    # Entries processed since the dictionary was last written. Writing on
    # every line after the threshold would re-serialize the whole
    # dictionary each time.
    pending = 0

    with open(filename, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Read max num of items
            if i >= max_items:
                break

            # Skip a blank line
            if not line.strip():
                continue

            # Only the parsing step can raise JSONDecodeError
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding line {i + 1}: {e}")
                print(f"Line content: {line[:200]}")
                continue

            word_obj = read_json_object(obj) # Retrieve object
            add_word_to_dict(word_obj, word_dict)
            pending += 1

            if pending >= batch_size and len(word_dict) >= batch_size:
                dict_to_json(word_dict)
                pending = 0

    # Make sure the last written file reflects the final dictionary
    if pending and len(word_dict) >= batch_size:
        dict_to_json(word_dict)

In [10]:
# Determine filename of the raw data to read
filename = 'datasets/raw-wiktextract-data.jsonl'

# Dictionary to save resulting words (a missing key yields None)
word_dict = defaultdict(lambda: None)

# Read only the first 3 lines of the file
process_words_from_file(filename, word_dict, max_items=3)

<class 'dict'>
<class 'dict'>
<class 'dict'>


In [6]:
# Print every stored word, one attribute per section
for word, attributes in word_dict.items():
    print(f"\n-------------- {word} -------------")
    for attribute, value in attributes.items():
        print(f"{attribute}:")

        # Plain attributes are printed as they are
        if attribute != "definitions":
            print(value)
            continue

        # Definitions are grouped by part of speech, one gloss per line
        for pos, glosses in value.items():
            print(f"   {pos}")
            for gloss in glosses:
                print(f"     - {gloss}")


-------------- dictionary -------------
definitions:
   noun
     - A synchronic dictionary of a standardised language held to only contain words that are properly part of the language.
     - An associative array, a data structure where each value is referenced by a particular key, analogous to words and definitions in a dictionary (sense 1).
     - A reference work with a list of words from one or more languages, normally ordered alphabetically, explaining each word's meanings (senses), and sometimes also containing information on its etymology, pronunciation, usage, semantic relations, and translations, as well as other data.
     - Any work that has a list of material organized alphabetically; e.g., biographical dictionary, encyclopedic dictionary.
   verb
     - To look up in a dictionary.
     - To compile a dictionary.
     - To add to a dictionary.
ipa:
/ˈdɪk.ʃə.nə.ɹi/
mp3_url:
https://upload.wikimedia.org/wikipedia/commons/transcoded/1/1f/En-uk-dictionary.ogg/En-uk-dictionary