# Word processing

This notebook extracts the relevant information about each English word from the provided JSONL file.
The fields extracted for each word are:
- English word 
- Type of word (adjective, noun, etc.)
  - Definition
- Audio file
- Spanish translation
- IPA pronunciation

In [231]:
import json
from collections import defaultdict
import uuid

In [232]:
def print_dict(word_dict: dict):
    """Pretty-print every word in ``word_dict`` together with its attributes.

    Args:
        word_dict: Mapping of word -> attribute mapping. The ``"definitions"``
            attribute is expected to map a part of speech to a list of
            glosses; every other attribute is printed as-is.

    A part of speech whose gloss list is ``None`` (no definitions were
    extracted for it) is printed as a bare heading instead of raising
    ``TypeError``.
    """
    for word, attributes in word_dict.items():
        print(f"\n-------------- {word} -------------")
        for attribute, value in attributes.items():
            print(f"{attribute}:")
            if attribute != "definitions":
                print(value)
                continue

            # Definitions are grouped by part of speech; tolerate a missing
            # mapping or a missing gloss list rather than crashing mid-print.
            for pos, glosses in (value or {}).items():
                print(f"   {pos}")
                for gloss in glosses or ():
                    print(f"     - {gloss}")

In [233]:
def get_word_from_json(obj: dict) -> "dict | None":
    """Extract the relevant keys for a word from one parsed JSON entry.

    Args:
        obj: One parsed entry. The keys read are ``lang``, ``word``, ``pos``,
            ``senses`` (items with ``glosses``), ``sounds`` (items with
            ``ipa`` / ``mp3_url``) and ``translations`` (items with ``lang``
            / ``word``).

    Returns:
        ``None`` when the entry's language is not ``"English"``. Otherwise a
        dict with the keys ``word``, ``pos``, ``definitions``, ``ipa``,
        ``mp3_url`` and ``translation``. A value is ``None`` when the
        corresponding information is absent from ``obj``.
    """
    # Only English entries are of interest.
    if obj.get("lang") != "English":
        return None

    # Definitions: {senses: [{glosses: [...]}, ...]}
    unique_definitions = None
    senses = obj.get("senses")
    if senses:
        definitions = []
        for sense in senses:
            glosses = sense.get("glosses")
            if glosses:
                definitions += glosses

        # dict.fromkeys removes duplicates while keeping first-seen order;
        # a set would reorder the glosses differently from run to run.
        unique_definitions = list(dict.fromkeys(definitions))

    # Audio file and IPA pronunciation: {sounds: [{ipa: x, mp3_url: x}, ...]}
    # The first non-empty value of each kind wins.
    ipa = None
    mp3_url = None
    for sound in obj.get("sounds") or ():
        if ipa is None and sound.get("ipa"):
            ipa = sound["ipa"]
        if mp3_url is None and sound.get("mp3_url"):
            mp3_url = sound["mp3_url"]
        if ipa and mp3_url:
            break

    # Spanish translation: {translations: [{lang: x, word: x}, ...]}
    # Only the first Spanish translation with a non-empty word is kept.
    spanish_translation = None
    for translation in obj.get("translations") or ():
        if translation.get("lang") == "Spanish" and translation.get("word"):
            spanish_translation = translation["word"]
            break

    return {
        "word": obj.get("word"),
        "pos": obj.get("pos"),
        "definitions": unique_definitions,
        "ipa": ipa,
        "mp3_url": mp3_url,
        "translation": spanish_translation,
    }

In [234]:
def add_word_to_dict(word_obj:dict, word_dict:dict)->None:
    """Add an extracted word to a dictionary, merging with any existing entry.

    Args:
        word_obj: Dict with the keys ``word``, ``pos``, ``definitions``,
            ``ipa``, ``mp3_url`` and ``translation``.
        word_dict: Mapping of word -> entry; modified in place. Each entry
            has the keys ``definitions`` (part of speech -> list of glosses),
            ``ipa``, ``mp3_url`` and ``translations`` (list or ``None``).

    When the word already exists:
      * a new translation is appended only if it is not already listed;
      * glosses for an already-known part of speech are merged (duplicates
        removed, first-seen order kept) instead of being discarded;
      * ``ipa`` / ``mp3_url`` are filled in only if they were still empty.
    """
    word = word_obj["word"]
    part_of_speech = word_obj["pos"]
    definitions = word_obj["definitions"]
    translation = word_obj["translation"]

    # Store a copy so later merges never mutate the caller's list.
    own_definitions = list(definitions) if definitions is not None else None

    # Insert a new entry
    if word not in word_dict:
        word_dict[word] = {
            "definitions": {part_of_speech: own_definitions},
            "ipa": word_obj["ipa"],
            "mp3_url": word_obj["mp3_url"],
            "translations": [translation] if translation else None,
        }
        return

    # Update an existing entry
    entry = word_dict[word]

    if translation:
        if entry["translations"] is None:
            entry["translations"] = []
        if translation not in entry["translations"]:
            entry["translations"].append(translation)

    if part_of_speech:
        by_pos = entry["definitions"]
        if part_of_speech not in by_pos:
            by_pos[part_of_speech] = own_definitions
        elif definitions:
            # Same part of speech seen again: keep the glosses already
            # stored and append the ones not seen before.
            known = by_pos[part_of_speech] or []
            by_pos[part_of_speech] = list(dict.fromkeys([*known, *definitions]))

    # Fill in pronunciation data that earlier entries were missing.
    for key in ("ipa", "mp3_url"):
        if not entry[key] and word_obj[key]:
            entry[key] = word_obj[key]

In [235]:
def dict_to_json(word_dict:dict)->None:
    """Write ``word_dict`` to a new, uniquely named JSON file.

    The file is created as ``datasets/word_files/words-<uuid4 hex>.json``
    relative to the current working directory. The directory is created if
    it does not exist yet.

    Args:
        word_dict: JSON-serialisable mapping to write.

    The file is written as UTF-8 with ``ensure_ascii=False`` so that IPA
    symbols and accented characters are stored as readable text rather than
    ``\\uXXXX`` escapes; the parsed content is unchanged.
    """
    import os  # local import: this helper is the only user of os in the block

    # Unique file name
    filename = f"datasets/word_files/words-{uuid.uuid4().hex}.json"

    # A fresh checkout may not have the output directory yet.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Serialise straight into the file with an explicit encoding so the
    # result does not depend on the platform's default.
    with open(filename, "w", encoding="utf-8") as outfile:
        json.dump(word_dict, outfile, indent=4, ensure_ascii=False)
    

In [236]:
def prune_word_dict(word_dict:dict):
    """Remove, in place, every word whose entry is incomplete.

    An entry is incomplete when:
      * any of its top-level values is ``None`` (e.g. no IPA, no audio URL,
        no translations), or
      * its ``"definitions"`` value is a dict in which no part of speech has
        at least one gloss. ``"definitions"`` itself is never ``None``, so
        the ``None`` check alone cannot detect a word without definitions.

    Args:
        word_dict: Mapping of word -> entry dict; modified in place.
    """

    def _is_incomplete(info: dict) -> bool:
        # True when the entry lacks a top-level value or any definition.
        if None in info.values():
            return True
        definitions = info.get("definitions")
        return isinstance(definitions, dict) and not any(definitions.values())

    # Collect first, delete afterwards: a dict must not change size while
    # it is being iterated.
    incomplete_words = [
        word for word, info in word_dict.items() if _is_incomplete(info)
    ]

    for word in incomplete_words:
        del word_dict[word]

In [None]:
def process_words_from_file(filename:str, max_words:int=-1, 
                            batch_size:int=100)->int:
    """Extract words from a JSONL dump and save them in batched JSON files.

    Each non-empty line of ``filename`` is parsed as JSON, reduced with
    ``get_word_from_json`` and merged into the current batch with
    ``add_word_to_dict``. When a new word arrives and the batch is full, the
    batch is pruned with ``prune_word_dict`` and, if it is still full,
    written with ``dict_to_json``.

    Args:
        filename: Path of the JSONL file; read as UTF-8.
        max_words: Maximum number of words to save; ``-1`` means no limit.
        batch_size: Maximum number of words per output file.

    Returns:
        The number of words written.

    The effective batch limit is capped by the number of words still allowed
    by ``max_words``, so the total never exceeds ``max_words`` even when
    ``max_words`` is smaller than, or not a multiple of, ``batch_size``.
    Lines that are not valid JSON are reported and skipped.
    """
    word_dict = {}  # Words of the batch currently being assembled
    word_count = 0  # Words already written to disk
    unlimited = max_words == -1

    with open(filename, "r", encoding="utf-8") as f:
        # Read file line by line
        for i, line in enumerate(f):
            # Stop once the requested number of words has been saved
            if not unlimited and word_count >= max_words:
                break

            # Skip empty lines
            if not line.strip():
                continue

            # Only json.loads can raise JSONDecodeError, so keep the try small.
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding line {i + 1}: {e}")
                print(f"Line content: {line[:200]}")
                continue

            # A valid JSON line that is not an object carries no word data.
            if not isinstance(obj, dict):
                continue

            word_obj = get_word_from_json(obj)
            if word_obj is None:
                continue

            word = word_obj["word"]
            if not word:
                # Without a word there is no key to store the entry under.
                continue

            # Never let a batch grow past what max_words still allows;
            # a limit below 1 would flush empty batches.
            limit = max(1, batch_size)
            if not unlimited:
                limit = min(limit, max_words - word_count)

            # Flush only when a *new* word arrives, so further entries of a
            # word already in the batch can still be merged into it.
            if word not in word_dict and len(word_dict) >= limit:
                prune_word_dict(word_dict)  # Delete incomplete words

                if len(word_dict) >= limit:
                    dict_to_json(word_dict)
                    word_count += len(word_dict)
                    word_dict.clear()

                    # Do not start another batch once the cap is reached;
                    # the pending word would otherwise be saved as an extra.
                    if not unlimited and word_count >= max_words:
                        break

            add_word_to_dict(word_obj, word_dict)

    # Save whatever is left in the last, partially filled batch
    if word_dict:
        prune_word_dict(word_dict)

        if word_dict:
            dict_to_json(word_dict)
            word_count += len(word_dict)
            word_dict.clear()

    return word_count

In [238]:
# Path of the raw JSONL dump to process
filename = "datasets/raw-wiktextract-data.jsonl"

# Save at most 10000 words, written in files of at most 2000 words each
count = process_words_from_file(
    filename,
    max_words=10000,
    batch_size=2000,
)

In [239]:
# Report the number of words returned by process_words_from_file
print(f"Saved {count} words.")

Saved 10000 words.
