# Word processing

This feature extracts the relevant information for each word in the JSONL dump produced by [wiktextract](https://github.com/tatuylonen/wiktextract) and saves the words to JSON files.

The necessary information for a word is:

- Word in English
- Part of speech (adjective, noun, etc.)
  - Definition for the word as the part of speech
- Audio file of the word's pronunciation
- Spanish translation(s) of the word
- IPA pronunciation

In [240]:
import json
import os
import uuid
from collections import defaultdict

In [241]:
def print_word_dict(word_dict: dict) -> None:
    """Print every word of an accumulated words dictionary to stdout.

    Args:
        word_dict: Mapping of word -> attribute dictionary. The
            "definitions" attribute is expected to map each part of speech
            to a list of glosses (or to None when there are none); every
            other attribute is printed as-is.
    """
    for word, attributes in word_dict.items():
        print(f"\n-------------- {word} -------------")
        for attribute, value in attributes.items():
            print(f"{attribute}:")
            if attribute == "definitions":
                for pos, glosses in value.items():
                    print(f"   {pos}")
                    # A part of speech can be stored with None instead of a
                    # list when the source entry had no senses; print only
                    # its heading instead of failing on iteration.
                    for gloss in glosses or ():
                        print(f"     - {gloss}")
            else:
                print(value)

### 1. Extract word from json

This function takes a JSON object parsed from a non-empty line of the JSONL file and builds a dictionary with all the relevant information for a word. Entries whose language is not English are skipped (the function returns `None`).

The final dictionary has the following structure:

```
word_obj = {
        "word": word,
        "pos" : part_of_speech,
        "definitions" : unique_definitions,
        "ipa": ipa,
        "mp3_url" : mp3_url,
        "translation" : spanish_translation,
    }
```

In [242]:
def get_word_from_json(obj: dict) -> "dict | None":
    """Extract the relevant fields for one English word entry.

    Args:
        obj: One parsed entry of the JSONL dump. The keys read are "lang",
            "word", "pos", "senses", "sounds" and "translations".

    Returns:
        None when the entry's "lang" is not "English". Otherwise a dictionary
        with the keys:
            "word": the word itself.
            "pos": its part of speech.
            "definitions": de-duplicated glosses in first-seen order, an
                empty list when no sense has glosses, or None when the entry
                has no senses.
            "ipa": the first non-empty IPA pronunciation, or None.
            "mp3_url": the first non-empty MP3 URL, or None.
            "translation": the first Spanish translation, or None.
    """
    # Only English entries are of interest.
    if obj.get("lang") != "English":
        return None

    # Definitions
    # {senses: [{glosses: [x, ...]}, ...]}
    unique_definitions = None
    senses = obj.get("senses")
    if senses:
        glosses = []
        for sense in senses:
            glosses.extend(sense.get("glosses") or ())
        # dict.fromkeys drops duplicates while keeping first-seen order; a
        # set would return the definitions in an arbitrary order that can
        # differ between runs.
        unique_definitions = list(dict.fromkeys(glosses))

    # Audio file and IPA pronunciation
    # {sounds: [{ipa: x, mp3_url: x}, ...]}
    ipa = None
    mp3_url = None
    for sound in obj.get("sounds") or ():
        if ipa is None and sound.get("ipa"):
            ipa = sound["ipa"]
        if mp3_url is None and sound.get("mp3_url"):
            mp3_url = sound["mp3_url"]
        # Both values found: no need to look at the remaining sounds.
        if ipa and mp3_url:
            break

    # Spanish translation
    # {translations: [{lang: x, word: x}, ...]}
    spanish_translation = None
    for translation in obj.get("translations") or ():
        if translation.get("lang") == "Spanish" and translation.get("word"):
            spanish_translation = translation["word"]
            break

    return {
        "word": obj.get("word"),
        "pos": obj.get("pos"),
        "definitions": unique_definitions,
        "ipa": ipa,
        "mp3_url": mp3_url,
        "translation": spanish_translation,
    }

### 2. Add an extracted word to an accumulated words dictionary 

The JSONL dump provides multiple entries for the same word, one for each part of speech it can take (noun, adjective, verb, etc.).

To avoid having multiple entries of the same word, this function groups different entries for the same word. For every different part of speech, its corresponding definitions are saved under it.

The structure for the final dictionary is as follows:

```
word_dict = {
    word: {
        "definitions": {
            part_of_speech: [list of definitions]
        },
        "ipa": ipa,
        "mp3_url": mp3_url,
        "translations": [list of translations],
    }
}
```

In [243]:
def add_word_to_dict(word_obj: dict, word_dict: dict) -> None:
    """Merge one extracted word entry into the accumulated words dictionary.

    Args:
        word_obj: Entry with the keys "word", "pos", "definitions", "ipa",
            "mp3_url" and "translation".
        word_dict: Accumulator, modified in place. Each word maps to a
            dictionary with the keys "definitions" (part of speech -> list
            of definitions), "ipa", "mp3_url" and "translations".

    The first entry seen for a word creates its record. Later entries add
    their translation (without repeating one already stored), add or extend
    the definitions of their part of speech, and fill in "ipa" / "mp3_url"
    only while those are still empty.
    """
    word = word_obj["word"]
    part_of_speech = word_obj["pos"]
    definitions = word_obj["definitions"]
    translation = word_obj["translation"]

    # Store a copy so that later merges never mutate the caller's list.
    own_definitions = list(definitions) if definitions is not None else None

    # Insert a new entry. Membership is tested with `in` (not indexing) so a
    # defaultdict accumulator does not create the key as a side effect.
    if word not in word_dict:
        word_dict[word] = {
            "definitions": {part_of_speech: own_definitions},
            "ipa": word_obj["ipa"],
            "mp3_url": word_obj["mp3_url"],
            "translations": [translation] if translation else None,
        }
        return

    # Update an existing entry
    entry = word_dict[word]

    # Add the translation unless it is already recorded.
    if translation:
        if entry["translations"] is None:
            entry["translations"] = []
        if translation not in entry["translations"]:
            entry["translations"].append(translation)

    # Add or extend the definitions for this part of speech.
    if part_of_speech:
        known = entry["definitions"].get(part_of_speech)
        if known is None:
            entry["definitions"][part_of_speech] = own_definitions
        elif definitions:
            # The same part of speech can appear in several entries of one
            # word; keep the definitions of all of them, without repeats.
            for definition in definitions:
                if definition not in known:
                    known.append(definition)

    # Update information on empty keys
    for key in ("ipa", "mp3_url"):
        if not entry[key] and word_obj[key]:
            entry[key] = word_obj[key]

### 3. Write the accumulated words dictionary to JSON file

This function writes the accumulated words, without repetition, to a uniquely named JSON file.

In [244]:
def dict_to_json(word_dict: dict, output_dir: str = "datasets/word_files") -> None:
    """Write a dictionary to a new, uniquely named JSON file.

    Args:
        word_dict: JSON-serialisable dictionary to write.
        output_dir: Directory that receives the file. It is created,
            including missing parent directories, when it does not exist.

    The file is named ``words-<random hex>.json`` so successive calls never
    overwrite each other. It is written as UTF-8 with non-ASCII characters
    kept as-is rather than escaped as ``\\uXXXX`` sequences.
    """
    # Opening a file for writing does not create missing directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Unique file name
    filename = os.path.join(output_dir, f"words-{uuid.uuid4().hex}.json")

    # An explicit encoding keeps the output independent of the platform's
    # default, which matters once ensure_ascii is turned off.
    with open(filename, "w", encoding="utf-8") as outfile:
        json.dump(word_dict, outfile, indent=4, ensure_ascii=False)
    

### 4. Eliminate words with empty values from the accumulated words dictionary

This function takes the accumulated words dictionary and removes every word that is missing any field.

In [245]:
def _is_incomplete_entry(info: dict) -> bool:
    """Return True when a word's record is missing any required field."""
    # A top-level field that was never filled in is stored as None.
    if None in info.values():
        return True

    # Missing definitions are nested inside the "definitions" dictionary
    # (e.g. {"noun": None} or {"noun": []}), so the check above cannot see
    # them. The word counts as defined when at least one part of speech has
    # a non-empty list.
    definitions = info.get("definitions")
    if isinstance(definitions, dict):
        return not any(definitions.values())

    return False


def prune_word_dict(word_dict: dict) -> None:
    """Remove, in place, every word whose record is incomplete.

    A record is incomplete when any of its top-level values is None, or when
    its "definitions" dictionary holds no non-empty list of definitions for
    any part of speech.

    Args:
        word_dict: Mapping of word -> record dictionary; modified in place.
    """
    # Collect first: a dictionary cannot change size while being iterated.
    incomplete_words = [
        word for word, info in word_dict.items() if _is_incomplete_entry(info)
    ]

    for word in incomplete_words:
        del word_dict[word]

### 5. Read JSONL dump and create JSON files with words

This is the driver function for all the above functions. 

This function:
1. Reads the JSONL dump line by line until the specified number of words (`max_words`) has been reached.
2. Extracts the pertinent word information.
3. Adds the word to an accumulated words dictionary.
4. Once the number of accumulated words reaches `batch_size`, it prunes incomplete words and, if enough words remain, saves the dictionary to a JSON file.
5. Returns the number of words saved.


In [246]:
def process_words_from_file(filename:str, max_words:int=-1, 
                            batch_size:int=100)->int:
    """
    Extract words from a JSONL dump and save them to JSON files in batches.

    The file is read line by line. Entries of the same word are merged into
    one record; when a new word arrives and the accumulator already holds a
    full batch, incomplete words are pruned and, if a full batch remains, it
    is written to its own JSON file. Lines that are not valid JSON are
    reported on stdout and skipped.

    Args:
        filename: Path to the JSONL file (read as UTF-8).
        max_words: Maximum number of words to save; -1 means no limit. The
            final batch is shrunk when needed so the total never exceeds
            this value.
        batch_size: Number of words per output JSON file (at least 1).

    Returns:
        int: Total number of words successfully saved.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    word_dict = {} # Dictionary to save resulting words
    word_count = 0
    unlimited = max_words == -1

    with open(filename, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            # Read max num of words
            if not unlimited and word_count >= max_words:
                break

            if not line.strip():
                continue

            try:
                obj = json.loads(line) # Read line as json object
            except json.JSONDecodeError as e:
                print(f"Error decoding line {line_number}: {e}")
                print(f"Line content: {line[:200]}")
                continue

            word_obj = get_word_from_json(obj)
            if word_obj is None:
                continue

            # Size at which the accumulator is flushed. Near the end it is
            # capped by what is left of `max_words`, so the last file cannot
            # push the total past the limit.
            if unlimited:
                threshold = batch_size
            else:
                threshold = min(batch_size, max_words - word_count)

            # Flush only when a *new* word arrives, so that further entries
            # of a word already in the batch can still be merged into it.
            if word_obj["word"] not in word_dict and len(word_dict) >= threshold:
                prune_word_dict(word_dict)

                # Check if still enough words
                if len(word_dict) >= threshold:
                    dict_to_json(word_dict)
                    word_count += len(word_dict)
                    word_dict.clear()

                    # Limit reached: do not start another batch with the
                    # current word.
                    if not unlimited and word_count >= max_words:
                        break

            add_word_to_dict(word_obj, word_dict)

    # Check for remaining words (only reachable with words left over when
    # the file ended before the limit was hit).
    if word_dict and (unlimited or word_count < max_words):
        prune_word_dict(word_dict)

        if word_dict:
            dict_to_json(word_dict)
            word_count += len(word_dict)
            word_dict.clear()

    return word_count

### 6. Call function

The call to the function should include the file from which to extract the words, and may also include the maximum number of words to save (`max_words`) and the number of words to save per JSON file (`batch_size`). If `max_words` is not specified, the whole file is read.

In [247]:
# Path of the raw JSONL dump to process.
filename = 'datasets/raw-wiktextract-data.jsonl' # Determine filename
# Process the dump with a limit of 10000 words, in batches of 2000 words per
# output JSON file; `count` receives the number of words saved.
count = process_words_from_file(filename, max_words=10000, batch_size=2000) 

In [248]:
# Report the total returned by process_words_from_file.
print(f"Saved {count} words.")

Saved 10000 words.
