In [17]:
import json

In [None]:
category_keywords = {
        "daily life": ["home", "food", "eat", "shop", "family", "house", 
        "daily","routine", "pet"],
        "work": ["job", "employee", "office", "business", "company", "salary",
        "work"],
        "social life": ["friend", "party", "relationship", "social", "talk", 
        "meet", "chat"],
        "education": ["school", "university", "study", "learn", "education", 
        "class", "teacher"],
        "travel": ["travel", "trip", "flight", "airport", "hotel", "tourism",
        "vacation"],
        "health": ["hospital", "doctor", "medicine", "disease", "clinic"],
}

### How it works, example

The `classify_words` function analyzes a word's definitions and classifies
it into one or more predefined categories based on keyword matching.

Given a `line_data` like this:
```python
line_data = {
    "definitions": [
        {"definitions": ["To go on a journey or trip.", "To move from one place to another."]}
    ]
}
```
And `category_keywords`.

Calling the `classify_words` function **would return:**
- ["category"]


In [None]:
def classify_words(line_data: dict, category_keywords: list) -> list: 
    """ Classify words based on their definitions.

    Args:
        line_data (dict): JSON Line data containing information about
        the words.
        category_keywords (list): a list of keywords for each category.

    Returns:
        list: A list of categories that the words belong to.
    """    
    categories = []
    # Save the value of the "definitions" key from the line_data dictionary
    definitions = line_data.get("definitions", [])
    # Save the definitions in a list
    definitions_texts = []
    
    # Iterate through the definitions and extract the text
    for entry in definitions:
        for definition in entry.get("definitions", []):
            definitions_texts.append(definition.lower())
            
    # Check if any of the keywords are present in the definitions
    for category, keywords in category_keywords.items():
        if any(keyword in definition for definition in definitions_texts
            for keyword in keywords):
            categories.append(category)
    
    # Check if the categories list is empty
    if categories:
        return categories
    else:
        return ["other"]


In [None]:
# A dictionary to store the classified words
classified_words = {}

with open("datasets/words-de02e1507605431abd5d829d7e868af5.jsonl", "r", 
        encoding="utf-8") as file:
    for line in file:
        word_entry = json.loads(line)
        for word, word_data in word_entry.items():
            category = classify_words(word_data, category_keywords)
            classified_words[word] = category
            
for word, category in classified_words.items():
    print(f"{word}: {category}")


dictionary: ['work']
free: ['other']
thesaurus: ['other']
encyclopedia: ['work']
portmanteau: ['daily life', 'education', 'travel']
cat: ['daily life', 'travel']
word: ['other']
book: ['work', 'education', 'travel']
pound: ['daily life', 'work']
GDP: ['other']
pond: ['other']
nonsense: ['other']
pie: ['other']
crow: ['daily life']
raven: ['daily life']
elephant: ['daily life']
brown: ['daily life']
December: ['other']
month: ['other']
January: ['other']
February: ['other']
march: ['daily life']
April: ['other']
may: ['other']
June: ['other']
July: ['other']
august: ['other']
September: ['other']
October: ['other']
November: ['other']
multiculturalism: ['social life']
day: ['work', 'education']
Monday: ['other']
Tuesday: ['other']
Wednesday: ['other']
Thursday: ['other']
Friday: ['work']
Saturday: ['other']
Sunday: ['travel']
lexicography: ['social life']
antonym: ['other']
connotation: ['other']
denotation: ['other']
synonym: ['other']
dialect: ['daily life', 'social life']
hyponym: ['