In [209]:
import json

In [210]:
category_keywords = {
"daily life": [
        "home", "food", "eat", "shop", "family", "house", "daily",
        "routine", "pet", "clean", "cook", "laundry", "grocery", 
        "neighborhood", "chores"],
"work": [
        "job", "employee", "office", "business", "company", "salary",
        "work", "meeting", "boss", "colleague", "promotion", "deadline", 
        "career", "contract"],
"social life": [
        "friend", "party", "relationship", "social", "talk", "meet",
        "chat", "hangout", "gathering", "date", "group", "celebration", 
        "invite", "interaction"],
"education": [
        "school", "university", "study", "learn", "education", "class",
        "teacher", "lecture", "exam", "homework", "degree", "student", 
        "notebook", "textbook"],
"travel": [
        "travel", "trip", "flight", "airport", "hotel", "tourism",
        "vacation", "luggage", "passport", "guide", "tour", "resort", 
        "itinerary", "destination"],
"health": [
        "hospital", "doctor", "medicine", "disease", "clinic", "illness",
        "treatment", "nurse", "surgery", "vaccine", "symptom", "checkup", 
        "appointment", "recovery"]
}


### How it works, example

The `classify_words` function analyzes a word's definitions and classifies
it into one or more predefined categories based on keyword matching.

Given a `text_data` like this:
```python
line_data = "To go on a journey or trip. To move from one place to another."
```
And `category_keywords`.

Calling the `classify_words` function **would return:**
- ["category01", "category02", "...", ...]


In [211]:
def classify_words(text_data: str, category_keywords: dict) -> list: 
    categories = []
    text = text_data.lower()
    
    for category, keywords in category_keywords.items():
        if any(keyword in text for keyword in keywords):
            categories.append(category)
            
    return categories if categories else ["other"]


In [None]:
# A dictionary to store the classified words
classified_words = {}

text = ""

with open("datasets/words-de02e1507605431abd5d829d7e868af5.jsonl", "r", 
        encoding="utf-8") as file:
    for line in file:
        word_entry = json.loads(line)
        for word, word_data in word_entry.items():
            all_texts = []
            text = " "
            # Extract the word and the value of the "definitions" key
            definitions = word_data.get("definitions", [])
            
            # Extract all the text values from the "definitions" key
            definition_texts = definitions[0].get("definitions", [])
                
            # Join both lists into a one list
            all_texts.extend(definition_texts)
                
            # Join all the texts into a single string
            text = " ".join(all_texts)    
            # Calling the classify_words function to classify the word
            category = classify_words(text, category_keywords)
            classified_words[word] = category
        
            
    for word, category in classified_words.items():
        print(f"{word}: {category}")

dictionary: ['work']
free: ['social life']
thesaurus: ['other']
encyclopedia: ['work']
portmanteau: ['daily life', 'education', 'travel']
cat: ['daily life', 'travel']
word: ['social life']
book: ['work', 'education']
pound: ['work']
GDP: ['other']
pond: ['other']
nonsense: ['education']
pie: ['other']
crow: ['daily life']
raven: ['daily life']
elephant: ['daily life']
brown: ['daily life']
December: ['other']
month: ['other']
January: ['other']
February: ['other']
march: ['daily life']
April: ['other']
may: ['other']
June: ['other']
July: ['other']
august: ['other']
September: ['other']
October: ['other']
November: ['other']
multiculturalism: ['social life']
day: ['work', 'education']
Monday: ['other']
Tuesday: ['other']
Wednesday: ['other']
Thursday: ['other']
Friday: ['work']
Saturday: ['other']
Sunday: ['travel']
lexicography: ['social life']
antonym: ['other']
connotation: ['other']
denotation: ['other']
synonym: ['other']
dialect: ['daily life', 'social life', 'education']
hypony