In [2]:
import json
from transformers import pipeline

In [13]:
# Maps each short category name to the natural-language description that is
# handed to the zero-shot classifier as a candidate label (see classify_words).
category_labels = {
    "daily life": (
        "Situations related to everyday routines like eating, sleeping, "
        "shopping, hygiene, and home activities."
    ),
    "work": (
        "Concepts involving employment, jobs, office tasks, careers, "
        "or professional environments."
    ),
    "social life": (
        "Scenarios involving friends, communication, entertainment, "
        "community, and social interaction."
    ),
    "education": (
        "Topics involving learning, studying, schools, teaching, "
        "or acquiring knowledge."
    ),
    "travel": (
        "Experiences involving transportation, visiting places, vacations, "
        "tourism, and moving between locations."
    ),
    "health": (
        "Concepts related to physical or mental well-being, medicine, "
        "fitness, nutrition, and medical care."
    ),
}

In [14]:
# Build the transformers zero-shot-classification pipeline using the
# facebook/bart-large-mnli checkpoint. The resulting object is later passed to
# definitions_to_string as its `model` argument.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Printed once the pipeline object has been constructed, so the cell visibly
# reports completion.
print("Model loaded.")

Device set to use cpu


Model loaded.


In [None]:
def classify_words(text_data: str, category_labels: dict, classifier):
    """Score a piece of text against every category with a zero-shot classifier.

    The text is lower-cased and classified against the *descriptions* (the
    values of ``category_labels``); the descriptions returned by the
    classifier are then mapped back to their short category names.

    Args:
        text_data: Text to classify.
        category_labels: Mapping of category name -> description used as the
            candidate label.
        classifier: Callable invoked as ``classifier(text, candidate_labels)``
            that returns a mapping with parallel ``"labels"`` and ``"scores"``
            sequences.

    Returns:
        dict mapping category name -> score, in the order the classifier
        returned the labels. An empty dict is returned when the text is blank
        or ``category_labels`` is empty.
    """
    text = text_data.lower()

    # The classifier is not called with nothing to classify or nothing to
    # classify against: the zero-shot pipeline rejects empty input, so report
    # "no categories" instead of raising.
    if not text.strip() or not category_labels:
        return {}

    label_descriptions = list(category_labels.values())
    result = classifier(text, label_descriptions)

    # Reverse lookup: the classifier echoes back descriptions, callers want names.
    description_to_category = {
        description: name for name, description in category_labels.items()
    }

    return {
        description_to_category[label]: score
        for label, score in zip(result["labels"], result["scores"])
    }

In [None]:
def definitions_to_string(jsonl_data: str, classifier, model, labels=None) -> None:
    """Classify every word of a JSONL file by its definitions and print the scores.

    Each non-blank line of the file is parsed as a JSON object mapping a word
    to its data. For every word, the strings found under
    ``word_data["definitions"][0]["definitions"]`` (only the first entry of the
    outer list is used) are joined with single spaces and passed to
    ``classifier``. Words without any definition text are skipped.

    Args:
        jsonl_data: Path to the UTF-8 encoded JSONL file.
        classifier: Callable invoked as ``classifier(text, labels, model)``
            returning a mapping of category name -> score
            (``classify_words`` in this notebook).
        model: Passed through unchanged as the third argument of
            ``classifier`` (the zero-shot pipeline in this notebook).
        labels: Mapping passed as the second argument of ``classifier``.
            Defaults to the notebook-level ``category_labels``.

    Returns:
        None. For each classified word a ``"<word>:"`` header line is printed,
        followed by one ``"<word>: <category> (<score as percent>)"`` line per
        category.
    """
    if labels is None:
        # Backward-compatible default: the original relied on this global.
        labels = category_labels

    # word -> {category: score}, in file order
    word_categories = {}

    with open(jsonl_data, "r", encoding="utf-8") as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            if not line.strip():
                continue

            word_entry = json.loads(line)
            for word, word_data in word_entry.items():
                # A missing, null, or empty "definitions" list means there is
                # nothing to classify; indexing [0] on it would raise IndexError.
                definitions = word_data.get("definitions") or []
                if not definitions:
                    continue

                # Only the first definition group is considered.
                definition_texts = definitions[0].get("definitions") or []
                text = " ".join(definition_texts)
                if not text.strip():
                    continue

                word_categories[word] = classifier(text, labels, model)

    # Report after the file has been closed.
    for word, categories in word_categories.items():
        print(f"{word}:")
        for category, score in categories.items():
            print(f"{word}: {category} ({score:.2%})")

In [None]:
# Relative path of the JSONL word list to classify.
jsonl_data = "words-de02e1507605431abd5d829d7e868af5.jsonl"
# Note the argument mapping: the function `classify_words` is bound to the
# parameter named `classifier`, and the pipeline object `classifier` is bound to
# the parameter named `model`.
definitions_to_string(jsonl_data, classify_words, classifier)