In this notebook we use `dictionaryapi` to find definitions for the newly added keywords in the [competition](https://www.kaggle.com/competitions/llm-20-questions)  
The goal is to use these definitions to:
1. Generate (good) questions with known answers for training purposes
1. Better understand what kinds of words can be `things` keywords, so we can generate more
1. Give more information to the agent with role `answerer` so it can respond more accurately (RAG ?)

Results are stored in `things_definitions.json`  
Note: 
- Some words have multiple definitions (e.g. `Plate` has many)
- For most keywords made of two or more words (e.g. `Air compressor`), dictionaryapi has no definition. Some two-word keywords do have definitions (e.g. `Contact lenses`)
- Almost all one-word keywords have at least one definition (exceptions: RV, IV, ...)


In [None]:
%%capture
!pip install jq

In [None]:
import pandas as pd
import requests
import json
import time
import jq
import random

In [None]:
# Load the competition keywords; the first CSV column is used as the index.
keywords = pd.read_csv(
    "/kaggle/input/llm-20-questions-games-dataset/keywords.csv",
    index_col=0,
)
keywords

In [None]:
%run /kaggle/input/llm-20-questions/llm_20_questions/keywords.py
keywords_old:set[str] = {details["keyword"] for category in json.loads(KEYWORDS_JSON) for details in category["words"]}
# [w for w in keywords.loc[keywords.category=="place","keyword"] if w not in keywords_old]
keywords_place = set(keywords.loc[keywords.category=="place","keyword"])
print(f"Current place keyword not in the previous json: {keywords_place-keywords_old}")
print(f"Old place keyword not in current: {keywords_old-keywords_place}")
del keywords_place, keywords_old

# Things

In [None]:
# Rows of the keyword table whose category is "things".
keywords.loc[keywords["category"] == "things"]

In [None]:
# URL template of the dictionary API; `{word}` is filled in for each lookup.
url_api = "https://api.dictionaryapi.dev/api/v2/entries/en/{word}"
# Smoke test: look up a single word and show the HTTP status of the response.
r = requests.get(
    url_api.format(word="Zinc"),
)
r.status_code

In [None]:
# Show the decoded JSON body of the test response fetched above.
r.json()

In [None]:
# Example jq program: skip an empty response, iterate over the entries of the
# response and keep the "meanings" field of each; `.first()` returns the first
# value the program produces.
(
    jq.compile('select(length > 0) []["meanings"]')
    .input_value(r.json())
    .first()
)

## Alternative in bash
jq is a command-line JSON processor

In [None]:
!curl "https://api.dictionaryapi.dev/api/v2/entries/en/{Wristband}" | jq '.[]["meanings"]'

In [None]:
!curl "https://api.dictionaryapi.dev/api/v2/entries/en/{Plate}" | jq '.[]'

In [None]:
# Length, in characters, of the longest keyword in the "things" category.
(
    keywords.loc[keywords["category"] == "things", "keyword"]
    .str.len()
    .max()
)

In [None]:
# Definitions collected by the previous version of this notebook.
with open("/kaggle/input/20questions-keywords-things-definitions/things_definitions.json","r") as previous_file:
    keywords_definitions = json.load(previous_file)

print(len(keywords_definitions), "things keywords total")
# Keep only the keywords for which at least one definition was stored.
keywords_definitions = dict(
    entry for entry in keywords_definitions.items() if len(entry[1]) > 0
)
print(len(keywords_definitions) , "with found definitions")
# Peek at one (keyword, definitions) pair.
next(iter(keywords_definitions.items()))

In [None]:
%%time
def definition_generator():
    """Yield ``(keyword, definitions)`` for every keyword of category "things".

    Keywords already present in ``keywords_definitions`` are served from that
    mapping without any network request. Every other keyword is looked up
    through ``url_api``; ``definitions`` is then the first value produced by
    the jq program on the JSON response, or ``[]`` when the lookup fails
    (request error, non-200 status, undecodable or empty response).

    Reads the module-level names ``keywords``, ``keywords_definitions`` and
    ``url_api``. Prints one line per failed lookup and a final summary; the
    printed fail rate is relative to the number of keywords processed so far,
    cached ones included.
    """
    # The jq program is loop-invariant, so compile it once.
    meanings_query = jq.compile('select(length > 0) []["meanings"]')
    fail = 0
    num = 0  # stays 0 when there is no "things" keyword, keeping the summary safe
    things = keywords.loc[keywords.category=="things","keyword"]
    # Counting from 1 makes `num` the number of keywords processed so far, so
    # `fail/num` can never divide by zero inside the loop.
    for num, keyword in enumerate(things, start=1):
        # Check the cache before touching the network: cached keywords cost
        # neither a request nor a throttling sleep.
        if keyword in keywords_definitions:
            yield keyword, keywords_definitions[keyword]
            continue
        definitions = []
        try:
            r = requests.get(url_api.format(word=keyword), timeout=30)
            if r.status_code == 200:
                definitions = meanings_query.input_value(r.json()).first()
            else:
                fail += 1
                print(f"Error {r.status_code=} {keyword = :30} failrate:{fail/num:.2%}")
        except (ValueError, StopIteration) as e:
            # ValueError covers JSON decoding errors (json.JSONDecodeError is a
            # ValueError subclass). StopIteration is caught because `.first()`
            # presumably raises it when the program yields nothing (empty
            # response); uncaught, it would become a RuntimeError in a generator.
            fail += 1
            definitions = []
            print(f"{type(e).__name__} {e} parsing response {keyword = } failrate:{fail/num:.2%}")
        except requests.RequestException as e:
            # Connection problems and timeouts are recorded like any other
            # failed lookup instead of aborting the whole run.
            fail += 1
            definitions = []
            print(f"{type(e).__name__} {e} requesting {keyword = } failrate:{fail/num:.2%}")
        yield keyword, definitions
        # Throttle after every real request, failed ones included, so the API
        # is not hammered after an error.
        time.sleep(1+random.random()/2)
    fail_rate = fail/num if num else 0.0
    print(f"{fail=} ({fail_rate:.2%})")

# Consume the generator into a keyword -> definitions mapping.
things_definitions = dict(definition_generator())
len(things_definitions)

# Result & Store

In [None]:
# Write the collected definitions to the notebook's working directory as JSON,
# then display them.
with open("things_definitions.json","w") as out_file:
    json.dump(things_definitions, out_file)
things_definitions