In [None]:
%%bash

# Download the competition keyword module as keywords.py so that the next cell can run
# `from keywords import KEYWORDS_JSON`. The URL tracks the `master` branch, so the
# downloaded contents can change between runs.
wget -O keywords.py https://raw.githubusercontent.com/Kaggle/kaggle-environments/master/kaggle_environments/envs/llm_20_questions/keywords.py

In [1]:
import json
import pandas as pd
import numpy as np


from keywords import KEYWORDS_JSON

def create_keyword_df(KEYWORDS_JSON):
    """Flatten the keyword JSON into a table with one row per keyword.

    Args:
        KEYWORDS_JSON: JSON text encoding a list of category objects. Each
            object is read for its 'category' value and its 'words' list,
            whose entries are read for 'keyword' and 'alts'.

    Returns:
        A DataFrame with the columns 'keyword', 'category' and 'alts', with
        rows in the order the words appear in the JSON.
    """
    groups = json.loads(KEYWORDS_JSON)

    # Pair every word entry with the category of the group that contains it.
    flattened = [
        (group['category'], entry)
        for group in groups
        for entry in group['words']
    ]

    column_values = {
        'keyword': [entry['keyword'] for _, entry in flattened],
        'category': [category for category, _ in flattened],
        'alts': [entry['alts'] for _, entry in flattened],
    }

    # Start from an empty frame with the fixed column order, then fill each column.
    table = pd.DataFrame(columns=['keyword', 'category', 'alts'])
    for column_name, values in column_values.items():
        table[column_name] = values

    return table


In [2]:
# Flatten the downloaded keyword JSON into one row per keyword.
keyword_df = create_keyword_df(KEYWORDS_JSON)

# Pull out the keyword strings of each category as plain Python lists.
place_list = keyword_df.loc[keyword_df['category'] == 'place', 'keyword'].tolist()
things_list = keyword_df.loc[keyword_df['category'] == 'thing', 'keyword'].tolist()

# Quick look at the first few place keywords.
print(place_list[:5])

['afghanistan', 'albania', 'algeria', 'andorra', 'angola']


In [3]:
# Yes/no questions for keywords in the 'place' category. Each string is sent to the
# model verbatim and is also used as the column name in the classifier's table, so
# editing a question's text makes it count as a new, unanswered question.
place_questions = [
    "is it a country?",
    "is it a city?",
    "is it a natural feature?",
]

# Yes/no questions presumably meant for the 'thing' keywords (things_list); they are
# not passed to a classifier in the cells visible here.
things_questions = [
    "is it a living thing?",   
    "is it edible?",           
    "is it something that can be held in your hand?",
    "Does it require electricity to operate?",
    # Disabled template: "[Group]" is a placeholder that has to be filled in before use.
    # "Would the keyword be included in the broad category of [Group]?",
]


In [4]:
from openai import OpenAI
import pandas as pd


class KeywordClassifier:
    """Build and persist a keyword-by-question table of yes/no answers from a chat model.

    ``self.df`` is indexed by keyword and gets one column per question passed
    to :meth:`update`; the cells written by :meth:`update` hold the string
    "yes" or "no".

    Args:
        keywords: Keywords to classify. They become the index of a new, empty
            table when ``csv_path`` is not given. When ``csv_path`` is given,
            the rows come from the CSV and ``keywords`` is only stored on the
            instance.
        csv_path: Path of a CSV previously written by :meth:`update`. When
            omitted (or empty), the table starts with no columns and is saved
            to "mapped_keywords.csv".
    """

    def __init__(self, keywords: list, csv_path: str = None):
        self.client = OpenAI()
        self.keywords = keywords
        self.csv_path = csv_path
        if self.csv_path:
            # `update` writes the keywords as the CSV's first (index) column.
            # Read that column back as the index; otherwise the keywords end
            # up in an "Unnamed: 0" data column and `update` would classify
            # the row numbers 0..n-1 instead of the keywords.
            self.df = pd.read_csv(csv_path, index_col=0)
        else:
            self.df = pd.DataFrame(index=keywords)
            self.csv_path = "mapped_keywords.csv"

        print("Keyword Classifier Initialized")
        self.preview()

    def _classify(self, question, keyword):
        """Ask the chat model whether ``question`` holds for ``keyword``.

        Args:
            question: The yes/no question text.
            keyword: The keyword the question is asked about.

        Returns:
            "yes" if the lower-cased reply contains the substring "yes",
            otherwise "no" (including when the reply carries no text).
        """
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Your goal is to classify the keyword to the question. Answer only with 'yes' or 'no'.",
                },
                {"role": "user", "content": f"Keyword: {keyword} \nQuestion: {question}"},
            ],
        )
        # The message content is optional in the API response; treat a missing
        # reply as "no" instead of failing on `None.lower()`.
        content = response.choices[0].message.content or ""
        message = content.lower()

        if "yes" in message:
            return "yes"
        return "no"

    def preview(self):
        """Print the first three rows of the answer table."""
        print("Printing first 3 rows of the dataframe")
        print(self.df.head(3))

    def update(self, questions: list):
        """Answer every not-yet-present question for every keyword, then save the table.

        Questions whose text already exists as a column are skipped. The table
        is written to ``self.csv_path`` once, after all questions are handled.

        Args:
            questions: Question strings; each new one becomes a column.
        """
        added_question_count = 0
        for question in questions:
            if question in self.df.columns:
                continue

            # Collect the whole column before attaching it. If a model call
            # fails part-way, no half-filled column is left behind that a
            # retry would skip as "already answered".
            answers = [self._classify(question, keyword) for keyword in self.df.index]
            self.df[question] = answers
            added_question_count += 1

        print(f"Added {added_question_count} questions")
        self.df.to_csv(self.csv_path)
        print(f"Saved to {self.csv_path}")
        

In [5]:
# Keep only the first three places, presumably to keep this trial run to a handful of model calls.
# NOTE(review): this overwrites the full `place_list` built in the earlier cell; re-run
# that cell to get the complete list back.
place_list = place_list[:3]
# Split the questions into two batches so `update` is called twice on the same classifier.
place_questions_first = place_questions[:2]
place_questions_second = place_questions[2:]

# No csv_path is passed, so the table starts without columns and is saved to "mapped_keywords.csv".
classifier = KeywordClassifier(keywords=place_list)

# First batch: adds one column per question and writes the CSV.
classifier.update(place_questions_first)
classifier.preview()

# Second batch: questions already present as columns would be skipped; the new one is appended.
classifier.update(place_questions_second)
classifier.preview()

Keyword Classifier Initialized
Printing first 3 rows of the dataframe
Empty DataFrame
Columns: []
Index: [afghanistan, albania, algeria]
Added 2 questions
Saved to mapped_keywords.csv
Printing first 3 rows of the dataframe
            is it a country? is it a city?
afghanistan              yes            no
albania                  yes            no
algeria                  yes            no
Added 1 questions
Saved to mapped_keywords.csv
Printing first 3 rows of the dataframe
            is it a country? is it a city? is it a natural feature?
afghanistan              yes            no                       no
albania                  yes            no                       no
algeria                  yes            no                       no
