In [None]:
%%bash

# Fetch the keyword module and save it as keywords.py in the current directory; a later cell imports KEYWORDS_JSON from it.
wget -O keywords.py https://raw.githubusercontent.com/Kaggle/kaggle-environments/master/kaggle_environments/envs/llm_20_questions/keywords.py

In [4]:
import json
import pandas as pd
import numpy as np


from keywords import KEYWORDS_JSON

def create_keyword_df(KEYWORDS_JSON):
    """Flatten the keyword JSON string into one DataFrame row per keyword.

    Args:
        KEYWORDS_JSON: JSON text holding a list of groups, each with a
            'category' and a 'words' list whose entries carry 'keyword'
            and 'alts'.

    Returns:
        A DataFrame with the columns 'keyword', 'category' and 'alts', in the
        order the words appear in the JSON.
    """
    collected = {'keyword': [], 'category': [], 'alts': []}

    for group in json.loads(KEYWORDS_JSON):
        for word in group['words']:
            collected['keyword'].append(word['keyword'])
            collected['category'].append(group['category'])
            collected['alts'].append(word['alts'])

    # Start from an empty frame with the fixed column order, then fill each
    # column from its collected values.
    frame = pd.DataFrame(columns=list(collected))
    for column_name, values in collected.items():
        frame[column_name] = values

    return frame


In [5]:
keyword_df = create_keyword_df(KEYWORDS_JSON)

# Pull the keyword column for each category with a single .loc selection.
is_place = keyword_df['category'] == 'place'
is_thing = keyword_df['category'] == 'things'
place_list = keyword_df.loc[is_place, 'keyword'].to_list()
things_list = keyword_df.loc[is_thing, 'keyword'].to_list()

# Quick look at the "things" keywords: first five entries and the total count.
print(things_list[:5])
print(len(things_list))

['Advertisement', 'Agave', 'Air compressor', 'Air Conditioner', 'Air filter']
579


In [6]:
# Yes/no questions asked about every keyword in the "things" category.
things_questions = [
    "Is the thing related to food or drink in any way?",
    "Would the keyword be included in the broad category of Machines?",
    "Is it tangible?",
    "Is it water-based?",
    "is it a living thing?",
    "Would the keyword be considered a Home appliance?",
]

In [7]:
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm


class KeywordClassifier:
    """Answer yes/no questions about keywords with an OpenAI chat model.

    The answers live in ``self.df``: one row per keyword (the index) and one
    column per question. The frame is written to ``self.csv_path`` each time a
    question has been answered for every keyword.
    """

    def __init__(self, keywords: list, csv_path: str = None):
        """Set up the client and either load saved answers or start fresh.

        Args:
            keywords: Keywords used as the row index when no CSV is given.
            csv_path: Path of a CSV previously written by ``update``. When
                given, the frame is loaded from it; otherwise an empty frame
                indexed by ``keywords`` is created and results are saved to
                "mapped_keywords.csv".
        """
        load_dotenv()
        self.client = OpenAI()
        self.keywords = keywords
        self.csv_path = csv_path
        if self.csv_path:
            # update() saves the keywords as the CSV's first column (the frame
            # index). Restore them as the index; otherwise the rows would be
            # numbered 0..n-1 and those integers would be sent to the model.
            self.df = pd.read_csv(csv_path, index_col=0)
        else:
            self.df = pd.DataFrame(index=keywords)
            self.csv_path = "mapped_keywords.csv"

        print("Keyword Classifier Initialized")
        self.preview()

    def _classify(self, question, keyword):
        """Ask the model one question about one keyword.

        Returns:
            "yes" if the lower-cased reply contains "yes", otherwise "no".
        """
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Your goal is to classify the keyword to the question. Answer only with 'yes' or 'no'.",
                },
                {"role": "user", "content": f"Keyword: {keyword} \nQuestion: {question}"},
            ],
        )
        # NOTE(review): the reply content may be None (e.g. a refusal) - confirm
        # against the client docs. Treat it as an empty reply, which maps to "no".
        message = (response.choices[0].message.content or "").lower()

        return "yes" if "yes" in message else "no"

    def preview(self):
        """Print the first three rows of the answer frame."""
        print("Printing first 3 rows of the dataframe")
        print(self.df.head(3))

    def update(self, questions: list):
        """Answer every question that is not already a column, saving after each.

        Args:
            questions: Question strings; each new one becomes a column of
                "yes"/"no" answers, one per keyword in the index.
        """
        added_question_count = 0
        for question in tqdm(questions, desc="Processing questions"):
            if question in self.df.columns:
                continue

            added_question_count += 1

            # Collect the whole column before attaching it. If a call fails
            # part-way, no half-filled column is left behind to be mistaken
            # for a finished question on the next run.
            answers = [
                self._classify(question, keyword)
                for keyword in tqdm(self.df.index, desc="Classifying keywords")
            ]
            self.df[question] = answers

            self.df.to_csv(self.csv_path)

        print(f"Added {added_question_count} questions")
        print(f"Saved to {self.csv_path}")
        

In [8]:
# Number of "things" keywords that will be classified.
print(len(things_list))

579


In [9]:
# No csv_path is passed, so the classifier starts from an empty frame indexed
# by the keywords and saves to its default "mapped_keywords.csv".
classifier = KeywordClassifier(keywords=things_list)

# Answer every question for every keyword, then print the first rows.
classifier.update(things_questions)
classifier.preview()

Keyword Classifier Initialized
Printing first 3 rows of the dataframe
Empty DataFrame
Columns: []
Index: [Advertisement, Agave, Air compressor]


Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 2394972.40it/s]
Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 4321177.96it/s]
Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 4752450.13it/s]
Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 3308585.85it/s]
Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 4283072.34it/s]
Classifying keywords: 100%|██████████| 579/579 [00:00<00:00, 2894519.69it/s]
Processing questions: 100%|██████████| 6/6 [00:00<00:00, 237.90it/s]

Added 6 questions
Saved to mapped_keywords.csv
Printing first 3 rows of the dataframe
Empty DataFrame
Columns: []
Index: [Advertisement, Agave, Air compressor]





In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Load the saved answers; the first CSV column becomes the row index.
df = pd.read_csv('mapped_keywords.csv', index_col=0)

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [None]:
# Share of 'yes' answers in each column, as a percentage of that column's
# counted values (value_counts leaves out missing values by default).
percentages = {
    column: df[column].value_counts(normalize=True).get('yes', 0) * 100
    for column in df.columns
}

# Plotting: two panels per row and as many rows as the columns need, so no
# column is silently dropped when there are more than six.
n_cols = 2
n_rows = max(1, -(-len(percentages) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(12, 4 * n_rows),  # 4 inches per row; (12, 12) for six columns as before
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
axes = axes.flatten()  # Flatten the array to make iteration easier

for ax, (column, percentage) in zip(axes, percentages.items()):
    ax.bar(['Yes', 'No'], [percentage, 100-percentage], color=['blue', 'red'])
    ax.set_title(f'Percentage of Yes in {column}')
    ax.set_ylabel('Percentage')
    ax.set_ylim(0, 100)  # Set y-axis limits to 0-100 for percentage view
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# Hide any leftover panels that have no column to show.
for ax in axes[len(percentages):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()