# 1. Set Up the Kaggle API and Download the LLM 20 Questions Dataset

In [None]:
!pip install --upgrade kaggle

In [1]:
import os
import json
import shutil
import zipfile
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# file upload로 가져온 json 파일 위치
dataset_path = '/kaggle/input/kaggle-json/kaggle.json'

# kaggle 디렉토리 생성
os.makedirs('/root/.kaggle', exist_ok=True)

# json 파일 이동
shutil.copy(dataset_path, '/root/.kaggle/kaggle.json')

# 권한 설정
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Kaggle API 인증 테스트
!kaggle datasets list

ref                                                    title                                        size  lastUpdated          downloadCount  voteCount  usabilityRating  
-----------------------------------------------------  ------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
waqi786/cats-dataset                                   🐾 Cats Dataset                                6KB  2024-07-31 06:12:49           1137         21  1.0              
patricklford/global-ev-sales-2010-2024                 Global EV Sales: 2010-2024                   83KB  2024-07-19 13:40:22           1986         29  1.0              
rabieelkharoua/students-performance-dataset            📚 Students Performance Dataset 📚             66KB  2024-06-12 23:09:20          26072        520  1.0              
ihelon/coffee-sales                                    Coffee Sales                                 13KB  2024-08-01 07:55:34           8986     

In [3]:
!kaggle competitions download -c llm-20-questions

Downloading llm-20-questions.zip to /kaggle/working
  0%|                                               | 0.00/10.9k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 10.9k/10.9k [00:00<00:00, 20.8MB/s]


In [4]:
# Path of the downloaded archive and the directory it is unpacked into.
zip_file_path = '/kaggle/working/llm-20-questions.zip'
extract_path = '/kaggle/working/'

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(extract_path)

# Walk the extraction directory recursively and print the path of every file.
for folder, _subdirs, filenames in os.walk(extract_path):
    for filename in filenames:
        print(os.path.join(folder, filename))

/kaggle/working/llm-20-questions.zip
/kaggle/working/llm_20_questions/llm_20_questions.py
/kaggle/working/llm_20_questions/llm_20_questions.json
/kaggle/working/llm_20_questions/keywords.py
/kaggle/working/llm_20_questions/llm_20_questions.js


# 2. EDA

## 1. keywords.py file

In [6]:
# Print the archive's table of contents (name, modification time, size)
# without extracting anything.
with zipfile.ZipFile('/kaggle/working/llm-20-questions.zip', 'r') as archive:
    archive.printdir()

File Name                                             Modified             Size
llm_20_questions/keywords.py                   2024-05-14 21:09:58        45282
llm_20_questions/llm_20_questions.js           2024-05-14 21:09:58         5923
llm_20_questions/llm_20_questions.json         2024-05-14 21:09:58         2101
llm_20_questions/llm_20_questions.py           2024-05-14 21:09:58         9073


In [9]:
# Directory that holds the files extracted from the competition archive.
keywords_path = '/kaggle/working/llm_20_questions'

In [10]:
# Peek at the beginning of keywords.py: keep only the first 300 characters.
with open('/kaggle/working/llm_20_questions/keywords.py', 'r') as file:
    keywords_preview = file.read()[:300]

print(keywords_preview)

"""List of keywords for 20 Questions."""

KEYWORDS_JSON = """
[
  {
    "category": "country",
    "words": [
      {
        "keyword": "afghanistan",
        "alts": []
      },
      {
        "keyword": "albania",
        "alts": []
      },
      {
        "keyword": "algeria",
        "alts": 


In [11]:
# Goal: find which categories exist, how many there are, and how many keywords
# each one contains.
# keywords.py keeps its data as JSON text inside a triple-quoted string assigned
# to KEYWORDS_JSON, so read the source text and pull that literal out.
# Python source files are UTF-8 by default, so decode explicitly instead of
# relying on the platform's locale encoding.
with open('/kaggle/working/llm_20_questions/keywords.py', 'r', encoding='utf-8') as file:
    content = file.read()

# Group 1 is the opening delimiter (""" or '''); group 2 is the text up to the
# first matching closing delimiter. The \1 backreference keeps the quote styles
# paired, so a ''' inside a """-delimited string cannot end the match early.
json_match = re.search(r'KEYWORDS_JSON\s*=\s*("""|\'\'\')([\s\S]*?)\1', content)

# Assign json_str on every run. Testing for the name in locals() would silently
# reuse a value left over from an earlier execution when the match fails.
json_str = json_match.group(2) if json_match else None

if json_str is not None:
    # json.loads skips whitespace between tokens by itself, so the text is parsed
    # untouched; removing newlines/double spaces by hand could change spaces that
    # are part of the string values.
    keywords_data = json.loads(json_str)

    # Number of categories.
    category_count = len(keywords_data)
    print(f"Number of categories: {category_count}")

    # (category name, number of keywords) for every category.
    category_info = [
        (category['category'], len(category['words']))
        for category in keywords_data
    ]

    # Summarise as a DataFrame and show it.
    df = pd.DataFrame(category_info, columns=['Category', 'Number of keywords'])
    print(df)
else:
    print("JSON string could not be found.")

Number of categories: 3
   Category  Number of keywords
0   country                 190
1      city                 324
2  landmark                  49


In [12]:
# File to inspect: the competition's JavaScript file.
file_path = '/kaggle/working/llm_20_questions/llm_20_questions.js'

# Load the whole file as text.
with open(file_path, 'r') as file:
    content = file.read()

# Its length in characters is the figure reported below.
char_count = len(content)
print(f"The file '{file_path}' has {char_count} characters.")

The file '/kaggle/working/llm_20_questions/llm_20_questions.js' has 5923 characters.


In [13]:
# Print the complete contents of the JavaScript file.
with open('/kaggle/working/llm_20_questions/llm_20_questions.js', 'r') as file:
    js_source = file.read()

print(js_source)

async function renderer(context) {
    const {
        act,
        agents,
        environment,
        frame,
        height = 800,
        interactive,
        isInteractive,
        parent,
        step,
        update,
        width = 1200,
    } = context;

    // Common Dimensions.
    const maxWidth = 1200;
    const maxHeight = 800;
    const canvasSize = Math.min(height, width);
    const unit = 8;
    const offset = canvasSize > 400 ? canvasSize * 0.1 : unit / 2;
    const cellSize = (canvasSize - offset * 2) / 3;

    // Canvas Setup.
    let canvas = parent.querySelector("canvas");
    if (!canvas) {
        canvas = document.createElement("canvas");
        parent.appendChild(canvas);

        if (interactive) {
            canvas.addEventListener("click", evt => {
                if (!isInteractive()) return;
                const rect = evt.target.getBoundingClientRect();
                const x = evt.clientX - rect.left - offset;
                const y = evt.clientY -

In [14]:
# File to inspect: the competition's JSON file.
file_path = '/kaggle/working/llm_20_questions/llm_20_questions.json'

# Load the whole file as text.
with open(file_path, 'r') as file:
    content = file.read()

# Its length in characters is the figure reported below.
char_count = len(content)
print(f"The file '{file_path}' has {char_count} characters.")

The file '/kaggle/working/llm_20_questions/llm_20_questions.json' has 2101 characters.


In [15]:
# Print the complete contents of the JSON file.
with open('/kaggle/working/llm_20_questions/llm_20_questions.json', 'r') as file:
    json_source = file.read()

print(json_source)

{
    "name": "llm_20_questions",
    "title": "20 Questions",
    "description": "20 Questions played between two LLM agents",
    "version": "1.0.0",
    "agents": [4],
    "configuration": {
      "episodeSteps": 61,
      "actTimeout": 60,
      "runTimeout": 9600,
      "agentTimeout": {
        "description": "Obsolete field kept for backwards compatibility, please use observation.remainingOverageTime.",
        "type": "number",
        "minimum": 0,
        "default": 3600
      }
    },
    "reward": {
      "description": "1-20 = Won, -1 = Lost",
      "enum": [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
      "defaults": [0, 0, 0, 0]
    },
    "observation": {
      "questions": {
        "description": "Questions the guessing agent has asked.",
        "type": "array",
        "default": []
      },
      "guesses": {
        "description": "Guesses the guessing agent has made.",
        "type": "array",
        "default": []
      },
   

In [None]:
# File to inspect: the competition's Python file.
file_path = '/kaggle/working/llm_20_questions/llm_20_questions.py'

# Load the whole file as text.
with open(file_path, 'r') as file:
    content = file.read()

# Its length in characters is the figure reported below.
char_count = len(content)
print(f"The file '{file_path}' has {char_count} characters.")

In [None]:
# Peek at the beginning of llm_20_questions.py: keep only the first 500 characters.
with open('/kaggle/working/llm_20_questions/llm_20_questions.py', 'r') as file:
    py_preview = file.read()[:500]

print(py_preview)