In [37]:
import json
import os
import time
from datetime import datetime, timedelta
from enum import Enum
from typing import Optional, Tuple, List

import duckdb
import pandas as pd
import requests as req


In [56]:
def get_jeopardy_game(month, day, year, max_retries=3, base_delay=1):
    """Download both rounds of a single game from the jarchive-json service.

    Rounds 1 and 2 are requested one after the other. A request that raises a
    ``requests`` exception is retried, sleeping ``base_delay * 2**attempt``
    seconds between tries, until ``max_retries`` attempts have been made.

    Args:
        month: Month of the game; zero-padded to two characters in the URL.
        day: Day of the game; zero-padded to two characters in the URL.
        year: Year of the game.
        max_retries: Number of attempts made per round.
        base_delay: Back-off in seconds before the first retry; doubled on
            every further failure.

    Returns:
        ``None`` as soon as a response carries a "does not have a game"
        message. Otherwise a ``(rounds, failed)`` tuple: ``rounds`` holds the
        decoded JSON of every round that was fetched, ``failed`` holds one
        ``YYYY-MM-DD`` string per round whose attempts were all exhausted.

    Raises:
        Exception: If not a single round could be fetched.
    """
    mm = str(month).zfill(2)
    dd = str(day).zfill(2)
    last_attempt = max_retries - 1
    rounds, failed = [], []

    for round_num in (1, 2):
        url = f"https://jarchive-json.glitch.me/glitch/{mm}/{dd}/{year}/{round_num}"

        for attempt in range(max_retries):
            try:
                response = req.get(url, timeout=10)
                response.raise_for_status()
                payload = response.json()
            except req.exceptions.RequestException:
                delay = base_delay * (2 ** attempt)
                if attempt >= last_attempt:
                    # Out of attempts for this round: remember the date and move on.
                    failed.append(f"{year}-{mm}-{dd}")
                    continue
                print(f"Attempt {attempt + 1} failed for round {round_num}. Retrying in {delay} seconds...")
                time.sleep(delay)
                continue

            # The service answers with a {"message": ...} object when the date has no game.
            no_game = (
                isinstance(payload, dict)
                and "message" in payload
                and "does not have a game" in payload["message"]
            )
            if no_game:
                return None

            rounds.append(payload)
            break

    if rounds:
        return rounds, failed

    raise Exception("Failed to fetch any game data")

In [47]:
def store_jeopardy_game(month, day, year):
    """Fetch one game and persist its clues in the ``jeopardy_clues`` table.

    Args:
        month: Month of the game.
        day: Day of the game.
        year: Year of the game.

    Returns:
        Number of clue rows prepared for insertion (rows that already exist
        are skipped by ``INSERT OR IGNORE`` but are still counted). ``0`` when
        the archive has no game for the date or the game contains no clues.

    Raises:
        RuntimeError: If at least one round could not be downloaded. Rounds
            are numbered by their position in the fetched list, so storing a
            partial game could label a round with the wrong number.
        Exception: Propagated from ``get_jeopardy_game`` when nothing at all
            could be fetched.
    """
    fetched = get_jeopardy_game(month, day, year)

    # get_jeopardy_game returns None when the archive reports no game for the date.
    if fetched is None:
        return 0

    # Otherwise it returns (rounds, failed_dates); a bare list of rounds is tolerated too.
    if isinstance(fetched, tuple):
        game_data, failed_dates = fetched
    else:
        game_data, failed_dates = fetched, []

    game_date = datetime(year, month, day).date()  # DATE column: keep the date part only

    if failed_dates:
        raise RuntimeError(
            f"Incomplete game data for {game_date}: at least one round could not be fetched"
        )

    # Flatten {category: [clue, ...]} per round into one record per clue.
    flattened_data = [
        {
            'game_date': game_date,
            'round': round_num,
            'category': category,
            'clue': clue['clue'],
            'answer': clue['answer'],
        }
        for round_num, round_data in enumerate(game_data, 1)
        for category, clues in round_data.items()
        for clue in clues
    ]

    # An empty DataFrame has no columns, so there is nothing to select from it.
    if not flattened_data:
        return 0

    # DuckDB resolves the name `df` in the SQL below from this local variable.
    df = pd.DataFrame(flattened_data)

    conn = duckdb.connect('jeopardy.db')
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS jeopardy_clues (
                game_date DATE,
                round INTEGER,
                category VARCHAR,
                clue VARCHAR,
                answer VARCHAR,
                main_category VARCHAR,
                suggested_category VARCHAR,
                PRIMARY KEY (game_date, round, category, clue)
            )
        """)

        # Name the target columns explicitly: the table has two extra
        # categorisation columns that the DataFrame does not provide.
        conn.execute("""
            INSERT OR IGNORE INTO jeopardy_clues (game_date, round, category, clue, answer)
            SELECT game_date, round, category, clue, answer FROM df
        """)
        conn.commit()
    finally:
        # Always release the database file, even when a statement fails.
        conn.close()

    return len(flattened_data)

In [51]:
def store_jeopardy_games_range(start_date, end_date):
    """Store every game between two datetimes, one calendar day at a time.

    Each day from ``start_date`` up to and including ``end_date`` is handed to
    ``store_jeopardy_game``. A day that raises is reported on stdout and
    skipped, so one bad date does not abort the whole range.

    Args:
        start_date: First day to process (``datetime``).
        end_date: Last day to process, inclusive (``datetime``).

    Returns:
        Sum of the clue counts reported for the days that succeeded.
    """
    def _days():
        # Yield start_date, start_date + 1 day, ... while still <= end_date.
        cursor = start_date
        while cursor <= end_date:
            yield cursor
            cursor += timedelta(days=1)

    grand_total = 0

    for day in _days():
        try:
            stored = store_jeopardy_game(day.month, day.day, day.year)
            print(f"Stored {stored} clues for {day.date()}")
            grand_total += stored
        except Exception as err:
            print(f"Failed to fetch/store game for {day.date()}: {str(err)}")

    return grand_total


Stored 60 clues for 2023-10-23
Stored 60 clues for 2023-10-24
Stored 60 clues for 2023-10-25
Stored total of 180 clues


In [None]:
# Example usage: fetch and store every game from 2023-10-23 through 2023-10-25
# (the end date is included in the range).
start = datetime(2023, 10, 23)
end = datetime(2023, 10, 25)
# The return value is the sum of the per-day clue counts.
total = store_jeopardy_games_range(start, end)
print(f"Stored total of {total} clues")

In [50]:
# Query example: read back up to five stored clues for a single game date.
conn = duckdb.connect('jeopardy.db')
result = conn.execute("""
    SELECT * 
    FROM jeopardy_clues 
    WHERE game_date = DATE '2023-10-25'
    LIMIT 5
""").fetchall()
# Close the connection before displaying the fetched rows.
conn.close()
result

[(datetime.date(2023, 10, 25),
  1,
  'PRODUCE',
  'Big on vitamins A & C, this leafy vegetable also has a lot of fiber & acts as a mild laxative; look out, sailor man!',
  'spinach',
  None,
  None),
 (datetime.date(2023, 10, 25),
  1,
  'PRODUCE',
  'In 2008 a pair of Brits were out of their gourd, growing a 1,457-lb. one of these; smashing!',
  'a pumpkin',
  None,
  None),
 (datetime.date(2023, 10, 25),
  1,
  'PRODUCE',
  'Dr. Thomas Welch is credited with making the first unfermented juice from this fruit',
  'a grape',
  None,
  None),
 (datetime.date(2023, 10, 25),
  1,
  'PRODUCE',
  'When life gives you the Meyer type of this tree, expect plenty of fruit year-round',
  'a lemon',
  None,
  None),
 (datetime.date(2023, 10, 25),
  1,
  'PRODUCE',
  'The name of this cylindrical vegetable means "little squashes" in Italian',
  'zucchini',
  None,
  None)]

In [38]:
class Category(Enum):
    """Fixed set of top-level subject categories a clue can be assigned to.

    The member values are the strings offered to the LLM in the
    categorization prompt and the strings written to the ``main_category``
    column once a returned label has been validated against this enum.
    """
    GEOGRAPHY = "geography"
    HISTORY = "history"
    LITERATURE = "literature"
    SCIENCE = "science"
    ARTS = "arts"
    ENTERTAINMENT = "entertainment"
    SPORTS = "sports"
    FOOD_DRINK = "food_and_drink"
    LANGUAGE = "language"
    RELIGION_MYTHOLOGY = "religion_and_mythology"
    # Add more categories as needed


In [40]:
def _parse_categorizations(raw_text):
    """Return the 'categorizations' list found in an LLM reply, or None if there is none."""
    if not isinstance(raw_text, str):
        return None

    candidates = [raw_text]
    # Replies may be wrapped in Markdown code fences or preceded by a sentence
    # despite the instructions, so also try the outermost {...} span.
    start, end = raw_text.find('{'), raw_text.rfind('}')
    if 0 <= start < end:
        candidates.append(raw_text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict) and isinstance(parsed.get('categorizations'), list):
            return parsed['categorizations']
    return None


def categorize_clues_batch(clues: List[Tuple], llm_client) -> List[dict]:
    """
    Categorize multiple clues in a single API call

    Args:
        clues: List of (game_date, round, category, clue, answer) tuples
        llm_client: Gemini client; only ``generate_content(prompt)`` and the
            ``.text`` attribute of its result are used

    Returns:
        The list stored under "categorizations" in the model's JSON reply
        (each item is expected to carry "id", "category", "confidence" and
        "reasoning"; "id" is the index of the clue in ``clues``). An empty
        list when ``clues`` is empty or the reply cannot be parsed.
    """
    # Nothing to categorize: avoid a pointless API call.
    if not clues:
        return []

    category_values = [c.value for c in Category]
    # Only the category name, clue text and answer are sent; "id" is the
    # position in `clues` so results can be matched back to their rows.
    clues_json = json.dumps([{
        'id': i,
        'category': c[2],
        'clue': c[3],
        'answer': c[4]
    } for i, c in enumerate(clues)], indent=2)

    prompt = f"""
    Think carefully about this task and ensure that your output adheres to the format specified, and includes
    no other text but the JSON.
    
    Categorize each Jeopardy clue into one of the following categories:
    {category_values}
    
    If none of these categories fit well, suggest a new category prefixed with 'NEW:'.
    
    For each clue, analyze the category name, clue text, and answer to make your determination.
    
    Clues to categorize:
    {clues_json}
    
    Return your response in JSON format with no other text:
    {{
        "categorizations": [
            {{
                "id": 0,
                "category": "existing_category or NEW:suggested_category",
                "confidence": 0-1,
                "reasoning": "brief explanation"
            }},
            ...
        ]
    }}
    """

    response = llm_client.generate_content(prompt)
    raw_text = response.text

    categorizations = _parse_categorizations(raw_text)
    if categorizations is None:
        print(f"Failed to parse response: {raw_text}")
        return []
    return categorizations

def _split_categorization(cat, min_confidence=0.7):
    """Turn one LLM result into (main_category, suggested_category); (None, None) if unusable."""
    try:
        confidence = float(cat['confidence'])
        label = str(cat['category']).strip()
    except (KeyError, TypeError, ValueError):
        return None, None

    if confidence < min_confidence or not label:
        return None, None

    if label.startswith('NEW:'):
        # Drop the 'NEW:' marker; an empty remainder carries no information.
        return None, (label[4:].strip() or None)

    try:
        return Category(label).value, None
    except ValueError:
        # Not one of the known values: keep it as a suggestion instead.
        return None, label


def process_uncategorized_clues_batch(llmclient,batch_size=50):
    """Categorize up to ``batch_size`` not-yet-processed clues and save the results.

    Clues with neither a ``main_category`` nor a ``suggested_category`` are
    read from ``jeopardy.db``, sent to ``categorize_clues_batch`` and updated
    with the returned label. Results with a confidence below 0.7, a missing
    field, or an ``id`` that does not point into the batch are skipped; those
    clues stay uncategorized and will be selected again by a later call.

    Args:
        llmclient: Client object forwarded to ``categorize_clues_batch``.
        batch_size: Maximum number of clues to read in this call.

    Returns:
        Number of clues that were read for this batch (0 when none are left),
        regardless of how many of them were actually updated.
    """
    conn = duckdb.connect('jeopardy.db')
    try:
        # Rows that already hold a suggested category are excluded as well;
        # otherwise they would be sent to the model again on every call.
        clues = conn.execute("""
            SELECT game_date, round, category, clue, answer
            FROM jeopardy_clues 
            WHERE main_category IS NULL 
              AND suggested_category IS NULL
            LIMIT ?
        """, [batch_size]).fetchall()

        if not clues:
            return 0

        categorizations = categorize_clues_batch(clues, llmclient)

        for cat in categorizations:
            # The id comes from the model, so validate it before indexing.
            idx = cat.get('id') if isinstance(cat, dict) else None
            if not isinstance(idx, int) or isinstance(idx, bool):
                continue
            if not 0 <= idx < len(clues):
                continue

            main_cat, suggested_cat = _split_categorization(cat)
            if main_cat is None and suggested_cat is None:
                # Nothing to store; skipping also avoids wiping a value
                # written earlier in this batch for the same clue.
                continue

            clue = clues[idx]
            # The first four selected columns form the table's primary key.
            conn.execute("""
                UPDATE jeopardy_clues 
                SET main_category = ?, suggested_category = ?
                WHERE game_date = ? AND round = ? AND category = ? AND clue = ?
            """, [
                main_cat,
                suggested_cat,
                clue[0], clue[1], clue[2], clue[3]
            ])

        conn.commit()
    finally:
        # Release the database file even if the LLM call or an UPDATE fails.
        conn.close()

    return len(clues)

In [51]:
# Categorize one batch of stored clues. `model` is the Gemini handle created in
# the model-setup cell, which has to be run before this one.
process_uncategorized_clues_batch(model)

NotImplementedException: Not implemented Error: Unable to transform python value of type '<class 'google.generativeai.generative_models.GenerativeModel'>' to DuckDB LogicalType

In [4]:
import google.generativeai as genai


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read the Gemini API key from the environment so the secret is never saved
# inside the notebook. Falls back to an empty string when GOOGLE_API_KEY is unset.
api_key = os.environ.get('GOOGLE_API_KEY', '')
genai.configure(api_key=api_key)


AI, or Artificial Intelligence, doesn't work in one single way.  It's a broad field encompassing many different techniques and approaches. However, at a high level, most AI systems rely on these core principles:

**1. Data:** AI systems are fundamentally driven by data.  The more relevant and high-quality data they are trained on, the better they perform. This data can be anything from images and text to sensor readings and financial transactions.

**2. Algorithms:** These are sets of rules and statistical techniques that the AI uses to process and analyze the data.  Different algorithms are suited for different tasks.  Some common types include:

* **Machine Learning (ML):** This is a subset of AI where systems learn from data without being explicitly programmed.  Instead of following hard-coded rules, they identify patterns and relationships within the data to make predictions or decisions.  There are various types of ML, including:
    * **Supervised Learning:** The algorithm learns

In [6]:
# Gemini model handle; passed as the LLM client to the categorization calls in this notebook.
model = genai.GenerativeModel("gemini-1.5-flash")


In [8]:
# Sample input for a single-clue categorization (the first clue returned by the query example).
category = 'PRODUCE'
clue = 'Big on vitamins A & C, this leafy vegetable also has a lot of fiber & acts as a mild laxative; look out, sailor man!'
answer = 'spinach'

In [32]:
# Alternative sample input; running this cell overwrites `category`, `clue` and `answer`.
category = 'SHE BLINDED ME WITH SCIENCE'
clue = 'Called the first female M.D. in modern times, she helped found the London School of Medicine for Women in 1875'
answer = 'Dr. Elizabeth Blackwell'

In [34]:
# Alternative sample input; running this cell overwrites `category`, `clue` and `answer`.
category = 'ALSO ON THE MONOPOLY BOARD'
clue = 'Alliterative slang for crying'
answer = 'the waterworks'

In [35]:
# NOTE(review): `categorize_clue` is not defined in any cell visible here — confirm
# it still exists before re-running this notebook from a fresh kernel.
# The result is bound to `category`, replacing the sample category string set above.
category = categorize_clue(clue, category, answer, model)
category


{
  "category": "language",
  "confidence": 0.8,
  "reasoning": "The clue focuses on slang, which is a linguistic concept."
}



(<Category.LANGUAGE: 'language'>, None)