In [None]:
# ===============================================================
#  KAGGLE SETUP & IMPORTS
# ===============================================================
!pip install -q google-generativeai

import pandas as pd
import os
import time
from kaggle_secrets import UserSecretsClient # To get our API key securely

import google.generativeai as genai
# Import the specific types for safety settings
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# For our dynamic few-shot selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Load API Key from Kaggle Secrets ---
try:
    # Read the key from the notebook's secret store and register it with the SDK.
    user_secrets = UserSecretsClient()
    secret_value = user_secrets.get_secret("GOOGLE_API_KEY")
    genai.configure(api_key=secret_value)
    print("Google API key loaded successfully.")

    # One BLOCK_NONE entry per harm category; passed to every generate_content call.
    safety_settings_config = [
        {"category": harm_category, "threshold": HarmBlockThreshold.BLOCK_NONE}
        for harm_category in (
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ]

    # Model handle shared by both prediction steps.
    gemini_model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
except Exception as e:
    print("Could not load Google API key or configure model. Make sure it's stored as a secret named 'GOOGLE_API_KEY'.")
    print(f"Error: {e}")
    # Setup failed: leave the model unset. The main execution cell tests
    # `gemini_model` and prints a "halted" message instead of running.
    gemini_model = None


# --- Load Data ---
# Directory holding the competition CSVs; the trailing slash is required
# because the file names are appended directly to it.
DATA_PATH = "/kaggle/input/kenya-clinical-reasoning-challenge20250407/"
train_df = pd.read_csv(f"{DATA_PATH}train.csv")
test_df = pd.read_csv(f"{DATA_PATH}test.csv")
sample_submission_df = pd.read_csv(f"{DATA_PATH}SampleSubmission.csv")

# Quick sanity check on what was loaded.
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")


In [None]:
# ===============================================================
#  DYNAMIC FEW-SHOT SELECTOR (Our "Secret Sauce")
# ===============================================================
# We will use TF-IDF to find the most similar prompts in the training data
# to use as examples in our LLM prompts. This is much better than static examples.

# Fit the TF-IDF vocabulary on the training prompts once, up front, and keep
# the resulting matrix so each lookup only has to vectorize the query.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
train_prompt_vectors = vectorizer.fit_transform(train_df['Prompt'])

def get_most_similar_examples(query_prompt: str, top_n: int = 3) -> pd.DataFrame:
    """Return the training rows whose prompts are most similar to ``query_prompt``.

    Similarity is the cosine similarity between the TF-IDF vector of the query
    and the pre-computed ``train_prompt_vectors``.

    Args:
        query_prompt: Text to compare against the training prompts.
        top_n: Maximum number of rows to return. Zero or a negative value
            yields an empty frame.

    Returns:
        Up to ``top_n`` rows of ``train_df``, ordered most similar first.
    """
    query_vector = vectorizer.transform([query_prompt])
    similarities = cosine_similarity(query_vector, train_prompt_vectors)
    # Rank best-first and slice from the front. Slicing the ascending order
    # with ``[-top_n:]`` would return *every* row for top_n == 0, because
    # ``-0`` is simply ``0``.
    ranked_indices = np.argsort(similarities[0])[::-1]
    top_indices = ranked_indices[:max(top_n, 0)]
    return train_df.iloc[top_indices]


In [None]:
# ===============================================================
#  STEP 1: PREDICT SNOMED CODES
# ===============================================================

def create_snomed_prompt(prompt_text: str) -> str:
    """Assemble the few-shot prompt that asks the model for SNOMED CT codes.

    The two most similar training vignettes, each with its 'DDX SNOMED' value,
    are shown as worked examples, followed by ``prompt_text`` as the task.
    """
    neighbours = get_most_similar_examples(prompt_text, top_n=2)

    sections = [
        "You are a medical coding expert. Your task is to analyze a clinical vignette and extract the most relevant SNOMED CT diagnostic codes in the format 'code | description'.\n\n",
        "--- EXAMPLES ---\n",
    ]

    # One "Vignette / SNOMED Codes" pair per retrieved training row.
    for _, example in neighbours.iterrows():
        vignette = example['Prompt']
        sections.append(f"Vignette: \"{vignette}\"\n")
        codes = example['DDX SNOMED']
        sections.append(f"SNOMED Codes:\n{codes}\n\n")

    # The task repeats the example layout and stops where the model should continue.
    sections.append("--- TASK ---\n")
    sections.append(f"Vignette: \"{prompt_text}\"\n")
    sections.append("SNOMED Codes:\n")

    return "".join(sections)

def predict_snomed_codes(prompt_text: str) -> str:
    """Ask the Gemini model for the SNOMED codes of a clinical vignette.

    Args:
        prompt_text: The clinical vignette to code.

    Returns:
        The model's text with surrounding whitespace stripped, or the fixed
        string "Error: Could not predict SNOMED codes." when the request fails
        or the response has no readable text. Errors are printed, not raised.
    """
    full_prompt = create_snomed_prompt(prompt_text)

    # Bound before the try so the handler can tell "the request itself failed"
    # (still None) from "a response came back but its text was unreadable".
    response = None
    try:
        # Temperature 0 to keep the extracted codes as repeatable as possible.
        generation_config = genai.GenerationConfig(temperature=0.0, max_output_tokens=65536)

        response = gemini_model.generate_content(full_prompt, generation_config=generation_config, safety_settings=safety_settings_config)
        return response.text.strip()
    except Exception as e:
        print(f"API Error in SNOMED prediction: {e}")
        # If a response object exists, its prompt feedback may say why no text
        # was produced (e.g. a block reason). Still best-effort, but no longer
        # a bare `except` that would also swallow KeyboardInterrupt.
        if response is not None:
            try:
                print(response.prompt_feedback)
            except Exception as feedback_error:
                print(f"  (prompt feedback unavailable: {feedback_error})")
        return "Error: Could not predict SNOMED codes."


In [None]:
# ===============================================================
#  STEP 2: GENERATE CLINICIAN RESPONSE
# ===============================================================

def create_response_prompt(prompt_text: str, snomed_codes: str) -> str:
    """Assemble the few-shot prompt that asks the model for a clinician reply.

    The single most similar training vignette is shown with its 'DDX SNOMED'
    and 'Clinician' values as a worked example; the task section then presents
    ``prompt_text`` together with ``snomed_codes`` as its key diagnoses.
    """
    neighbours = get_most_similar_examples(prompt_text, top_n=1)

    sections = [
        "You are an expert Kenyan clinician providing practical guidance to a colleague. Your response MUST match the persona of the nurse in the prompt and the resources available at their facility. Structure your response with a summary, diagnosis, and a clear management plan. Emulate the style and reasoning from the examples below. Be concise and direct.\n\n",
        "--- EXAMPLES ---\n",
    ]

    # Each example shows the vignette, its diagnoses and the reference answer.
    for _, example in neighbours.iterrows():
        vignette = example['Prompt']
        sections.append(f"Vignette: \"{vignette}\"\n")
        diagnoses = example['DDX SNOMED']
        sections.append(f"Key Diagnoses: {diagnoses}\n")
        reference_answer = example['Clinician']
        sections.append(f"Clinician Response: {reference_answer}\n\n")

    # The task repeats the example layout and stops where the model should continue.
    sections.append("--- TASK ---\n")
    sections.append(f"Vignette: \"{prompt_text}\"\n")
    sections.append(f"Key Diagnoses: {snomed_codes}\n")
    sections.append("Clinician Response:\n")

    return "".join(sections)

def generate_clinician_response(prompt_text: str, snomed_codes: str) -> str:
    """Ask the Gemini model for a clinician-style reply to a vignette.

    Args:
        prompt_text: The clinical vignette to respond to.
        snomed_codes: Diagnoses text inserted into the prompt as "Key Diagnoses".

    Returns:
        The model's text with surrounding whitespace stripped, or the fixed
        string "Error: Could not generate response." when the request fails or
        the response has no readable text. Errors are printed, not raised.
    """
    full_prompt = create_response_prompt(prompt_text, snomed_codes)

    # Bound before the try so the handler can tell "the request itself failed"
    # (still None) from "a response came back but its text was unreadable".
    response = None
    try:
        # A moderate temperature for free-text output.
        generation_config = genai.GenerationConfig(temperature=0.4, max_output_tokens=65536)

        response = gemini_model.generate_content(full_prompt, generation_config=generation_config, safety_settings=safety_settings_config)
        return response.text.strip()
    except Exception as e:
        print(f"API Error in response generation: {e}")
        # If a response object exists, its prompt feedback may say why no text
        # was produced (e.g. a block reason). Still best-effort, but no longer
        # a bare `except` that would also swallow KeyboardInterrupt.
        if response is not None:
            try:
                print(response.prompt_feedback)
            except Exception as feedback_error:
                print(f"  (prompt feedback unavailable: {feedback_error})")
        return "Error: Could not generate response."


In [None]:
# ===============================================================
#  MAIN EXECUTION SCRIPT
# ===============================================================
if gemini_model:
    results = []
    print("Starting prediction process with Google Gemini...")

    # --- Rate Limiting Setup ---
    RPM = 10  # Requests Per Minute
    requests_this_minute = 0
    start_time = time.time()
    total_rows = len(test_df)

    # Count rows by position rather than by index label, so the progress line
    # stays correct whatever index test_df carries.
    for position, row in enumerate(test_df.itertuples(), start=1):
        print(f"Processing row {position}/{total_rows}: Master_Index = {row.Master_Index}")

        prompt_text = row.Prompt

        # Step 1: Predict SNOMED codes
        predicted_snomed = predict_snomed_codes(prompt_text)
        print(f"  > Predicted SNOMEDs: {repr(predicted_snomed[:70])}...")

        # Step 2: Generate the final clinician response
        final_response = generate_clinician_response(prompt_text, predicted_snomed)
        print(f"  > Generated Response: {repr(final_response[:70])}...")

        # NOTE(review): both steps return an "Error: ..." placeholder string on
        # failure instead of raising, so a failed row is stored (and later
        # submitted) with that placeholder as its answer.
        results.append({
            'Master_Index': row.Master_Index,
            'Clinician': final_response
        })
        print("-" * 20)

        # --- Rate Limiting Logic ---
        requests_this_minute += 2 # Each row uses two API calls
        if requests_this_minute >= RPM:
            elapsed_time = time.time() - start_time
            # Wait out the rest of the minute, unless this was the final row,
            # in which case no further request needs protecting.
            if elapsed_time < 60 and position < total_rows:
                sleep_time = 60 - elapsed_time
                print(f"  > Rate limit reached. Sleeping for {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)
            # Whether or not we slept, a new one-minute window starts now.
            start_time = time.time()
            requests_this_minute = 0

    print("Prediction process finished.")

    # Naming the columns keeps the frame well-formed even when no rows were
    # processed; without them an empty result list has no columns and the
    # selection below raises KeyError.
    submission_df = pd.DataFrame(results, columns=['Master_Index', 'Clinician'])
    # Match the column order of the sample submission.
    submission_df = submission_df[sample_submission_df.columns]
    submission_df.to_csv('submission.csv', index=False)

    print("Submission file 'submission.csv' has been created successfully!")
    print(submission_df.head())
else:
    print("Execution halted because Gemini model could not be initialized.")
