In [None]:
import sys
import os

# Check if running in Colab
try:
    from google.colab import auth
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

# Add the src directory to the path so we can import search_frontend
# Assuming the notebook is in 'notebooks/' and src is in '../src/'
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Also add the current directory just in case
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Set Google Application Credentials for local execution
if not IS_COLAB:
    key_path = os.path.join(project_root, 'data', 'extreme-wind-480314-f5-e88363037125.json')
    if os.path.exists(key_path):
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path
        print(f"Set GOOGLE_APPLICATION_CREDENTIALS to {key_path}")
    else:
        print("Warning: Service account key not found in data/ folder.")

# Install dependencies if running locally and they might be missing
if not IS_COLAB:
    print("Running locally. Ensuring dependencies are installed...")
    # !pip install -q nltk flask google-cloud-storage pandas requests gcsfs pyarrow
else:
    print("Running in Colab.")
    !pip install -q gcsfs pyarrow

In [None]:
# Make sure the NLTK stopwords corpus is available locally.
import nltk
try:
    # Look the corpus up first; a LookupError here means it is not installed yet.
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Download only on a miss, so re-running this cell does not fetch it again.
    nltk.download('stopwords')


In [None]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the version on Colab and GCP is old. A dependency error below is okay.
# The pin is applied only when IS_COLAB is true; local environments keep
# whichever version is already installed.
# NOTE(review): the comment above calls the preinstalled version "old" while the
# message below calls this step a "downgrade" -- confirm which one is accurate.
if IS_COLAB:
    !pip install -q google-cloud-storage==1.43.0
else:
    print("Skipping GCS downgrade for local environment.")


In [None]:
# Authenticate for Google Storage access as needed.
# Only has an effect in Colab: the import is attempted again here instead of
# reading IS_COLAB, so this cell does not depend on the setup cell having run.
try:
    from google.colab import auth
    auth.authenticate_user()
except ImportError:
    # Outside Colab the google.colab package is unavailable; nothing to do here.
    print("Not running in Colab, skipping Google Auth (assuming local credentials are set up).")


In [None]:
# Download auxiliary files from the bucket if they exist.
# Each file is saved in the current directory and mirrored into data/.
import os
import pickle
import shutil

from google.cloud import storage

FILES_TO_DOWNLOAD = [
    'id_to_title.pkl',
    'page_rank.pkl',
    'page_views.pkl',
    'doc_lengths.pkl',
    'doc_norm_text.pkl',
    'text_idf.pkl'
]

if IS_COLAB or os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'):
    # Best effort: any failure (client creation, listing, download) is reported
    # and the notebook carries on with whatever files are already present.
    try:
        bucket_name = 'yali-ir2025-bucket'
        client = storage.Client()
        bucket = client.bucket(bucket_name)

        # exist_ok avoids the separate existence check before creating data/
        os.makedirs('data', exist_ok=True)

        for filename in FILES_TO_DOWNLOAD:
            blob = bucket.blob(filename)
            if blob.exists():
                print(f"Downloading {filename}...")
                blob.download_to_filename(filename)
                # Also copy to data/ just in case
                shutil.copy(filename, f'data/{filename}')
            else:
                print(f"{filename} not found in bucket. Skipping.")

    except Exception as e:
        print(f"Error downloading files: {e}")
else:
    print("No credentials. Skipping file downloads.")

# Ensure we have at least dummy data for doc_lengths to prevent ZeroDivisionError.
# The placeholder is written only when neither copy of the file exists.
if not os.path.exists('doc_lengths.pkl') and not os.path.exists('data/doc_lengths.pkl'):
    print("Creating dummy doc_lengths.pkl to prevent ZeroDivisionError...")
    dummy_dl = {1: 1} # At least one document
    with open('doc_lengths.pkl', 'wb') as f:
        pickle.dump(dummy_dl, f)
    os.makedirs('data', exist_ok=True)
    shutil.copy('doc_lengths.pkl', 'data/doc_lengths.pkl')

# Run the app

In [None]:
# Import the frontend module under the alias `se`, which the server cell uses.
# Ensure you have 'src' in your python path (see cell above)
# NOTE(review): ImportError is also raised when search_frontend is found but one
# of its own imports fails; the fallback below then runs as well and its error
# may obscure the original cause -- check the full traceback in that case.
try:
    import search_frontend as se
except ImportError:
    # Fallback if running from a different directory context
    from src import search_frontend as se


In [None]:
# uncomment the code below and execute to reload the module when you make
# changes to search_frontend.py (after you upload again).
# import importlib
# importlib.reload(se)

In [None]:
import threading
import time
import requests

PORT = 8080

def run_server():
    """Run `se.app` on all interfaces at PORT; meant to be used as a thread target.

    Any exception raised while starting or running the server is reported with
    print() instead of being propagated to the caller.
    """
    try:
        # use_reloader=False is important in notebooks/background threads
        run_options = {
            'host': '0.0.0.0',
            'port': PORT,
            'debug': False,
            'use_reloader': False,
        }
        se.app.run(**run_options)
    except Exception as err:
        print(f"Server failed to start: {err}")

# Start the server in a background daemon thread so it does not block the
# notebook and does not keep the process alive on shutdown.
print(f"Starting server on port {PORT}...")
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Give the server a moment to bind the port before probing it
time.sleep(3)

# Work out the URL under which the server is reachable from this environment
if IS_COLAB:
    from google.colab.output import eval_js
    server_url = eval_js(f"google.colab.kernel.proxyPort({PORT})")
    print(f"Colab Server URL: {server_url}")
else:
    server_url = f"http://127.0.0.1:{PORT}"
    print(f"Local Server URL: {server_url}")

try:
    # Simple health check: any HTTP response counts as "up" (the status code is
    # not inspected). The timeout keeps this cell from hanging indefinitely.
    requests.get(server_url, timeout=5)
    print("Server is up and running!")
except Exception as err:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    print("Warning: Server might not be reachable yet or failed to start.")
    print(f"Health check error: {err}")

print(f"Test URL: {server_url}/search_body?query=hello+world")


# Testing your app

Once your app is running you can query it. You can do that either by clicking on the URL printed above (the one that looks like https://XXXXX-8080-colab.googleusercontent.com/search_body?query=hello+world) or by issuing an HTTP request through code (from Colab).

The code below shows how to issue a query from python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command line instructions for deploying your search engine to GCP are available at `run_frontend_in_gcp.sh`. Note that we will not only issue training queries to your search engine, but also test queries, i.e. queries that you've never seen before.

In [None]:
import json
import os

# Path to queries file
# Prefer <project_root>/data/queries_train.json; otherwise fall back to a file
# named 'queries_train.json' in the current working directory.
queries_path = os.path.join(project_root, 'data', 'queries_train.json')
if not os.path.exists(queries_path):
    queries_path = 'queries_train.json' # Fallback

print(f"Loading queries from: {queries_path}")
# Read as UTF-8 explicitly: without it open() uses the platform's default
# encoding, which can fail on or garble non-ASCII query text.
with open(queries_path, 'rt', encoding='utf-8') as f:
    queries = json.load(f)


In [None]:
def average_precision(true_list, predicted_list, k=40):
    """Average precision over the first `k` predictions.

    Precision is sampled at every rank where a relevant document appears and
    the samples are averaged over the number of hits (not over the number of
    relevant documents).

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.
        k: number of top predictions to consider.

    Returns:
        The average precision rounded to 3 decimals, or 0.0 when none of the
        first `k` predictions is relevant.
    """
    relevant = frozenset(true_list)
    hits = 0
    precision_total = 0.0
    for rank, doc_id in enumerate(predicted_list[:k], start=1):
        if doc_id in relevant:
            hits += 1
            precision_total += hits / rank
    if hits == 0:
        return 0.0
    return round(precision_total / hits, 3)

In [None]:
def precision_at_k(true_list, predicted_list, k):
    """Fraction of the first `k` predictions that are relevant.

    The denominator is the number of predictions actually available in the
    top `k` (which may be fewer than `k`). Returns 0.0 when there are no
    predictions; the result is rounded to 3 decimals.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    retrieved = len(top_k)
    if retrieved == 0:
        return 0.0
    hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(hits / retrieved, 3)
def recall_at_k(true_list, predicted_list, k):
    """Fraction of the relevant documents found in the first `k` predictions.

    Returns 1.0 when there are no relevant documents at all; otherwise the
    number of relevant hits in the top `k` divided by the number of distinct
    relevant ids, rounded to 3 decimals.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    total_relevant = len(relevant)
    if total_relevant < 1:
        return 1.0
    found = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(found / total_relevant, 3)
def f1_at_k(true_list, predicted_list, k):
    """Harmonic mean of precision@k and recall@k, rounded to 3 decimals.

    Returns 0.0 when either precision or recall is 0.0.
    """
    precision = precision_at_k(true_list, predicted_list, k)
    recall = recall_at_k(true_list, predicted_list, k)
    if 0.0 in (precision, recall):
        return 0.0
    harmonic_mean = 2.0 / (1.0 / precision + 1.0 / recall)
    return round(harmonic_mean, 3)
def results_quality(true_list, predicted_list):
    """Harmonic mean of precision@5 and F1@30, rounded to 3 decimals.

    Returns 0.0 when either of the two component scores is 0.0.
    """
    precision_5 = precision_at_k(true_list, predicted_list, 5)
    f1_score_30 = f1_at_k(true_list, predicted_list, 30)
    if 0.0 in (precision_5, f1_score_30):
        return 0.0
    combined = 2.0 / (1.0 / precision_5 + 1.0 / f1_score_30)
    return round(combined, 3)

# Sanity checks for the metric helpers; a failing assert stops the cell.
# Precision / recall, including empty ground-truth and empty-prediction cases:
assert precision_at_k(range(10), [1,2,3] , 2) == 1.0
assert recall_at_k(   range(10), [10,5,3], 2) == 0.1
assert precision_at_k(range(10), []      , 2) == 0.0
assert precision_at_k([],        [1,2,3],  5) == 0.0
assert recall_at_k(   [],        [10,5,3], 2) == 1.0
assert recall_at_k(   range(10), [],       2) == 0.0
# F1@k is 0.0 whenever precision or recall is 0.0:
assert f1_at_k(       [],        [1,2,3],  5) == 0.0
assert f1_at_k(       range(10), [],       2) == 0.0
assert f1_at_k(       range(10), [0,1,2],  2) == 0.333
assert f1_at_k(       range(50), range(5), 30) == 0.182
assert f1_at_k(       range(50), range(10), 30) == 0.333
assert f1_at_k(       range(50), range(30), 30) == 0.75
# results_quality combines precision@5 with F1@30:
assert results_quality(range(50), range(5))  == 0.308
assert results_quality(range(50), range(10)) == 0.5
assert results_quality(range(50), range(30)) == 0.857
# No relevant document in the top 5 forces the combined score to 0.0:
assert results_quality(range(50), [-1]*5 + list(range(5,30))) == 0.0


In [None]:
# perf_counter is imported by name so that `time` keeps referring to the module
# imported by the server cell (a `from time import time` here would rebind it
# to a function).
from time import perf_counter

import requests

# Use the server_url defined in the previous cell
# If it's not defined, fallback to localhost
if 'server_url' not in locals():
    server_url = 'http://127.0.0.1:8080'

print(f"Testing against: {server_url}")

# One (query, duration_seconds, average_precision) tuple per query; duration
# and/or AP stay None when the request raised or returned a non-200 status.
qs_res = []
for q, true_wids in queries.items():
    duration, ap = None, None
    # Predicted ids are stringified below, so normalise the ground truth the
    # same way; otherwise integer ids in the queries file would never match.
    true_ids = [str(wid) for wid in true_wids]
    t_start = perf_counter()
    try:
        # Note: Using /search_body for now as that's what we implemented
        # The original code used /search. You can change this back later.
        res = requests.get(server_url + '/search_body', {'query': q}, timeout=35)
        duration = perf_counter() - t_start
        if res.status_code == 200:
            # The response is a list of (doc_id, title)
            # We need just the doc_ids for evaluation
            results = res.json()
            pred_wids = [str(doc_id) for doc_id, title in results]

            # Calculate quality metrics
            rq = results_quality(true_ids, pred_wids)
            ap = average_precision(true_ids, pred_wids)

            print(f"Query: {q} | Duration: {duration:.3f}s | Quality: {rq} | AP: {ap}")
        else:
            print(f"Query: {q} | Failed with status {res.status_code}")
    except Exception as e:
        # Best effort: report the failure and continue with the next query
        print(f"Query: {q} | Error: {e}")

    qs_res.append((q, duration, ap))
