In [12]:
import sys
import os

# Check if running in Colab
try:
    from google.colab import auth
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

# Add the src directory to the path so we can import search_frontend
# Assuming the notebook is in 'notebooks/' and src is in '../src/'
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Also add the current directory just in case
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Install dependencies if running locally and they might be missing
if not IS_COLAB:
    print("Running locally. Ensuring dependencies are installed...")
    !pip install -q nltk flask google-cloud-storage pandas requests gcsfs pyarrow
else:
    print("Running in Colab.")
    !pip install -q gcsfs pyarrow

Running locally. Ensuring dependencies are installed...



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
# download nltk stopwords
import nltk
# Look the corpus up first and download only when the lookup fails, so
# re-running this cell does not trigger a new download each time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


In [14]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the  version on Colab and GCP is old. A dependency error below is okay.
if IS_COLAB:
    !pip install -q google-cloud-storage==1.43.0
else:
    print("Skipping GCS downgrade for local environment.")


Skipping GCS downgrade for local environment.


In [15]:
# authenticate below for Google Storage access as needed
# Only run this if in Colab
try:
    from google.colab import auth
except ImportError:
    print("Not running in Colab, skipping Google Auth (assuming local credentials are set up).")
else:
    # Kept outside the `try` so that an ImportError raised while authenticating
    # is not mistaken for "not running in Colab" and silently skipped.
    auth.authenticate_user()


Not running in Colab, skipping Google Auth (assuming local credentials are set up).


In [16]:
# Generate id_to_title.pkl from parquet files if missing
import pandas as pd
import pickle
import os
from google.cloud import storage

# Fetch id_to_title.pkl from the bucket, or build it from the parquet files and
# upload it, then copy it into the local data/ folder.
# Only run this if we are in Colab or have auth
if IS_COLAB:
    try:
        bucket_name = 'yali-ir2025-bucket' # Hardcoded from config
        client = storage.Client()
        bucket = client.bucket(bucket_name)

        # Check if id_to_title.pkl already exists in bucket
        blob = bucket.blob('id_to_title.pkl')
        if blob.exists():
            print("id_to_title.pkl already exists in bucket. Downloading...")
            blob.download_to_filename('id_to_title.pkl')
        else:
            print("Generating id_to_title.pkl from parquet files...")
            # List parquet files
            blobs = list(bucket.list_blobs(prefix='multistream'))
            parquet_files = [b.name for b in blobs if b.name.endswith('.parquet')]

            id_to_title = {}

            for pq_file in parquet_files:
                print(f"Processing {pq_file}...")
                # Read parquet file directly from GCS
                uri = f"gs://{bucket_name}/{pq_file}"
                try:
                    df = pd.read_parquet(uri, columns=['id', 'title'])
                    # Walk the two columns in lockstep instead of building a
                    # Series per row with iterrows(); keys are still str(id).
                    id_to_title.update(zip(map(str, df['id']), df['title']))
                except Exception as e:
                    # Best effort: a file that cannot be read is reported and skipped.
                    print(f"Error reading {pq_file}: {e}")

            print(f"Collected {len(id_to_title)} titles.")

            # Save to local
            with open('id_to_title.pkl', 'wb') as f:
                pickle.dump(id_to_title, f)

            # Upload to bucket
            print("Uploading id_to_title.pkl to bucket...")
            blob.upload_from_filename('id_to_title.pkl')
            print("Done.")

        # Copy the pickle into the data/ folder, creating the folder if needed
        os.makedirs('data', exist_ok=True)
        if os.path.exists('id_to_title.pkl'):
            import shutil
            shutil.copy('id_to_title.pkl', 'data/id_to_title.pkl')
            print("Copied id_to_title.pkl to data/ folder.")

    except Exception as e:
        # Any failure above is reported rather than raised, so the rest of the
        # notebook can still run.
        print(f"Error in id_to_title generation: {e}")
else:
    print("Not in Colab. If you have id_to_title.pkl in bucket, download it manually or run this in Colab.")

Not in Colab. If you have id_to_title.pkl in bucket, download it manually or run this in Colab.


# Run the app

In [17]:
# Import the frontend module
# Ensure you have 'src' in your python path (see cell above)
# Either branch binds the module to `se`; the server cell below runs `se.app`.
try:
    import search_frontend as se
except ImportError:
    # Fallback if running from a different directory context
    # (presumably one where `src` is importable as a package -- TODO confirm)
    from src import search_frontend as se


In [18]:
# uncomment the code below and execute to reload the module when you make
# changes to search_frontend.py (after you upload again).
# import importlib
# importlib.reload(se)

In [19]:
import threading
import time
import requests

PORT = 8080

def run_server():
    """Run the ``search_frontend`` app on ``PORT``; intended as a thread target.

    An exception raised while starting or running the app is printed rather
    than propagated.
    """
    try:
        # Note: use_reloader=False is important in notebooks/background threads
        se.app.run(
            host='0.0.0.0',
            port=PORT,
            debug=False,
            use_reloader=False,
        )
    except Exception as startup_error:
        print(f"Server failed to start: {startup_error}")

# Start the server in a background daemon thread so it does not block the
# notebook and does not keep the process alive on shutdown.
print(f"Starting server on port {PORT}...")
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Wait for server to start
time.sleep(3)

# Work out the URL the server is reachable at
if IS_COLAB:
    from google.colab.output import eval_js
    server_url = eval_js(f"google.colab.kernel.proxyPort({PORT})")
    print(f"Colab Server URL: {server_url}")
else:
    server_url = f"http://127.0.0.1:{PORT}"
    print(f"Local Server URL: {server_url}")

try:
    # Simple health check: any HTTP response (even a 404 for "/") counts as
    # reachable. The timeout keeps the cell from hanging if nothing answers.
    requests.get(server_url, timeout=5)
    print("Server is up and running!")
except requests.exceptions.RequestException:
    # Only network-level failures are treated as "not reachable"; a bare
    # `except:` would also swallow KeyboardInterrupt.
    print("Warning: Server might not be reachable yet or failed to start.")

print(f"Test URL: {server_url}/search_body?query=hello+world")


Starting server on port 8080...
 * Serving Flask app 'search_frontend'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8080
 * Running on http://192.168.7.14:8080
Press CTRL+C to quit
127.0.0.1 - - [02/Jan/2026 16:15:25] "GET / HTTP/1.1" 404 -


Local Server URL: http://127.0.0.1:8080
Server is up and running!
Test URL: http://127.0.0.1:8080/search_body?query=hello+world


# Testing your app

Once your app is running, you can query it. You can do that simply by clicking on the URL printed above (the one that looks like https://XXXXX-5000-colab.googleusercontent.com/search?query=hello+world) or by issuing an HTTP request through code (from Colab).

The code below shows how to issue a query from python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command line instructions for deploying your search engine to GCP are available at `run_frontend_in_gcp.sh`. Note that we will not only issue training queries to your search engine, but also test queries, i.e. queries that you've never seen before.

In [20]:
import json
import os

# Path to queries file
# Try 'data/queries_train.json' (relative to project root) or just 'queries_train.json'
queries_path = os.path.join(project_root, 'data', 'queries_train.json')
if not os.path.exists(queries_path):
    queries_path = 'queries_train.json' # Fallback

print(f"Loading queries from: {queries_path}")
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which on Windows is usually not UTF-8.
with open(queries_path, 'rt', encoding='utf-8') as f:
    queries = json.load(f)


Loading queries from: c:\Users\User\Desktop\סמסטר א\אחזור מידע\פרויקט\data\queries_train.json


In [21]:
def average_precision(true_list, predicted_list, k=40):
    """Average of precision@rank over the relevant hits in the top ``k`` predictions.

    Args:
        true_list: iterable of relevant doc ids.
        predicted_list: ranked, sliceable sequence of predicted doc ids.
        k: number of leading predictions to consider.

    Returns:
        The mean of the precision values taken at each rank where a relevant
        document appears, rounded to 3 decimals; 0.0 when there is no hit.
        The mean is over the hits found, not over all relevant documents.
    """
    relevant = frozenset(true_list)
    hit_precisions = []
    for rank, doc_id in enumerate(predicted_list[:k], start=1):
        if doc_id not in relevant:
            continue
        # Number of hits so far (including this one) divided by the 1-based rank.
        hit_precisions.append((len(hit_precisions) + 1) / rank)
    if not hit_precisions:
        return 0.0
    return round(sum(hit_precisions) / len(hit_precisions), 3)

In [22]:
def precision_at_k(true_list, predicted_list, k):
    """Fraction of the top ``k`` predictions that are relevant, rounded to 3 decimals.

    The denominator is the number of predictions actually present in the top
    ``k`` (which may be fewer than ``k``). Returns 0.0 when there are none.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    retrieved = len(top_k)
    if retrieved == 0:
        return 0.0
    hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(hits / retrieved, 3)
def recall_at_k(true_list, predicted_list, k):
    """Fraction of the relevant documents found in the top ``k`` predictions.

    Rounded to 3 decimals. Returns 1.0 when ``true_list`` is empty.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    if not relevant:
        return 1.0
    hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(hits / len(relevant), 3)
def f1_at_k(true_list, predicted_list, k):
    """Harmonic mean of ``precision_at_k`` and ``recall_at_k``, rounded to 3 decimals.

    Returns 0.0 when either precision or recall is 0.0.
    """
    precision = precision_at_k(true_list, predicted_list, k)
    recall = recall_at_k(true_list, predicted_list, k)
    # The harmonic mean is undefined if either term is zero; report 0.0 instead.
    if 0.0 in (precision, recall):
        return 0.0
    harmonic_mean = 2.0 / (1.0/precision + 1.0/recall)
    return round(harmonic_mean, 3)
def results_quality(true_list, predicted_list):
    """Harmonic mean of precision@5 and F1@30, rounded to 3 decimals.

    Returns 0.0 when either component is 0.0.
    """
    precision_5 = precision_at_k(true_list, predicted_list, 5)
    f1_score_30 = f1_at_k(true_list, predicted_list, 30)
    # The harmonic mean is undefined if either term is zero; report 0.0 instead.
    if 0.0 in (precision_5, f1_score_30):
        return 0.0
    harmonic_mean = 2.0 / (1.0/precision_5 + 1.0/f1_score_30)
    return round(harmonic_mean, 3)

# Sanity checks for the metric helpers; the cell raises AssertionError if any fails.

# precision / recall: only the first k predictions are considered
assert precision_at_k(range(10), [1,2,3] , 2) == 1.0
assert recall_at_k(   range(10), [10,5,3], 2) == 0.1
# empty inputs: no predictions -> 0.0; no relevant docs -> precision 0.0, recall 1.0
assert precision_at_k(range(10), []      , 2) == 0.0
assert precision_at_k([],        [1,2,3],  5) == 0.0
assert recall_at_k(   [],        [10,5,3], 2) == 1.0
assert recall_at_k(   range(10), [],       2) == 0.0
# F1 is 0.0 whenever precision or recall is 0.0
assert f1_at_k(       [],        [1,2,3],  5) == 0.0
assert f1_at_k(       range(10), [],       2) == 0.0
assert f1_at_k(       range(10), [0,1,2],  2) == 0.333
assert f1_at_k(       range(50), range(5), 30) == 0.182
assert f1_at_k(       range(50), range(10), 30) == 0.333
assert f1_at_k(       range(50), range(30), 30) == 0.75
# results_quality combines precision@5 with F1@30
assert results_quality(range(50), range(5))  == 0.308
assert results_quality(range(50), range(10)) == 0.5
assert results_quality(range(50), range(30)) == 0.857
# no relevant doc in the first 5 results -> precision@5 is 0.0 -> quality 0.0
assert results_quality(range(50), [-1]*5 + list(range(5,30))) == 0.0


In [23]:
import requests
from time import time

# Use the server_url defined in the previous cell
# If it's not defined, fallback to localhost
if 'server_url' not in locals():
    server_url = 'http://127.0.0.1:8080'

print(f"Testing against: {server_url}")

# One (query, duration, average precision) tuple per query; duration and/or AP
# stay None when the request errors out or returns a non-200 status.
qs_res = []
for q, true_wids in queries.items():
    duration, ap = None, None
    t_start = time()
    try:
        # Note: Using /search_body for now as that's what we implemented
        # The original code used /search. You can change this back later.
        res = requests.get(f"{server_url}/search_body", params={'query': q}, timeout=35)
        duration = time() - t_start
        if res.status_code != 200:
            print(f"Query: {q} | Failed with status {res.status_code}")
        else:
            # The response is a list of (doc_id, title)
            # We need just the doc_ids for evaluation
            results = res.json()
            pred_wids = [str(doc_id) for doc_id, title in results]

            # Calculate quality metrics
            rq = results_quality(true_wids, pred_wids)
            ap = average_precision(true_wids, pred_wids)

            print(f"Query: {q} | Duration: {duration:.3f}s | Quality: {rq} | AP: {ap}")
    except Exception as e:
        # A failing query is reported and the loop moves on to the next one.
        print(f"Query: {q} | Error: {e}")

    qs_res.append((q, duration, ap))


Testing against: http://127.0.0.1:8080


127.0.0.1 - - [02/Jan/2026 16:15:34] "GET /search_body?query=Mount+Everest+climbing+expeditions HTTP/1.1" 200 -


Query: Mount Everest climbing expeditions | Duration: 9.441s | Quality: 0.198 | AP: 0.569


127.0.0.1 - - [02/Jan/2026 16:15:51] "GET /search_body?query=Great+Fire+of+London+1666 HTTP/1.1" 200 -


Query: Great Fire of London 1666 | Duration: 16.781s | Quality: 0.0 | AP: 0.11


127.0.0.1 - - [02/Jan/2026 16:15:59] "GET /search_body?query=Nanotechnology+materials+science HTTP/1.1" 200 -


Query: Nanotechnology materials science | Duration: 7.659s | Quality: 0.571 | AP: 0.704


127.0.0.1 - - [02/Jan/2026 16:16:08] "GET /search_body?query=Fossil+fuels+climate+change HTTP/1.1" 200 -


Query: Fossil fuels climate change | Duration: 9.257s | Quality: 0.291 | AP: 0.403


127.0.0.1 - - [02/Jan/2026 16:16:18] "GET /search_body?query=DNA+double+helix+discovery HTTP/1.1" 200 -


Query: DNA double helix discovery | Duration: 9.741s | Quality: 0.311 | AP: 0.396


127.0.0.1 - - [02/Jan/2026 16:16:28] "GET /search_body?query=Printing+press+invention+Gutenberg HTTP/1.1" 200 -


Query: Printing press invention Gutenberg | Duration: 10.777s | Quality: 0.425 | AP: 0.577


127.0.0.1 - - [02/Jan/2026 16:16:38] "GET /search_body?query=Ancient+Egypt+pyramids+pharaohs HTTP/1.1" 200 -


Query: Ancient Egypt pyramids pharaohs | Duration: 9.482s | Quality: 0.377 | AP: 0.488


127.0.0.1 - - [02/Jan/2026 16:16:47] "GET /search_body?query=Gothic+literature+Mary+Shelley HTTP/1.1" 200 -


Query: Gothic literature Mary Shelley | Duration: 9.426s | Quality: 0.236 | AP: 0.41


127.0.0.1 - - [02/Jan/2026 16:16:55] "GET /search_body?query=Robotics+automation+industry HTTP/1.1" 200 -


Query: Robotics automation industry | Duration: 7.544s | Quality: 0.283 | AP: 0.375


127.0.0.1 - - [02/Jan/2026 16:17:07] "GET /search_body?query=Television+invention+broadcast+media HTTP/1.1" 200 -


Query: Television invention broadcast media | Duration: 12.123s | Quality: 0.0 | AP: 0.0


127.0.0.1 - - [02/Jan/2026 16:17:15] "GET /search_body?query=Wright+brothers+first+flight HTTP/1.1" 200 -


Query: Wright brothers first flight | Duration: 7.703s | Quality: 0.0 | AP: 0.079


127.0.0.1 - - [02/Jan/2026 16:17:21] "GET /search_body?query=Steam+locomotive+transportation+history HTTP/1.1" 200 -


Query: Steam locomotive transportation history | Duration: 6.126s | Quality: 0.175 | AP: 0.379


127.0.0.1 - - [02/Jan/2026 16:17:29] "GET /search_body?query=Currency+history+gold+standard HTTP/1.1" 200 -


Query: Currency history gold standard | Duration: 7.813s | Quality: 0.201 | AP: 0.246


127.0.0.1 - - [02/Jan/2026 16:17:38] "GET /search_body?query=Renaissance+art+Leonardo+da+Vinci HTTP/1.1" 200 -


Query: Renaissance art Leonardo da Vinci | Duration: 9.583s | Quality: 0.457 | AP: 0.585


127.0.0.1 - - [02/Jan/2026 16:17:48] "GET /search_body?query=Shakespeare+plays+Elizabethan+theatre HTTP/1.1" 200 -


Query: Shakespeare plays Elizabethan theatre | Duration: 10.296s | Quality: 0.188 | AP: 0.261


127.0.0.1 - - [02/Jan/2026 16:17:57] "GET /search_body?query=Solar+eclipse+astronomy+observation HTTP/1.1" 200 -


Query: Solar eclipse astronomy observation | Duration: 8.317s | Quality: 0.198 | AP: 0.459


127.0.0.1 - - [02/Jan/2026 16:18:06] "GET /search_body?query=Renaissance+architecture+Florence+Italy HTTP/1.1" 200 -


Query: Renaissance architecture Florence Italy | Duration: 8.786s | Quality: 0.25 | AP: 0.572


127.0.0.1 - - [02/Jan/2026 16:18:11] "GET /search_body?query=Impressionism+Monet+Renoir HTTP/1.1" 200 -


Query: Impressionism Monet Renoir | Duration: 5.906s | Quality: 0.364 | AP: 0.44


127.0.0.1 - - [02/Jan/2026 16:18:22] "GET /search_body?query=Samurai+code+Bushido+Japan HTTP/1.1" 200 -


Query: Samurai code Bushido Japan | Duration: 10.145s | Quality: 0.231 | AP: 0.466


127.0.0.1 - - [02/Jan/2026 16:18:33] "GET /search_body?query=Fossil+record+paleontology+evidence HTTP/1.1" 200 -


Query: Fossil record paleontology evidence | Duration: 11.747s | Quality: 0.165 | AP: 0.317


127.0.0.1 - - [02/Jan/2026 16:18:45] "GET /search_body?query=Silk+Road+trade+cultural+exchange HTTP/1.1" 200 -


Query: Silk Road trade cultural exchange | Duration: 12.114s | Quality: 0.165 | AP: 0.32


127.0.0.1 - - [02/Jan/2026 16:18:54] "GET /search_body?query=Industrial+Revolution+steam+engines HTTP/1.1" 200 -


Query: Industrial Revolution steam engines | Duration: 8.812s | Quality: 0.417 | AP: 0.566


127.0.0.1 - - [02/Jan/2026 16:19:07] "GET /search_body?query=Green+Revolution+agriculture+yield HTTP/1.1" 200 -


Query: Green Revolution agriculture yield | Duration: 12.460s | Quality: 0.0 | AP: 0.04


127.0.0.1 - - [02/Jan/2026 16:19:20] "GET /search_body?query=Quantum+computing+future+technology HTTP/1.1" 200 -


Query: Quantum computing future technology | Duration: 13.532s | Quality: 0.368 | AP: 0.701


127.0.0.1 - - [02/Jan/2026 16:19:40] "GET /search_body?query=Viking+exploration+North+America HTTP/1.1" 200 -


Query: Viking exploration North America | Duration: 19.591s | Quality: 0.229 | AP: 0.411


127.0.0.1 - - [02/Jan/2026 16:19:50] "GET /search_body?query=Roman+aqueducts+engineering+innovation HTTP/1.1" 200 -


Query: Roman aqueducts engineering innovation | Duration: 10.432s | Quality: 0.202 | AP: 0.385


127.0.0.1 - - [02/Jan/2026 16:19:56] "GET /search_body?query=Coffee+history+Ethiopia+trade HTTP/1.1" 200 -


Query: Coffee history Ethiopia trade | Duration: 5.964s | Quality: 0.447 | AP: 0.573


127.0.0.1 - - [02/Jan/2026 16:20:02] "GET /search_body?query=Stonehenge+prehistoric+monument HTTP/1.1" 200 -


Query: Stonehenge prehistoric monument | Duration: 5.286s | Quality: 0.397 | AP: 0.565


127.0.0.1 - - [02/Jan/2026 16:20:07] "GET /search_body?query=Photography+invention+Daguerre HTTP/1.1" 200 -


Query: Photography invention Daguerre | Duration: 5.856s | Quality: 0.381 | AP: 0.615


127.0.0.1 - - [02/Jan/2026 16:20:24] "GET /search_body?query=Ballet+origins+France+Russia HTTP/1.1" 200 -


Query: Ballet origins France Russia | Duration: 16.154s | Quality: 0.193 | AP: 0.255
