# BigQuery Patent Exploration

Use this notebook to prototype filters before wiring them into the automated pipeline.

In [1]:
import os
import sys

# Work out which directory to treat as the notebook's location. A notebook
# kernel normally has no __file__, in which case the current working
# directory is used instead.
try:
    __file__
except NameError:
    NOTEBOOK_DIR = os.getcwd()
else:
    NOTEBOOK_DIR = os.path.dirname(__file__)

# The project root is taken to be the parent of the notebook directory; put it
# at the front of sys.path (only once) so that `src` can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)


In [2]:
from google.cloud import bigquery

from src import config, query_builder

## Query Updates ✅

Recent pipeline revisions align the notebook with production defaults:

- Localized text fields now prefer English entries and gracefully fall back when none is available.
- Assignee aggregation uses `assignee_harmonized.name`, matching the latest BigQuery schema.
- Defaults narrow the search to the trailing three publication years and cap results at 100 rows.

Run the next cell to preview a sample result set with these defaults (the preview itself requests only 25 rows).


In [None]:
# Preview a small sample of patents: the year range and description word limit
# come from the configuration defaults, while the row limit is fixed at 25.
from dotenv import find_dotenv, load_dotenv

# find_dotenv returns an empty string when no .env file is found; passing None
# instead lets load_dotenv fall back to its own lookup.
dotenv_path = find_dotenv(usecwd=True)
load_dotenv(dotenv_path or None)

# The environment variable wins; the configured default project is the fallback.
project_id = os.getenv(config.ENV_GCP_PROJECT_ID, config.DEFAULT_GCP_PROJECT_ID)
client = bigquery.Client(project=project_id)

# The same word limit is handed to both the SQL builder and the job config.
word_limit = config.DEFAULT_DESCRIPTION_WORD_LIMIT
sql = query_builder.build_query(limit=25, description_word_limit=word_limit)
job_config = query_builder.assemble_query_config(
    start_year=config.DEFAULT_START_YEAR,
    end_year=config.DEFAULT_END_YEAR,
    description_word_limit=word_limit,
)

# Run the query, wait for it to finish, and materialise the rows as a DataFrame.
query_job = client.query(sql, job_config=job_config)
preview = query_job.result().to_dataframe()
preview.head()


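Adjust `start_year`, `end_year`, CPC filters, or keyword lists in `src/config.py` to explore different slices of the dataset. Because `src.config` is imported only once per kernel session, restart the kernel and re-run the cells from the top afterwards so the preview cell picks up the edits and you can inspect the impact.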


## Run the pipeline programmatically

Populate `.env` (or export credentials in this notebook) before running the next cell. Adjust parameters as needed.

In [None]:
import os
from types import SimpleNamespace

from dotenv import find_dotenv, load_dotenv  # type: ignore
from src.pipeline import run_pipeline

# Load variables from .env into the process environment. An empty result from
# find_dotenv is turned into None so load_dotenv performs its own lookup.
env_file = find_dotenv(usecwd=True)
load_dotenv(env_file or None)

# Options handed to run_pipeline as attributes of a SimpleNamespace.
# NOTE(review): the output paths are relative; presumably they resolve against
# the kernel's working directory. Confirm against src.pipeline.
pipeline_options = {
    "project_id": None,  # Allow the pipeline to resolve CLI/env/config defaults
    "start_year": config.DEFAULT_START_YEAR,
    "end_year": config.DEFAULT_END_YEAR,
    "limit": config.DEFAULT_LIMIT,
    "output_raw": "data/patents_raw.csv",
    "output_classified": "data/patents_classified.csv",
    "openrouter_model": config.DEFAULT_OPENROUTER_MODEL,
    "openrouter_timeout": config.DEFAULT_OPENROUTER_TIMEOUT,
    "openrouter_delay": config.DEFAULT_OPENROUTER_DELAY,
    "skip_llm": True,  # Set to False when you have OPENROUTER_API_KEY configured
    "era_column": False,
    "log_level": "INFO",
    "description_word_limit": config.DEFAULT_DESCRIPTION_WORD_LIMIT,
    "max_retries": config.DEFAULT_MAX_RETRIES,
}
args = SimpleNamespace(**pipeline_options)

# Run the pipeline and report the exit code it returns.
exit_code = run_pipeline(args)
print(f"Pipeline finished with exit code {exit_code}")
