In [22]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv

load_dotenv()

import logging
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.getLogger('web_data_collection').setLevel(logging.DEBUG)

from web_data_collection import LLMConfig, generate_search_queries, retrieve_webpages, retrieve_webpages_streaming, BrightDataConfig, generate_extraction_schema, extract_data, control_quality, JinaConfig, rerank_results_jina_api, get_url_date

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
openai_api_key = os.environ.get("OPENAI_API_KEY")
llm_config = LLMConfig(api_key=openai_api_key, model="openai/gpt-4.1", temperature=0.2)

bright_data_api_key = os.environ.get("BRIGHT_DATA_API_KEY")
bright_data_zone = os.environ.get("BRIGHT_DATA_ZONE")
bright_data_config = BrightDataConfig(api_key=bright_data_api_key, zone=bright_data_zone)

jina_api_key = os.environ.get("JINA_API_KEY")
jina_config = JinaConfig(api_key=jina_api_key)

In [7]:
dataset_description = "Reports of download counts of COVID contact tracing apps in different U.S. states."
variables = {"state_and_app_name": ["Alabama GuideSafe"]}
# variables = {"state_and_app_name": ["Alabama GuideSafe", "Arizona Covid Watch", "California Covid Notify", "Colorado Exposure Notifications", "Connecticut Covid Alert"]}
variable_names = list(variables.keys())

# Search Query Generation

In [8]:
num_queries = 1

query_templates = generate_search_queries(dataset_description, num_queries, llm_config, variable_names)
for query_template in query_templates:
    print(query_template)

{state_and_app_name} COVID contact tracing app download count


# Web Page Retrieval

In [9]:
results_pages_per_query = 1
start_date = "2020-01-01"
end_date = "2022-12-31"

webpages = retrieve_webpages(query_templates, results_pages_per_query, bright_data_config, variables)
# webpages = retrieve_webpages(query_templates, results_pages_per_query, bright_data_config, variables, start_date, end_date, time_chunking=True)
# webpages = retrieve_webpages(query_templates, results_pages_per_query, bright_data_config, variables, news_only=True)
# webpages = retrieve_webpages(query_templates, results_pages_per_query, bright_data_config, variables, media_cloud_country="United States", geolocation_country="United States")

# webpage_generator = retrieve_webpages_streaming(query_templates, results_pages_per_query, bright_data_config, variables)
# for webpage in webpage_generator:
#     print(webpage)

2025-08-05 10:39:55,551 - web_data_collection.webpage_retrieval - DEBUG - Expanded 1 templates into 1 queries
2025-08-05 10:39:55,552 - web_data_collection.webpage_retrieval - DEBUG - Processing query: Alabama GuideSafe COVID contact tracing app download count, variable value combo: ('Alabama GuideSafe',)
2025-08-05 10:39:55,553 - web_data_collection.webpage_retrieval - DEBUG - Processing date chunk: None to None
2025-08-05 10:39:55,553 - web_data_collection.webpage_retrieval - DEBUG - Making request 1/1
2025-08-05 10:39:57,327 - web_data_collection.webpage_retrieval - DEBUG - {'general': {'search_engine': 'google', 'query': 'Alabama GuideSafe COVID contact tracing app download count', 'results_cnt': 671, 'search_time': 0.38, 'language': 'en', 'location': 'United States', 'mobile': False, 'basic_view': False, 'search_type': 'text', 'page_title': 'Alabama GuideSafe COVID contact tracing app download count - Google Search', 'timestamp': '2025-08-05T14:39:57.045Z'}, 'input': {'original_ur

In [10]:
for w in webpages:
    print(w)
    for q in webpages[w]:
        print(q, len(webpages[w][q]))

('Alabama GuideSafe',)
Alabama GuideSafe COVID contact tracing app download count 20


In [11]:
all_webpages = []
for w in webpages:
    for q in webpages[w]:
        for page in webpages[w][q]:
            page["query"] = q
            all_webpages.append(page)

all_webpages

[{'link': 'https://www.guidesafe.org/exposure-notification-app/',
  'title': "Alabama's Exposure Notification App",
  'description': "Here's How Alabama's GuideSafe™ Exposure Notification App Works: ... Step one: Download the GuideSafe™ Exposure Notification App from the App Store or Google Play ...",
  'query': 'Alabama GuideSafe COVID contact tracing app download count'},
 {'link': 'https://sites.uab.edu/guidesafebackup2/exposure-notification-app/',
  'title': 'Exposure Notification App | GuideSafe™',
  'description': 'Step one: Download the GuideSafe™️ Exposure Notification App from the App Store or Google Play, and then enabling Bluetooth. Step two: If you have tested ...',
  'query': 'Alabama GuideSafe COVID contact tracing app download count'},
 {'link': 'https://www.cbs42.com/news/alabamas-contact-tracing-app-expands-capabilities-still-very-few-have-downloaded-it/',
  'title': "Alabama's contact tracing app expands capabilities, still ...",
  'description': 'Just under 150000 pe

In [12]:
len(all_webpages)

20

In [19]:
reranking_scores = rerank_results_jina_api([p["query"] for p in all_webpages], [p["title"] for p in all_webpages], jina_config)
reranking_scores

{'model': 'jina-reranker-v2-base-multilingual', 'usage': {'total_tokens': 304}, 'results': [{'index': 13, 'document': {'text': 'Alabama launches statewide coronavirus tracking app'}, 'relevance_score': 0.5945512652397156}, {'index': 12, 'document': {'text': 'Alabamians urged to download COVID-19 exposure app'}, 'relevance_score': 0.5926666259765625}, {'index': 17, 'document': {'text': "Alabama's GuideSafe app notifies you of COVID-19 exposure"}, 'relevance_score': 0.528533399105072}, {'index': 3, 'document': {'text': "Alabama's COVID tracing app joins nationwide network"}, 'relevance_score': 0.4568943679332733}, {'index': 8, 'document': {'text': 'Alabama rolls out contact tracing app piloted at colleges'}, 'relevance_score': 0.4561674892902374}, {'index': 5, 'document': {'text': 'Alabama launches contact tracing app based on Google ...'}, 'relevance_score': 0.42202815413475037}, {'index': 2, 'document': {'text': "Alabama's contact tracing app expands capabilities, still ..."}, 'relevan

[{'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': 'Alabama launches statewide coronavirus tracking app',
  'score': 0.5945512652397156},
 {'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': 'Alabamians urged to download COVID-19 exposure app',
  'score': 0.5926666259765625},
 {'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': "Alabama's GuideSafe app notifies you of COVID-19 exposure",
  'score': 0.528533399105072},
 {'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': "Alabama's COVID tracing app joins nationwide network",
  'score': 0.4568943679332733},
 {'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': 'Alabama rolls out contact tracing app piloted at colleges',
  'score': 0.4561674892902374},
 {'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'text': 'Alabama launches contact tracing app based on Google ..

In [20]:
title2score = {w["text"]: w["score"] for w in reranking_scores}

In [21]:
for w in all_webpages:
    w["score"] = title2score[w["title"]]

all_webpages

[{'link': 'https://www.guidesafe.org/exposure-notification-app/',
  'title': "Alabama's Exposure Notification App",
  'description': "Here's How Alabama's GuideSafe™ Exposure Notification App Works: ... Step one: Download the GuideSafe™ Exposure Notification App from the App Store or Google Play ...",
  'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'score': 0.06097517907619476},
 {'link': 'https://sites.uab.edu/guidesafebackup2/exposure-notification-app/',
  'title': 'Exposure Notification App | GuideSafe™',
  'description': 'Step one: Download the GuideSafe™️ Exposure Notification App from the App Store or Google Play, and then enabling Bluetooth. Step two: If you have tested ...',
  'query': 'Alabama GuideSafe COVID contact tracing app download count',
  'score': 0.1127954050898552},
 {'link': 'https://www.cbs42.com/news/alabamas-contact-tracing-app-expands-capabilities-still-very-few-have-downloaded-it/',
  'title': "Alabama's contact tracing app expands c

In [13]:
webpages_above_threshold = [p for p in all_webpages if p["score"] > 0.4]
webpages_above_threshold = sorted(webpages_above_threshold, key=lambda x: x["score"], reverse=True)
print(len(webpages_above_threshold))

webpages_above_threshold

15


[{'link': 'https://news.azpm.org/p/uanews/2020/9/28/180903-ua-covid-watch-app-is-now-on-30000-devices/',
  'title': 'UA COVID Watch app is now on 30000 devices',
  'description': 'About 30 thousand people have installed the COVID Watch Arizona app on their mobile devices. One of its co-creators says the app has alerted ...',
  'query': 'Arizona Covid Watch COVID app number of downloads',
  'score': 0.7895801067352295},
 {'link': 'https://www.rocketcitynow.com/video/news/local/alabama-guidesafe-covid-19-app/525-973f3a54-d4f6-4c9e-aa29-f6b7000ae470',
  'title': "Alabama's GuideSafe app notifies you of COVID-19 exposure",
  'description': 'The GuideSafe app was released in August. Over 93000 people have already downloaded it.',
  'query': 'Alabama GuideSafe COVID app number of downloads',
  'score': 0.7020716667175293},
 {'link': 'https://www.youtube.com/watch?v=-QBcypdNhEQ',
  'title': "Alabama's GuideSafe app notifies you of COVID-19 exposure",
  'description': "The GuideSafe app was re

In [None]:
get_url_date("https://www.cbs42.com/news/alabamas-contact-tracing-app-expands-capabilities-still-very-few-have-downloaded-it/")

2025-08-05 20:35:47,731 - htmldate.utils - ERROR - not a 200 response: 403 for URL https://www.cbs42.com/news/alabamas-contact-tracing-app-expands-capabilities-still-very-few-have-downloaded-it/


ValueError: ("URL couldn't be processed: %s", None)

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 211, in start
    self.asyncio_loop.run_forever()
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/python3.12/asyncio/base_events.py", line 645, in run_forever
    self._run_once()
  File "/home/thomas/miniconda3/envs/web-data-collection/lib/pyt

: 

# Data Extraction

In [14]:
schema_fields = ["state", "date"]
schema = generate_extraction_schema(dataset_description, schema_fields, llm_config)

print(schema)

class AppDownloadReport(BaseModel):
    state: str = Field(..., description="Name of the U.S. state for which the app download count is reported")
    date: str = Field(..., description="Date for which the app download count is reported in YYYY-MM-DD format")
    download_count: int = Field(..., description="Number of downloads of COVID contact tracing apps")


In [15]:
extracted_data = await extract_data([w["link"] for w in webpages_above_threshold], schema, llm_config)



In [16]:
extracted_data

[{'state': 'Arizona',
  'date': '2020-09-28',
  'download_count': 30000,
  'grounding_quote': 'About 30 thousand people have installed the COVID Watch Arizona app on their mobile devices.',
  'is_grounded': True,
  'source': 'https://news.azpm.org/p/uanews/2020/9/28/180903-ua-covid-watch-app-is-now-on-30000-devices/#:~:text=About%2030%20thousand%20people%20have%20installed%20the%20COVID%20Watch%20Arizona%20app%20on%20their%20mobile%20devices.'},
 {'state': 'Alabama',
  'date': '2020-11-09',
  'download_count': 93000,
  'grounding_quote': 'Over 93,000 people have already downloaded it.',
  'is_grounded': True,
  'source': 'https://www.rocketcitynow.com/video/news/local/alabama-guidesafe-covid-19-app/525-973f3a54-d4f6-4c9e-aa29-f6b7000ae470#:~:text=Over%2093%2C000%20people%20have%20already%20downloaded%20it.'},
 {'state': 'California',
  'date': '2020-12-11',
  'download_count': 4000000,
  'grounding_quote': "California's Department of Public Health (CDPH), which operates CA Notify, says

# Quality Control

In [18]:
issues = control_quality(extracted_data, dataset_description, llm_config)

In [19]:
issues

{'0': 'NA',
 '1': 'NA',
 '2': 'The quote refers to "activations" rather than explicit "downloads," which may not be equivalent; also, the estimate is not an exact count and is described as "may have activated," introducing uncertainty.',
 '3': 'The quote refers to people who have "joined the system," which may not be strictly equivalent to "downloads"; also, the percentage (53%) seems high and may warrant verification against DC\'s population at the time.',
 '4': 'The quote says "more than 300,000" while the download_count is exactly 300,000, so the number may be a lower bound rather than an exact count.'}