In [1]:
# Imports for data management
!pip install pandas Unidecode bs4
import pandas as pd
import json

# Import for crawling and scraping
import requests
import time
import random
from bs4 import BeautifulSoup
import re 
from urllib.parse import urlparse, unquote
from typing import List 
from pathlib import Path 




In [10]:
def get_biz_data(key_word, loc, page) -> pd.DataFrame:
    url = f"https://www.hotfrog.com/search/{loc}/{key_word}/{page}"
    print(f"Scraping URL: {url}")
    page = requests.get(url)
    soup = BeautifulSoup(page.content.decode('utf-8'), 'html.parser')
    business_containers = soup.find_all("div", class_="row")
    # Iterate over each container to extract business name and URL
    data = []
    for container in business_containers:
        business_name = ""
        phone_num = ""
        addr = ""
        business_url = ""

        try:
            business_name_element = container.find("h3", class_="h6 mb-0")
            business_name = business_name_element.text
        except Exception as e:
            print(f"Unable to parse Business Name. Error: {e}")
        
        try:
            phone_elem = container.find("div", class_="w-100 small text-nowrap")
            phone_num_elem = phone_elem.find("a")
            phone_num = phone_num_elem.text

        except Exception as e:
            print(f"Unable to parse Phone number. Error: {e}")

        try:    
            business_url_container_elem = container.find("h3", class_="h6 mb-0")
            business_url_elem = business_url_container_elem.find("a")
            business_url = business_url_elem.get("href")
            business_url = "https://www.hotfrog.com" + business_url
        except Exception as e:
            print(f"Unable to parse business URL. Error: {e}")
        data.append([business_name, business_url, phone_num])

    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'], data=data)

    return df


In [11]:
def get_multi_page_biz_data(key_word, loc, num_results) -> pd.DataFrame:
    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'])
    page_size = 20
    while len(df) < num_results:
        page_num = len(df) // page_size
        print(f"Scraping page {page_num}")
        page_data = get_biz_data(key_word, loc, page_num)
        df = pd.concat([df, page_data], ignore_index=True)
        

        sleep_time = random.randint(1, 3)
        time.sleep(sleep_time)
    print(f"Scraped {len(df)} results")
    return df 

In [12]:
key_word: str = "Security+Camera+Installation"
loc:str = "Washington"
num_results = 10

df = get_multi_page_biz_data(key_word, loc, num_results)
df.head()

Scraping page 0
Scraping URL: https://www.hotfrog.com/search/Washington/Security+Camera+Installation/0
Unable to parse Business Name. Error: 'NoneType' object has no attribute 'text'
Unable to parse Phone number. Error: 'NoneType' object has no attribute 'find'
Unable to parse business URL. Error: 'NoneType' object has no attribute 'find'
Unable to parse Business Name. Error: 'NoneType' object has no attribute 'text'
Unable to parse Phone number. Error: 'NoneType' object has no attribute 'find'
Unable to parse business URL. Error: 'NoneType' object has no attribute 'find'
Unable to parse Business Name. Error: 'NoneType' object has no attribute 'text'
Unable to parse Phone number. Error: 'NoneType' object has no attribute 'find'
Unable to parse business URL. Error: 'NoneType' object has no attribute 'find'
Scraped 16 results


Unnamed: 0,business_name,business_url,phone_number
0,,,
1,"Sentry Installation, Inc.",https://www.hotfrog.com/company/10995534760058...,301 563-9314
2,"Sentry Installation, Inc.",https://www.hotfrog.com/company/10995534760058...,301 563-9314
3,Global-Pro-Tech Security Systems,https://www.hotfrog.com/company/10994798459043...,301 502-5017
4,Fortress Security LLC,https://www.hotfrog.com/company/10995979568537...,502 797-6851


In [13]:
output_file_path:Path = Path('./data/dexknows.csv')
if output_file_path.parent.exists() == False:
    output_file_path.parent.mkdir(parents=True, exist_ok=True)    

df.to_csv(output_file_path.as_posix(), index=False)
print(f"[{len(df)}] Data saved to {output_file_path.as_posix()}")

[16] Data saved to data/dexknows.csv
