In [5]:
# Imports for data management
!pip install pandas Unidecode bs4
import pandas as pd
import json

# Import for crawling and scraping
import requests
import time
import random
from bs4 import BeautifulSoup
import re 
from urllib.parse import urlparse, unquote
from typing import List 
from pathlib import Path 




In [9]:
def get_biz_data(key_word, loc, page) -> pd.DataFrame:
    url = f"https://www.superpages.com/search?search_terms={key_word}&geo_location_terms={loc}&page={page}"
    print(f"Scraping URL: {url}")
    page = requests.get(url)
    # table = html.fromstring(unidecode.unidecode(page.content.decode('utf-8')))
    soup = BeautifulSoup(page.content.decode('utf-8'), 'html.parser')
    business_containers = soup.find_all("div", class_="result")
    # Iterate over each container to extract business name and URL
    data = []
    for container in business_containers:
        business_name = ""
        phone_num = ""
        addr = ""
        business_url = ""

        try:
            business_name_element = container.find("a", class_="business-name")
            business_name = business_name_element.text
        except Exception as e:
            print(f"Unable to parse Business Name. Error: {e}")
        
        try:
            phone_num_elem = container.find("a", class_="phones phone primary")
            phone_num = phone_num_elem.text
        except Exception as e:
            print(f"Unable to parse Phone number. Error: {e}")
        
        # try:
        #     addr_elem = container.find("div", class_="street-address")
        #     addr = addr_elem.text
        # except Exception as e:
        #     print(f"Unable to parse address. Error: {e}")

        try:    
            business_url = "https://www.superpages.com" + business_name_element["href"]
        except Exception as e:
            print(f"Unable to parse business URL. Error: {e}")
        data.append([business_name, business_url, phone_num])

    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'], data=data)

    return df


In [12]:
def get_multi_page_biz_data(key_word, loc, num_results) -> pd.DataFrame:
    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'])
    page_size = 20
    while len(df) < num_results:
        page_num = len(df) // page_size
        print(f"Scraping page {page_num}")
        page_data = get_biz_data(key_word, loc, page_num)
        df = pd.concat([df, page_data], ignore_index=True)
        

        sleep_time = random.randint(1, 3)
        time.sleep(sleep_time)
    print(f"Scraped {len(df)} results")
    return df 

In [13]:
key_word: str = "Security+Camera+Installation"
loc:str = "Washington"
num_results = 10

df = get_multi_page_biz_data(key_word, loc, num_results)
df.head()

Scraping page 0
Scraping URL: https://www.superpages.com/search?search_terms=Security+Camera+Installation&geo_location_terms=Washington&page=0
Unable to parse Phone number. Error: 'NoneType' object has no attribute 'text'
Unable to parse Phone number. Error: 'NoneType' object has no attribute 'text'
Scraped 31 results


Unnamed: 0,business_name,business_url,phone_number
0,ADT - Official Sales Center,https://www.superpages.comhttps://www.superpag...,
1,Vivint,https://www.superpages.comhttps://www.superpag...,
2,ADT - Official Sales Center,https://www.superpages.com/nationwide/bpp/adt-...,888-671-7048Call Now
3,ADT Security Services,https://www.superpages.com/nationwide/bpp/adt-...,800-270-4862Call Now
4,Vivint,https://www.superpages.com/nationwide/bpp/vivi...,833-200-4264Call Now


In [14]:
output_file_path:Path = Path('./data/super_pages.csv')
if output_file_path.parent.exists() == False:
    output_file_path.parent.mkdir(parents=True, exist_ok=True)    

df.to_csv(output_file_path.as_posix(), index=False)
print(f"[{len(df)}] Data saved to {output_file_path.as_posix()}")

[31] Data saved to data/super_pages.csv
