In [15]:
# Imports for data management
!pip install pandas Unidecode bs4
import pandas as pd
import json

# Import for crawling and scraping
import requests
import time
import random
from bs4 import BeautifulSoup
import re 
from urllib.parse import urlparse, unquote
from typing import List 
from pathlib import Path 




In [16]:
def yelp_biz_urls(key_word, loc, num_results):
    """
    Getting results from yelp
    The output is a list of business urls
    """
    tables = []
    for i in range(0,num_results,10):
        # rate limiting
        time.sleep(random.randint(1, 2) * .931467298)

        url = 'http://www.yelp.com/search?find_desc=%(key_word)s&find_loc=%(loc)s&start=%(num_results)s' % {'key_word':key_word,'loc':loc,'num_results':i}
        print(f"Iteration [{i}] - Searching for [{key_word} in [{loc}] with offset [{i}]. URL: [{url}]")

        page = requests.get(url)
        # table = html.fromstring(unidecode.unidecode(page.content.decode('utf-8')))
        soup = BeautifulSoup(page.content.decode('utf-8'), 'html.parser')
        business_containers = soup.find_all("li", class_="yelp-emotion-1iy1dwt")
        # Iterate over each container to extract business name and URL
        for container in business_containers:
            business_name_element = container.find("h3", class_="yelp-emotion-i7hfd5")
            business_url_element = container.find("a", class_="yelp-emotion-idvn5q")

            if business_name_element and business_url_element:
                business_url = business_url_element["href"]
                prefix = "https://www.yelp.com"
                tables.append(prefix+business_url)
    print(f"Found [{len(tables)}] businesses")
    return tables

In [17]:
def get_yelp_biz_data(business_url) -> pd.DataFrame:
    print(f"Looking into [{business_url}]")
    page = requests.get(business_url)

    soup = BeautifulSoup(page.content.decode('utf-8'), 'html.parser')
    
    business_name = ""
    business_url = ""
    phone_number = ""

    try:
        business_name = soup.find('h1', class_='yelp-emotion-sfde2o').text.strip()
    except Exception as e:
        print(f"Unable to get business name, skipping... Error: {e}. [{business_url}]")

    containers = soup.find_all("section", class_="yelp-emotion-7hi8nk")
    try:
        for container in containers:
            # Extracting business URL
            if container.find('a', class_='yelp-emotion-33t6hm') != None:
                raw_url = container.find('a', class_='yelp-emotion-33t6hm')['href']
                parsed_url = urlparse(raw_url)
                url_params = parsed_url.query.split('&')
                for param in url_params:
                    key, val = param.split("=")
                    if key == 'url':
                        biz_url_encoded = val
                        biz_url = unquote(biz_url_encoded)
                        business_url = biz_url
                        break 
    except Exception as e:
        print(f"Unable to get business url, skipping... Error: {e}. [{business_url}]")
        
    # extracting phone number
    try:
        phone_number = soup.find('p', class_='yelp-emotion-1be33sw', string=re.compile(r'\(\d{3}\) \d{3}-\d{4}')).text.strip()
    except Exception as e:
        print(f"Unable to get phone number, skipping... Error: {e}. [{business_url}]")
    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'], data=[[business_name, business_url, phone_number]])

    return df 
    


In [18]:
def get_multi_business_data(urls: List[str]):
    df = pd.DataFrame(columns=['business_name', 'business_url', 'phone_number'])
    for url in urls:
        row = get_yelp_biz_data(business_url=url)
        df = pd.concat([row, df], ignore_index=True)
    return df

In [19]:
KEYWORD = 'Security Camera Installation'
LOCATION = 'Los Angeles'
NUM = 10
data = yelp_biz_urls(key_word=KEYWORD, loc=LOCATION, num_results=NUM)
if len(data) > 0:
    print(f"Sample: [{data[0]}]")

Iteration [0] - Searching for [Security Camera Installation in [Los Angeles] with offset [0]. URL: [http://www.yelp.com/search?find_desc=Security Camera Installation&find_loc=Los Angeles&start=0]
Found [10] businesses
Sample: [https://www.yelp.com/biz/los-angeles-cctv-security-cameras-los-angeles-4?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]


In [20]:
df = get_multi_business_data(urls=data)
df

Looking into [https://www.yelp.com/biz/los-angeles-cctv-security-cameras-los-angeles-4?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]
Looking into [https://www.yelp.com/biz/three-factor-security-cameras-installation-beverly-hills-2?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]
Looking into [https://www.yelp.com/biz/lion-security-and-locksmith-los-angeles?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]
Looking into [https://www.yelp.com/biz/david-av-installations-los-angeles?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]
Looking into [https://www.yelp.com/biz/security-camera-solutions-burbank-5?osq=Security+Camera+Installation&override_cta=Request+quote+%26+availability]
Unable to get business url, skipping... Error: not enough values to unpack (expected 2, got 1). []
Looking into [https://www.yelp.com/biz/tci-security-cameras-downey-4?osq=Security+Camera+Installat

Unnamed: 0,business_name,business_url,phone_number
0,"TV Hang Ups, TV Mounting & Security Cameras",http://www.tvhangups.com,(213) 640-7330
1,Los Angeles Security Cameras,,(888) 518-6756
2,LA Tech Guys,http://Www.latechguyz.com,(310) 922-2208
3,Gbit Tech,http://www.GbitTech.net,(818) 809-6743
4,TCI Security Cameras,http://www.tci365.com,(562) 644-9339
5,Security Camera Solutions,,(213) 218-6934
6,David AV Installations,http://davidavinstallations.com,(213) 652-2321
7,Lion Security and Locksmith,http://lionsecurityla.com,(310) 930-8843
8,Three Factor Security Cameras Installation,https://threefactorsecurity.com,(310) 289-4964
9,Los Angeles CCTV Security Cameras,http://cctv-losangeles.com,(877) 429-9988


In [22]:
output_file_path:Path = Path('./data/yelp.csv')
if output_file_path.parent.exists() == False:
    output_file_path.parent.mkdir(parents=True, exist_ok=True)    

df.to_csv('data/yelp.csv', index=False)
print(f"[{len(df)}] Data saved to {output_file_path.as_posix()}")

[10] Data saved to data/yelp.csv
