In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [2]:
# Parallel lists of city names and state codes. Note that the scrape loop
# further down rebinds the names `city` and `state` to individual strings.
city = ['Dallas', 'Richardson', 'San Jose']
state = ['TX', 'TX', 'CA']

# Fetch a single results page for interactive exploration in the next cells.
# requests applies no timeout by default, so one is passed explicitly to keep
# the cell from hanging forever if the server never answers.
r = requests.get(
    'https://www.yellowpages.com/search?search_terms=Indian+Restaurants&geo_location_terms=Dallas%2C+TX',
    timeout=30,
)

In [3]:
# Parse the raw response bytes of the sample page with the lxml parser.
soup = BeautifulSoup(r.content, features='lxml')

In [4]:
# Collect the target and the visible text of every anchor on the sample page.
# hrefs[i] and texts[i] always describe the same anchor; `count` tallies the
# anchors that were skipped because they carry no href attribute.
hrefs = []
texts = []
count = 0
for link in soup.find_all("a"):
    href = link.get("href")
    if href is None:
        # Explicit check instead of a bare `except:` so that unrelated errors
        # (or a KeyboardInterrupt) are not silently counted as missing hrefs.
        count += 1
        continue
    hrefs.append(href)
    texts.append(link.text)

In [5]:
# Every <div class="info"> element on the sample page.
info_data = soup.find_all("div", class_="info")

In [6]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. requests itself applies no
        timeout by default, which would let a stalled connection block the
        whole scrape; 30 seconds is used unless the caller overrides it.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser. The HTTP status
        code is not inspected, so an error page is parsed like any other.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (for example on connection failure
        or when ``timeout`` elapses).
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'lxml')

In [7]:
def _first_text(node, tag, attrs):
    """Return the text of the first ``tag`` under ``node`` matching ``attrs``, or None if absent."""
    if node is None:
        return None
    try:
        return node.find_all(tag, attrs)[0].text
    except (IndexError, AttributeError):
        # IndexError: nothing matched. AttributeError: ``node`` is not
        # searchable (no ``find_all``). Anything else is a real error and is
        # allowed to propagate instead of being silently turned into None.
        return None


def get_address_info(soup, category):
    """Extract one row per ``<div class="info">`` listing found in ``soup``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed search-results page.
    category : str
        Label stored verbatim in the ``Category`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``Business``, ``Street``,
        ``Locality``, ``Region``, ``Postal Code``, ``Phone Number`` and
        ``Category``. Address and phone fields that cannot be found are
        stored as ``None``. A page without listings yields an empty frame.

    Raises
    ------
    IndexError, AttributeError
        If a listing has no ``<a class="business-name">`` in its first child;
        the business name is treated as mandatory.
    """
    data = []
    for item in soup.find_all("div", {"class": "info"}):
        # The name is looked up in the first child of the listing and is the
        # only field whose absence is an error.
        business_name = item.contents[0].find_all("a", {'class': 'business-name'})[0].text

        # All remaining fields are looked up in the second child; a listing
        # without one simply gets None for each of them.
        try:
            details = item.contents[1]
        except IndexError:
            details = None

        locality = _first_text(details, "span", {'itemprop': 'addressLocality'})
        if locality is not None:
            # Drop the comma(s) embedded in the locality text.
            locality = locality.replace(',', '')

        data.append({
            'Business': business_name,
            'Street': _first_text(details, "span", {'itemprop': 'streetAddress'}),
            'Locality': locality,
            'Region': _first_text(details, "span", {'itemprop': 'addressRegion'}),
            'Postal Code': _first_text(details, "span", {'itemprop': 'postalCode'}),
            'Phone Number': _first_text(details, "div", {"class": "primary"}),
            'Category': category,
        })
    return pd.DataFrame(data)

In [8]:
def get_additional_page_info(objSoup, category):
    """Scrape the follow-up result pages linked from a results page.

    Parameters
    ----------
    objSoup : BeautifulSoup
        Parsed results page whose ``<div class="pagination">`` block is read.
    category : str
        Label forwarded to ``get_address_info`` for every fetched page.

    Returns
    -------
    list of pandas.DataFrame
        One frame per fetched page, in link order. Empty when the page has no
        pagination block or no usable links.

    Notes
    -----
    Prints each fetched URL and sleeps one second after every request.
    """
    pagination = objSoup.find_all("div", {"class": "pagination"})
    if not pagination:
        # No pagination block (presumably a single-page result set): there is
        # nothing further to fetch, so return instead of failing on [0].
        return []

    addresses = []
    # The last anchor is deliberately left out, as in the original logic.
    # NOTE(review): presumably this is the "Next" link - confirm against the markup.
    for link in pagination[0].find_all("a")[:-1]:
        href = link.get("href")
        if not href:
            # An anchor without a target cannot be turned into a URL.
            continue
        soup_url = 'https://www.yellowpages.com' + href
        soup_obj = get_soup(soup_url)
        addresses.append(get_address_info(soup_obj, category))
        print(soup_url)
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
    return addresses

In [11]:
# Search matrix: every search term is run against every (city, state) pair.
# The values are inserted into the query string as-is, so they are written
# here already URL-encoded ('San%20Jose', 'Indian+Restaurants').
cities = ['Dallas', 'Richardson', 'San%20Jose']
states = ['TX', 'TX', 'CA']
search_terms = ['Indian+Restaurants', 'Walmart']

# One DataFrame per scraped page, in scrape order.
frames = []

for term in search_terms:
    for city, state in zip(cities, states):
        url = 'https://www.yellowpages.com/search?search_terms=' + term + '&geo_location_terms=' + city + ',' + state
        soup = get_soup(url)
        # First results page, then every page linked from its pagination block.
        frames.append(get_address_info(soup, category=term))
        frames.extend(get_additional_page_info(soup, category=term))
        print(url)
        # Pause between searches to avoid hammering the server.
        time.sleep(1)

# Each per-page frame carries its own 0..k index; ignore_index=True replaces
# those repeated labels with a single unique 0..n-1 index for the whole result.
df = pd.concat(frames, ignore_index=True)

https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Dallas%2CTX&page=2
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Dallas%2CTX&page=3
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Dallas%2CTX&page=4
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Dallas%2CTX&page=5
https://www.yellowpages.com/search?search_terms=Indian+Restaurants&geo_location_terms=Dallas,TX
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Richardson%2CTX&page=2
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Richardson%2CTX&page=3
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Richardson%2CTX&page=4
https://www.yellowpages.com/search?search_terms=Indian%20Restaurants&geo_location_terms=Richardson%2CTX&page=5
https://www.yellowpages.com/sear

In [12]:
# Sanity check: (rows, columns) of the combined scrape result.
df.shape

(825, 7)

In [13]:
# Persist the combined result to a CSV file in the current working directory.
# With pandas' defaults the DataFrame index is written as the first column.
df.to_csv('Yellow-Pages-Scrape.csv')