In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

# Helper: text of the first matching child element, or None when it is absent
def _text_or_none(parent, tag, css_class):
    """Return the stripped text of the first ``tag`` with ``css_class``, else None."""
    node = parent.find(tag, class_=css_class)
    return node.get_text(strip=True) if node is not None else None


# Function to extract data from the page
def extract_company_data(url, timeout=10):
    """Download one search-results page and parse the company listings on it.

    Args:
        url: Full URL of the results page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A list with one dict per ``<div class="result">`` element, keyed by
        'Company Name', 'Website URL', 'Contact Number', 'Location/Address',
        'Industry/Category', 'Company Description' and 'Email Address'.
        A field whose element is missing from the listing is None.
        Returns None when the page could not be retrieved (network error or
        a status code other than 200).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    # Network failures are reported the same way as a bad status code
    # (message + None) instead of aborting the caller's page loop.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    companies = []

    # Loop over the companies on Yellow Pages
    for company in soup.find_all('div', class_='result'):
        # .get() avoids a KeyError when the anchor exists but has no href.
        website_link = company.find('a', class_='track-visit-website')
        website_url = website_link.get('href') if website_link is not None else None

        companies.append({
            'Company Name': _text_or_none(company, 'a', 'business-name'),
            'Website URL': website_url,
            'Contact Number': _text_or_none(company, 'div', 'phones phone primary'),
            'Location/Address': _text_or_none(company, 'div', 'street-address'),
            'Industry/Category': "IT Services",  # Based on search query
            'Company Description': _text_or_none(company, 'div', 'snippet'),
            'Email Address': None,  # Email is rarely available directly on Yellow Pages
        })

    return companies

# Function to save data in a CSV file
def save_to_csv(companies, filename="company_data.csv"):
    """Write the company records to ``filename`` as a UTF-8 CSV file.

    The file is overwritten. A header row is written first, followed by one
    row per dict in ``companies``; the dicts are keyed by the column names
    listed below.
    """
    columns = [
        'Company Name',
        'Website URL',
        'Contact Number',
        'Location/Address',
        'Industry/Category',
        'Company Description',
        'Email Address',
    ]

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(companies)

# Main scraping function
def scrape_yellowpages(base_url, num_pages=3, filename="company_data.csv"):
    """Scrape consecutive result pages and save every listing to a CSV file.

    Args:
        base_url: Search URL without the ``page`` parameter.
        num_pages: Number of result pages to fetch, starting at page 1.
        filename: Path of the CSV file to write.

    Pages that fail to download or contain no listings are skipped. The CSV
    file is written even when nothing was collected (header row only).
    """
    all_companies = []

    # Append the page parameter correctly whether or not base_url already
    # carries a query string.
    separator = '&' if '?' in base_url else '?'

    for page in range(1, num_pages + 1):
        url = f"{base_url}{separator}page={page}"
        print(f"Scraping page {page}: {url}")

        companies = extract_company_data(url)

        if companies:
            all_companies.extend(companies)

        # Sleep between requests to avoid being blocked; there is nothing to
        # wait for once the last page has been fetched.
        if page < num_pages:
            time.sleep(random.uniform(2, 5))

    # Save the data to a CSV file
    save_to_csv(all_companies, filename)

# Example usage: scrape the default number of result pages for this search
base_url = "https://www.yellowpages.com/search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY"
scrape_yellowpages(base_url=base_url)


Scraping page 1: https://www.yellowpages.com/search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=1
Scraping page 2: https://www.yellowpages.com/search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=2
Scraping page 3: https://www.yellowpages.com/search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=3
