# Imports

In [63]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Constant

In [25]:
PAGE_URL = 'https://www.zocdoc.com/search?address=New%20Jersey&offset=0'

# Model

In [38]:
class Doctor:
    def __init__(self, name=None, job_title=None, rating=None, location_name=None, street_address=None, 
                 address_locality=None, region=None, postal_code=None):
        self.name = name
        self.job_title = job_title
        self.rating = rating
        self.location_name = location_name
        self.street_address = street_address
        self.address_locality = address_locality
        self.region = region
        self.postal_code = postal_code

    def __str__(self):
        return (f"Dr Name: {self.name}\n"
                f"Job Title: {self.job_title}\n"
                f"Rating: {self.rating}\n"
                f"Location Name: {self.location_name}\n"
                f"Street Address: {self.street_address}\n"
                f"Address Locality: {self.address_locality}\n"
                f"Region: {self.region}\n"
                f"Postal Code: {self.postal_code}\n")

# Utils

In [41]:
def extract_page_articles(offset=0):
    # Page url
    page_url = f'https://www.zocdoc.com/search?address=New%20Jersey&offset={0}'
    
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
    }

    # Set cookies
    cookies = {
        'firstTimeVisitor': '1e8d84b5-507c-4995-9b3d-ecceb5f5b7a0',
        'originalReferrer': 'NONE',
        'mostRecentReferrer': 'NONE',
        'ABG': '51d7cb0f-81e6-4de7-a312-f77bf654f713',
        'abfp': '1',
        'ASP.NET_SessionId': 'kx51dmlvaxdmmvmfzmphbcdj',
        'SearchLocation': '{}',
        'address_cookie': 'New%20Jersey',
        'hCarrierId': '-1',
        'hPlanId': '-1',
        'isDefaultHPlan': 'false',
        'previouslySearchedProcedureIdForTriage': '75',
        'AWSALB': 'HObpfc8Sa7mQJ8zG9jT0gQSuQt+dYCqS9Y19NgsQLGyle51juuYSb6Sr6IPwpVZ3X/zSI4fxdslkAgSTvroXPfDjKENCLZDoKs75FIL56y9dkWZ3r+dyo3LFaqOY',
        'AWSALBCORS': 'HObpfc8Sa7mQJ8zG9jT0gQSuQt+dYCqS9Y19NgsQLGyle51juuYSb6Sr6IPwpVZ3X/zSI4fxdslkAgSTvroXPfDjKENCLZDoKs75FIL56y9dkWZ3r+dyo3LFaqOY',
        'isNewPatient': '2024-05-21T16:56:40.858Z',
        'bsid': '34a56faf23ff41118512ab6899df08df_2405201729',
        'datadome': 'IjP62X1LUaqRPrUbS08MjZcqf3muuPlrruMbfuRw7POQRDrBvze~XpYpSuFgwMNSado4sxWT_6cOJXfu0347T0bHJbByvbCl2u4Jgpl8gBBMnkO0sR6iX70TFgf5pwQM',
        'lux_uid': '171622614456588604',
        'referrer_34a56faf23ff41118512ab6899df08df': 'www.zocdoc.com%2F',
    }


    # Send a GET request to the website with headers
    response = requests.get(page_url, headers=headers, cookies=cookies)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <article> tags with the attribute data-location-id
    articles = soup.find_all('article', attrs={'data-location-id': True})
    
    return articles


def extract_practioners_from_page_articles(articles):
    practioners = []

    for article in articles:
        dr_name_tag = article.find('a', class_='sc-eBwKMn cKgoOg', attrs={'data-test': 'doctor-card-info-name-full'})
        if not dr_name_tag:
            dr_name_tag = article.find('a', class_='sc-eBwKMn jpEUfH', attrs={'data-test': 'doctor-card-info-name-full'})

        dr_name = dr_name_tag.text.strip() if dr_name_tag else None

        try:
            job_title = article.find('div', class_='sc-fatcLD issRnF', itemprop='jobTitle').find('span').text.strip()
        except AttributeError:
            job_title = None

        try:
            rating = article.find('div', class_='sc-fe0b3-18 gbYLNz').text.strip()
        except AttributeError:
            rating = None

        try:
            location_name = article.find('span', {'data-test': 'doctor-card-info-location-name', 'itemprop': 'alternateName'}).text.strip()
        except AttributeError:
            location_name = None

        try:
            street_address = article.find('span', {'data-test': 'doctor-card-info-location-address', 'itemprop': 'streetAddress'}).text.strip()
        except AttributeError:
            street_address = None

        try:
            address_locality = article.find('span', {'data-test': 'doctor-card-info-location-city', 'itemprop': 'addressLocality'}).text.strip()
        except AttributeError:
            address_locality = None

        try:
            region = article.find('span', {'data-test': 'doctor-card-info-location-state', 'itemprop': 'addressRegion'}).text.strip()
        except AttributeError:
            region = None

        try:
            postal_code = article.find('span', {'data-test': 'doctor-card-info-location-zip', 'itemprop': 'postalCode'}).text.strip()
        except AttributeError:
            postal_code = None
            
        practioners.append(
            Doctor(
                name=dr_name,
                job_title=job_title,
                rating=rating,
                location_name=location_name,
                street_address=street_address,
                address_locality=address_locality,
                region=region,
                postal_code=postal_code
            )
        )
        
    return practioners

# Code

### Step 1: Scrape articles tag from every page

In [53]:
page_articles = []

for i in range(0,11):
    articles = extract_page_articles()
    page_articles.append(articles)

### Step 2: Iterate each page article and scrape practioners from it

In [57]:
practioners = []

for page_article in page_articles:
    practioners += extract_practioners_from_page_articles(page_article)

In [62]:
print(practioners[-1].__str__())

Dr Name: Dr. Elisabeth Mason, DO
Job Title: Primary Care Doctor
Rating: 4.80
Location Name: 40 Bey Lea Rd - Ste C103 -
Street Address: 40 Bey Lea Rd, Ste C103
Address Locality: Toms River
Region: NJ
Postal Code: 08701



### Step 3: Dump scraped data into pandas dataframe

In [65]:
# Convert the list of Doctor objects to a list of dictionaries
practioners_dicts = [practioner.__dict__ for practioner in practioners]

# Create a Pandas DataFrame from the list of dictionaries
df = pd.DataFrame(practioners_dicts)

In [66]:
df

Unnamed: 0,name,job_title,rating,location_name,street_address,address_locality,region,postal_code
0,"Dr. Angela Lee, DO",Primary Care Doctor,4.84,,,,,
1,"Gino Cortes, NP",Nurse Practitioner,5.00,Jackson Primary Care -,"260 N County Line Rd, Unit 112-113",Jackson,NJ,08527
2,"Dr. Kerollos Askander, MD",Family Physician,4.92,,"4013 Rt 9, Ste 1N",Howell,NJ,07731
3,"Dr. Heather Robberson, MD",Primary Care Doctor,4.94,475 Hwy 70 - Ste 104 -,"475 Hwy 70, Ste 104",Lakewood,NJ,08701
4,"Dr. Sneha Sreekumar, MD",Primary Care Doctor,4.97,,3520 US Highway 9,Freehold,NJ,07728
...,...,...,...,...,...,...,...,...
193,"Dr. Amrita Singh, MD",Primary Care Doctor,4.77,,,,,
194,"Dr. Tayyab Malik, MD",Primary Care Doctor,4.90,,"855 Valley Rd, Ste 112",Clifton,NJ,07013
195,"Dr. Bhavna Verma, MD",Internist,4.93,,"953 Fischer Blvd, Ste 2",Toms River,NJ,08755
196,"Dr. Melvina B Patel, MD",Primary Care Doctor,4.73,,,,,


In [67]:
df.to_csv('zocdoc_practioners_new_jersey.csv')