In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import time
import pandas as pd
import csv
import re

In [2]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [3]:
url = "https://www.adoptapet.com/cat-adoption/search/50/miles/60622"
browser.visit(url)

In [4]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Determine how many pages of results by reading the pagination section at the bottom of the search results
num_pages = soup.find(attrs={"data-pagination": "pagination-pager"}).span.text
num_pages = re.findall(r"\d+$",num_pages)[0]
print("There are " + str(num_pages) + " pages of search results")
next_page_url = url + "#current_page="
#print(next_page_url)

There are 13 of search results


In [6]:
# Write results to a csv file for now
f = open('output_cats_june10.csv','w')
fieldnames = ['pet_name', 'pet_gender', 'pet_age', 'location', 'link',\
            'pet_breed','pet_color','pet_id','pet_hair','pet_rescue','pet_address', \
             'Spayed', 'Neutered', 'Shots Up to Date', 'Has Special Needs',\
              'Not Good with Kids', 'Good with Kids', 'House-trained', \
                'Good with Cats', 'Not Good with Cats', 'Good with Dogs', 'Not Good with Dogs',\
             'Needs Experienced Adopter']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()

In [9]:
for page_num in range(1,int(num_pages)+1):

    next_page_url = url + "#current_page=" + str(page_num)
    print(next_page_url)
    
    browser.visit(next_page_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
    # Retrieve all the pet cards on the current page
    results = soup.find_all('div', class_='pet-card')
    print("There are " + str(len(results)) + " results on Page " + str(page_num) + " of " + str(num_pages))

    # Loop through first page of results. There are up to 40 results per page.
    for result in results:
        try:        
            pet_name = result.find(attrs={"data-pet-card": "pet-card-heading"}).text.strip()
            gender = result.find(attrs={"data-pet-card": "sex"}).text.strip()
            age = result.find(attrs={"data-pet-card": "age"}).text.strip()
            location = result.find(attrs={"data-pet-card": "city"}).text.strip() \
                        + ", " + result.find(attrs={"data-pet-card": "state"}).text.strip()
            link = result.a['href']  
            print(link)

            # visit the link to the pet page
            browser.visit(link)
            p_html = browser.html
            p_soup = BeautifulSoup(p_html, 'html.parser')

            p_facts = p_soup.find_all('div', class_='pet-facts__content')
            # Gather info from Facts About Me section
            pet_breed = p_facts[0].find_all('div', class_="h4--light")[0].text
            pet_color = p_facts[0].find_all('div', class_="h4--light")[1].text
            pet_age = p_facts[0].find_all('div', class_="h4--light")[2].span.text
            pet_gender = p_facts[0].find_all('div', class_="h4--light")[3].text
            pet_id = p_facts[0].find_all('div', class_="h4--light")[4].text
            pet_hair = p_facts[0].find_all('div', class_="h4--light")[5].text

            # Rescue or Private Owner
            shelterinfo_label = p_soup.find('h5', class_='shelterinfo__label').text.strip()
            if(shelterinfo_label == 'Rescue'):
                # If it's a rescue, get shelter name and location
                pet_rescue = p_soup.find('a', class_='formgroup__value').text
                if(p_soup.find('div', class_='gtm-plain-text-address')):
                    pet_address = p_soup.find('div', class_='gtm-plain-text-address').text.strip()
                else:
                    pet_address = p_soup.find('a', class_='gtm-shelter-map').text.strip()
            else:
                # If it's a private owner, pet_rescue is none and pet_address is location
                pet_rescue = p_soup.find('h1', class_='shelterinfo__header').text.strip()
                pet_address = location

            pet_card = {
                'pet_name': pet_name,
                'pet_gender': pet_gender,
                'pet_age': pet_age,
                'location': location,
                'link': link,
                'pet_breed' : pet_breed,
                'pet_color' : pet_color,
                'pet_id' : pet_id,
                'pet_hair' : pet_hair,
                'pet_rescue' : pet_rescue,
                'pet_address' : pet_address
            }

            # Gather info from My Info section if available
            p_info = p_soup.find(attrs={"data-pet-detail": "myinfo-content"})
            if(p_info):
                p_info_list = p_info.find_all(attrs={"data-h4": "heading-compact"})
                for item in p_info_list:
                    pet_card[item.text] = 'Yes'   

            writer.writerow(pet_card)
            pet_card.clear()

        except Exception as err:
            print(err)
            pass

https://www.adoptapet.com/cat-adoption/search/50/miles/60622#current_page=1
There are 40 results on Page 1 of 13
https://www.adoptapet.com/pet/25092297-chicago-illinois-cat
https://www.adoptapet.com/pet/24747369-chicago-illinois-cat
https://www.adoptapet.com/pet/24939053-chicago-illinois-cat
https://www.adoptapet.com/pet/25307385-chicago-illinois-cat
https://www.adoptapet.com/pet/25307384-chicago-illinois-cat
https://www.adoptapet.com/pet/23942811-chicago-illinois-cat
https://www.adoptapet.com/pet/25274523-chicago-illinois-cat
https://www.adoptapet.com/pet/25471074-chicago-illinois-cat
https://www.adoptapet.com/pet/24654740-chicago-illinois-cat
https://www.adoptapet.com/pet/24654762-chicago-illinois-kitten
https://www.adoptapet.com/pet/24654774-chicago-illinois-cat
https://www.adoptapet.com/pet/21239179-chicago-illinois-cat
https://www.adoptapet.com/pet/21239139-chicago-illinois-cat
https://www.adoptapet.com/pet/21269354-chicago-illinois-cat
https://www.adoptapet.com/pet/20947327-chica

IndexError: list index out of range

In [None]:
f.close()

<b> The following was used for testing purposes.</b>

In [None]:
# Testing: Retrieving information for first one
# pet_name = results[0].find('h4', class_="pet-card__heading").text.strip()
# gender = results[0].find_all('span', class_="pet-card__content--comma")[0].text.strip()
# age = results[0].find_all('span', class_="pet-card__content--comma")[1].text.strip()
# location = results[0].find_all('span', class_="pet-card__content--comma")[2].text \
#             + ", " + results[0].find_all('span', class_="pet-card__content--comma")[3].text

pet_name = soup.find(attrs={"data-pet-card": "pet-card-heading"}).text.strip()
gender = soup.find(attrs={"data-pet-card": "sex"}).text.strip()
age = soup.find(attrs={"data-pet-card": "age"}).text.strip()
location = soup.find(attrs={"data-pet-card": "city"}).text.strip() \
            + ", " + soup.find(attrs={"data-pet-card": "state"}).text.strip()
link = results[0].a['href']    
print(pet_name)
print(gender)
print(age)
print(location)
print(link)

In [None]:
# Testing: Visit the link to the pet page and retrieve information
#link = "https://www.adoptapet.com/pet/24939053-chicago-illinois-cat"
#link = "https://www.adoptapet.com/pet/24038702-chicago-illinois-cat"
#link = "https://www.adoptapet.com/pet/16859197-chicago-illinois-cat"
link = "https://www.adoptapet.com/pet/25367945-cicero-illinois-cat"
browser.visit(link)
p_html = browser.html
p_soup = BeautifulSoup(p_html, 'html.parser')

pet_name = p_soup.find(attrs={"data-pet-detail": "pet-header-pet-heading"}).span.text

p_facts = p_soup.find_all('div', class_='pet-facts__content')
#print(p_facts)
pet_breed = p_facts[0].find_all('div', class_="h4--light")[0].text
pet_color = p_facts[0].find_all('div', class_="h4--light")[1].text
pet_age = p_facts[0].find_all('div', class_="h4--light")[2].span.text
pet_gender = p_facts[0].find_all('div', class_="h4--light")[3].text
pet_id = p_facts[0].find_all('div', class_="h4--light")[4].text
pet_hair = p_facts[0].find_all('div', class_="h4--light")[5].text

# Rescue or Private Owner
shelterinfo_label = p_soup.find('h5', class_='shelterinfo__label').text.strip()
if(shelterinfo_label == 'Rescue'):
    # If it's a rescue, get shelter name and location
    pet_rescue = p_soup.find('h1', class_='shelterinfo__header').text.strip()
    
    if(p_soup.find('div', class_='gtm-plain-text-address')):
        pet_address = p_soup.find('div', class_='gtm-plain-text-address').text.strip()
    else:
        pet_address = p_soup.find('a', class_='gtm-shelter-map').text.strip()
else:
    # If it's a private owner, pet_rescue is none and pet_address is location
    pet_rescue = p_soup.find('h1', class_='shelterinfo__header').text.strip()
    pet_address = location

# Store information into a dictionary
pet_card = {
    'pet_name': pet_name,
    'pet_gender': pet_gender,
    'pet_age': pet_age,
    'location': location,
    'link': link,
    'pet_breed' : pet_breed,
    'pet_color' : pet_color,
    'pet_id' : pet_id,
    'pet_hair' : pet_hair,
    'pet_rescue' : pet_rescue,
    'pet_address' : pet_address
}
#print(pet_card)
p_info = p_soup.find(attrs={"data-pet-detail": "myinfo-content"})
#print(p_info)
if(p_info):
    p_info_list = p_info.find_all(attrs={"data-h4": "heading-compact"})
    for item in p_info_list:
        pet_card[item.text] = 'Yes'
        #print(item.text)

# Print pet dictionary
print(pet_card)