In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import time
import pandas as pd
import csv
import re

In [2]:
# Start a visible (non-headless) Chrome session through splinter, pointing it
# at the chromedriver binary installed at the path below.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [7]:
# Species to search for: "dog" here; switch to "cat" to scrape cats instead.
pet_type = "dog"
# Search-results URL for that species within 50 miles of Chicago, IL.
url = "".join(["https://www.adoptapet.com/", pet_type, "-adoption/search/50/miles/Chicago,%20IL"])
# Load the first page of search results in the browser.
browser.visit(url)

In [8]:
# Take the HTML of the page currently loaded in the browser and parse it so the
# following cells can query it.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [9]:
# Determine how many pages of results there are by reading the pagination
# section at the bottom of the search results.
pager = soup.find(attrs={"data-pagination": "pagination-pager"})
# soup.find() returns None when nothing matches, so guard before dereferencing.
if pager is not None and pager.span is not None:
    # Strip surrounding whitespace so the end-anchored regex below can match.
    pager_text = pager.span.text.strip()
else:
    pager_text = ""

# The page count is taken to be the number at the very end of the pager text.
page_matches = re.findall(r"\d+$", pager_text)
if page_matches:
    num_pages = page_matches[0]
else:
    # No readable pager: say so explicitly and fall back to a single page
    # instead of failing with an opaque AttributeError/IndexError.
    print("Could not read the pagination section; assuming a single page of results")
    num_pages = "1"

# num_pages stays a string of digits; later cells convert it with int().
print("There are " + str(num_pages) + " pages of search results")
# Prefix for the per-page URLs; the page number is appended by the scraping loop.
next_page_url = url + "#current_page="

There are 13 pages of search results


In [None]:
# Write results to a csv file for now.
# newline='' is required by the csv module so it controls line endings itself
# (otherwise extra blank rows can appear on some platforms). The encoding is
# pinned to UTF-8 so the file does not depend on the platform default and can
# be read back by pandas, which assumes UTF-8.
# The file stays open until the later cell that calls f.close().
f = open('output_' + pet_type + '-adoption.csv', 'w', newline='', encoding='utf-8')

# Column order of the CSV: the basic pet fields first, then one column per
# "My Info" trait (filled with 'Yes' by the scraping loop when present).
fieldnames = [
    'pet_name', 'pet_type', 'sex', 'age', 'location', 'link',
    'breed', 'color', 'size', 'weight', 'pet_id', 'hair', 'rescue', 'address',
    'Spayed', 'Neutered', 'Spayed / Neutered', 'Shots Up to Date',
    'Purebred', 'Has Special Needs', 'Declawed',
    'Not Good with Kids', 'Good with Kids', 'House-trained',
    'Good with Cats', 'Not Good with Cats', 'Good with Dogs', 'Not Good with Dogs',
    'Needs Experienced Adopter',
]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()

In [None]:
# Recodes the cat/dog age to one of the age groups (kitten, puppy, young, adult, senior), if needed
age_groups = ["kitten","puppy","young","adult","senior"]
def assign_age_group (pet_type, age_string):
    """Map a raw age string to an age-group label.

    Args:
        pet_type: "cat" or "dog".
        age_string: For cats, text of the form "<number> <unit>", e.g.
            "4 years" or "6 months". For dogs, either "<something>, <group>"
            or just the group itself, e.g. "adult".

    Returns:
        For cats: "senior" (7+ years), "adult" (3-6 years), "young"
        (fewer than 3 years, when the unit is "years") or "kitten" (any other
        unit, e.g. "year" or "months").
        For dogs: the text after the first comma, stripped; if there is no
        comma, the whole string stripped.
        None for any other pet_type.

    Raises:
        IndexError: for a cat age_string with fewer than two tokens.
        ValueError: for a cat age in "years" whose first token is not an integer.
    """
    if pet_type == "cat":
        vals = age_string.split()
        # Drop a trailing comma so "2 years, 3 months" is still read as years.
        unit = vals[1].rstrip(",")
        if unit == "years": # young, adult, or senior
            years = int(vals[0])
            if years >= 7: # senior 7+ years
                return "senior"
            elif years >= 3: # adult 3-6 years
                return "adult"
            else: # young 2 years
                return "young"
        else: # kitten <= 1 year
            return "kitten"
    elif pet_type == "dog":
        parts = age_string.split(",")
        # Use the part after the first comma when there is one; otherwise the
        # string is already just the group (e.g. "adult").
        if len(parts) > 1:
            return parts[1].strip()
        return parts[0].strip()

In [None]:
# Seconds to wait after navigating to a results page before reading its HTML.
PAGE_LOAD_WAIT_SECONDS = 15

# Walk every page of search results, open each listed pet's page, and write one
# CSV row per pet through `writer`.
for page_num in range(1, int(num_pages) + 1):

    next_page_url = url + "#current_page=" + str(page_num)
    print(next_page_url)

    browser.visit(next_page_url)
    # Wait BEFORE taking the snapshot so the results for this page have had
    # time to load; sleeping after reading browser.html cannot help the parse.
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all the pet cards on the current page
    results = soup.find_all('div', class_='pet-card')
    print("There are " + str(len(results)) + " results on Page " + str(page_num) + " of " + str(num_pages))

    # Loop through each result on the current page. There are up to 40 results per page.
    for result in results:
        try:
            # Check that pet page still exists; error cards are reported and skipped.
            pet_error = result.find('div', class_='pet-error')
            if pet_error:
                print(result.a['href'])
                print(pet_error.div.h3.text)
                continue

            pet_name = result.find(attrs={"data-pet-card": "pet-card-heading"}).text.strip()
            # Format location name so that only the first letter of city is capitalized
            location = result.find(attrs={"data-pet-card": "city"}).text.strip().lower().capitalize() \
                        + ", " + result.find(attrs={"data-pet-card": "state"}).text.strip()
            link = result.a['href']

            # visit the link to the pet page
            browser.visit(link)
            p_html = browser.html
            p_soup = BeautifulSoup(p_html, 'html.parser')

            # Store information into a dictionary. Fields left as "" are
            # filled in below when the pet page provides them.
            pet_card = {
                'pet_name': pet_name,
                'pet_type' : pet_type,
                'sex': "",
                'age': "",
                'location': location,
                'link': link,
                'breed' : "",
                'color' : "",
                'size' : "",  # dogs
                'weight' : "", # dogs
                'pet_id' : "",
                'hair' : "",  # cats
                'rescue' : "",
                'address' : ""
            }

            # Gather info from Facts About Me section
            p_facts_section = p_soup.find_all(attrs={"data-pet-detail": "pet-facts-content-section"})

            for item in p_facts_section:
                label = item.find(attrs={"data-pet-detail": "pet-facts-label"})
                value = item.find('div', class_="h4--light")
                if value:
                    # "Pet ID" -> "pet_id", so labels line up with the CSV column names.
                    label_str = label.text.strip().lower().replace(" ", "_")
                    pet_card[label_str] = value.text.strip()

            # Rescue or Private Owner
            shelterinfo_label = p_soup.find('h5', class_='shelterinfo__label').text.strip()
            if shelterinfo_label == 'Rescue':
                # If it's a rescue, get shelter name and location
                pet_card['rescue'] = p_soup.find('h1', class_='shelterinfo__header').text.strip()

                plain_address = p_soup.find('div', class_='gtm-plain-text-address')
                if plain_address:
                    pet_card['address'] = plain_address.text.strip()
                else:
                    pet_card['address'] = p_soup.find('a', class_='gtm-shelter-map').text.strip()
            else:
                # If it's a private owner, pet_rescue is "Private Owner" and pet_address is location
                pet_card['rescue'] = "Private Owner"
                pet_card['address'] = location

            # Gather info from My Info section, if available
            p_info = p_soup.find(attrs={"data-pet-detail": "myinfo-content"})
            if p_info:
                p_info_list = p_info.find_all(attrs={"data-h4": "heading-compact"})
                for item in p_info_list:
                    pet_card[item.text] = 'Yes'

            # DictWriter raises ValueError for keys that are not CSV columns,
            # which previously discarded the whole row. Drop (and report) only
            # the unknown fields so the rest of the pet's data is kept.
            unknown_fields = [key for key in pet_card if key not in writer.fieldnames]
            if unknown_fields:
                print("Dropping fields without a CSV column for " + str(link) + ": " + ", ".join(unknown_fields))
                for key in unknown_fields:
                    del pet_card[key]

            writer.writerow(pet_card)

        except Exception as err:
            # Best effort: report the failing card and carry on with the next one.
            card_link = result.a
            print(card_link['href'] if card_link is not None else "(pet card without a link)")
            print(err)

In [None]:
# Close the CSV output file opened earlier so that all written rows are flushed to it.
f.close()

In [None]:
# Load the scraped results back in for inspection. The name must match the file
# the scraper writes ('output_' + pet_type + '-adoption.csv'); "output_.csv" is
# never created by this notebook. Missing values are replaced with empty strings.
data = pd.read_csv('output_' + pet_type + '-adoption.csv').fillna("")

In [None]:
# Display the loaded results table as the cell output.
data

<b>The following cells were used for testing purposes.</b>

In [12]:
# Testing: retrieve the summary fields of the first pet card found on one
# results page (page 5), using the data-pet-card attributes.
next_page_url = url + "#current_page=" + "5"
print(next_page_url)

browser.visit(next_page_url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# soup.find() returns the first match in the document for each attribute.
pet_name = soup.find(attrs={"data-pet-card": "pet-card-heading"}).text.strip()
sex = soup.find(attrs={"data-pet-card": "sex"}).text.strip()
age = soup.find(attrs={"data-pet-card": "age"}).text.strip()
# Format location name so that only the first letter of city is capitalized
location = soup.find(attrs={"data-pet-card": "city"}).text.strip().lower().capitalize() \
            + ", " + soup.find(attrs={"data-pet-card": "state"}).text.strip()

# `link` used to be printed without being assigned in this cell, so it was
# either a NameError or a stale value from an earlier run. Take it from the
# first pet card of THIS page instead (same lookup the scraping loop uses).
# NOTE(review): presumably this is the same card the fields above come from — confirm.
first_card = soup.find('div', class_='pet-card')
if first_card is not None and first_card.a is not None:
    link = first_card.a['href']
else:
    link = ""

print(pet_name)
print(sex)
print(age)
print(location)
print(link)

https://www.adoptapet.com/dog-adoption/search/50/miles/Chicago,%20IL#current_page=5
Violet
Female
adult
Chicago, IL


In [13]:
# Testing: Visit the link to the pet page and retrieve information
# Alternative pages to try:
#link = "https://www.adoptapet.com/pet/24939053-chicago-illinois-cat"
#link = "https://www.adoptapet.com/pet/25520742-tinley-park-illinois-kitten"
link = "https://www.adoptapet.com/pet/23911259-evergreen-park-illinois-mastiff-mix"
browser.visit(link)
p_html = browser.html
p_soup = BeautifulSoup(p_html, 'html.parser')

# A pet-error element means the pet page is no longer available.
pet_error = p_soup.find('div', class_='pet-error')
if pet_error:
    print(pet_error.div.h3.text)
else:
    pet_name = p_soup.find(attrs={"data-pet-detail": "pet-header-pet-heading"}).span.text

    # Store information into a dictionary.
    # NOTE(review): `location` is not read from this page; it is whatever an
    # earlier cell left in the kernel, so it may not match this pet.
    pet_card = {
        'pet_name': pet_name,
        'sex': "",
        'age': "",
        'location': location,
        'link': link,
        'breed' : "",
        'color' : "",
        'size' : "",  # dogs
        'weight' : "", # dogs
        'pet_id' : "",
        'hair' : "",  # cats
        'rescue' : "",
        'address' : ""
    }

    # Gather info from Facts About Me section
    p_facts_section = p_soup.find_all(attrs={"data-pet-detail": "pet-facts-content-section"})

    for item in p_facts_section:
        label = item.find(attrs={"data-pet-detail": "pet-facts-label"})
        value = item.find('div', class_="h4--light")
        if value:
            # "Pet ID" -> "pet_id", matching the dictionary keys above.
            label_str = label.text.strip().lower().replace(" ", "_")
            pet_card[label_str] = value.text.strip()

    # Rescue or Private Owner
    shelterinfo_label = p_soup.find('h5', class_='shelterinfo__label').text.strip()
    if shelterinfo_label == 'Rescue':
        # If it's a rescue, get shelter name and location
        pet_card['rescue'] = p_soup.find('h1', class_='shelterinfo__header').text.strip()

        plain_address = p_soup.find('div', class_='gtm-plain-text-address')
        if plain_address:
            pet_card['address'] = plain_address.text.strip()
        else:
            pet_card['address'] = p_soup.find('a', class_='gtm-shelter-map').text.strip()
    else:
        # If it's a private owner, pet_rescue is "Private Owner" and pet_address is location
        pet_card['rescue'] = "Private Owner"
        pet_card['address'] = location

    # Gather info from My Info section, if available
    p_info = p_soup.find(attrs={"data-pet-detail": "myinfo-content"})
    if p_info:
        p_info_list = p_info.find_all(attrs={"data-h4": "heading-compact"})
        for item in p_info_list:
            pet_card[item.text] = 'Yes'

    # Print pet dictionary
    print(pet_card)

    # Clear only when a card was actually built: at top level this raised
    # NameError if the error branch ran before any pet_card existed.
    pet_card.clear()

{'pet_name': 'Lil Dude', 'sex': 'Male', 'age': 'adult', 'location': 'Chicago, IL', 'link': 'https://www.adoptapet.com/pet/23911259-evergreen-park-illinois-mastiff-mix', 'breed': 'Mastiff/American Staffordshire Terrier Mix', 'color': 'White - with Tan, Yellow or Fawn', 'size': '(when grown) Large 61-100 lbs (28-45 kg)', 'weight': '', 'pet_id': '607462', 'hair': '', 'rescue': 'Private Owner', 'address': 'Chicago, IL', 'Neutered': 'Yes', 'Shots Up to Date': 'Yes', 'House-trained': 'Yes', 'Good with Dogs': 'Yes', 'Good with Kids': 'Yes'}
