Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import json

Sporting Group List

In [25]:
urls = [
    "https://www.akc.org/dog-breeds/sporting/",
    "https://www.akc.org/dog-breeds/sporting/page/2/",
    "https://www.akc.org/dog-breeds/sporting/page/3/"
]

dog_breeds = []

#Scrape page header title and description
response = requests.get(urls[0])
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    description = soup.find("div", class_="read-more__long").p.text.strip()
    page_header_title = soup.find("h1", class_="page-header__title").text.strip()
else:
    print(f"Failed to retrieve the web page ({urls[0]}). Status code: {response.status_code}")

# Scrape the list of dog breeds from all pages
for url in urls:
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        dog_breeds.extend([breed.text.strip() for breed in soup.find_all("h3", class_="breed-type-card__title")])
    else:
        print(f"Failed to retrieve the web page ({url}). Status code: {response.status_code}")

# Create a dictionary to store the scraped information
scraped_data = {
    "page_header_title": page_header_title,
    "description": description,
    "dog_breeds": dog_breeds
}

# Convert the dictionary to JSON format
json_data = json.dumps(scraped_data, indent=4)

# Print the JSON data
print(json_data)

{
    "page_header_title": "Sporting Group",
    "description": "Naturally active and alert, Sporting dogs make likeable, well-rounded companions. First developed to work closely with hunters to locate and/or retrieve quarry. There are four basic types of Sporting dogs; spaniels, pointers, retrievers and setters. Known for their superior instincts in water and woods, many of these breeds enjoy hunting and other field activities. Many of them, especially the water-retrieving breeds, have well \u2013insulated water repellant coats, which are quite resilient to the elements. Thinking of getting one? Just realize that most require regular, invigorating exercise.",
    "dog_breeds": [
        "American Water Spaniel",
        "Barbet",
        "Boykin Spaniel",
        "Bracco Italiano",
        "Brittany",
        "Chesapeake Bay Retriever",
        "Clumber Spaniel",
        "Cocker Spaniel",
        "Curly-Coated Retriever",
        "English Cocker Spaniel",
        "English Setter",
   

Hound Group

In [6]:
urls = [
    "https://www.akc.org/dog-breeds/hound/",
    "https://www.akc.org/dog-breeds/hound/page/2/",
    "https://www.akc.org/dog-breeds/hound/page/3/"
]

dog_breeds = []

#Scrape page header title and description
response = requests.get(urls[0])
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    description = soup.find("div", class_="read-more__long").p.text.strip()
    page_header_title = soup.find("h1", class_="page-header__title").text.strip()
else:
    print(f"Failed to retrieve the web page ({urls[0]}). Status code: {response.status_code}")

# Scrape the list of dog breeds from all pages
for url in urls:
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        dog_breeds.extend([breed.text.strip() for breed in soup.find_all("h3", class_="breed-type-card__title")])
    else:
        print(f"Failed to retrieve the web page ({url}). Status code: {response.status_code}")

# Create a dictionary to store the scraped information
scraped_data = {
    "page_header_title": page_header_title,
    "description": description,
    "dog_breeds": dog_breeds
}

# Convert the dictionary to JSON format
json_data = json.dumps(scraped_data, indent=4)

# Print the JSON data
print(json_data)

{
    "page_header_title": "Hound Group",
    "description": "Most hounds share the common ancestral trait of being used for hunting. Some use acute scenting powers to follow a trail. Others demonstrate a phenomenal gift of stamina as they relentlessly run down quarry. Beyond this, however, generalizations about hounds are hard to come by, since the Group encompasses quite a diverse lot. There are Pharaoh Hounds, Norwegian Elkhounds, Afghans and Beagles, among others. Some hounds share the distinct ability to produce a unique sound known as baying. You'd best sample this sound before you decide to get a hound of your own to be sure it's your cup of tea.",
    "dog_breeds": [
        "Afghan Hound",
        "American English Coonhound",
        "American Foxhound",
        "Azawakh",
        "Basenji",
        "Basset Hound",
        "Beagle",
        "Black and Tan Coonhound",
        "Bloodhound",
        "Bluetick Coonhound",
        "Borzoi",
        "Cirneco dell\u2019Etna",
      

In [31]:
def scrape_akc_data(group_names):
    all_data = []

    for group_name in group_names:
        # Fetch description
        response = requests.get(f"https://www.akc.org/dog-breeds/{group_name}/")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            description = soup.find("div", class_="read-more__long").p.text.strip()
        else:
            print(f"Failed to retrieve the web page ({group_name}). Status code: {response.status_code}")
            description = "Description not available."

        # Fetch dog breeds
        dog_breeds = []
        for page_num in range(1, 4):
            url = f"https://www.akc.org/dog-breeds/{group_name}/page/{page_num}/" if page_num > 1 else f"https://www.akc.org/dog-breeds/{group_name}/"
            response = requests.get(url)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                breed_tiles = soup.find_all("h3", class_="breed-type-card__title")
                if not breed_tiles:
                    break  # No more breeds on the page

                dog_breeds.extend([breed.text.strip() for breed in breed_tiles])
            elif response.status_code == 404:
                print(f"Page not found: {url}")
            else:
                print(f"Failed to retrieve the web page ({url}). Status code: {response.status_code}")
                break

        # Create a dictionary for this group
        group_data = {
            "page_header_title": f"{group_name.capitalize()} Group",
            "description": description,
            "dog_breeds": dog_breeds
        }

        all_data.append(group_data)

    # Save the combined data to a JSON file
    with open("akc_breed_groups.json", "w") as json_file:
        json.dump(all_data, json_file, indent=4)

In [32]:
# List of group names
group_names = ['sporting', 'hound', 'working', 'terrier', 'toy', 'non-sporting', 'herding', 
               'miscellaneous-class', 'smallest-dog-breeds', 'medium-dog-breeds', 'largest-dog-breeds',
               'smartest-dogs', 'hypoallergenic-dogs', 'best-family-dogs', 'best-guard-dogs', 
               'best-dogs-for-kids', 'best-dogs-for-apartment-dwellers', 'hairless-dog-breeds']

# Scrape data for all groups and save to JSON
scrape_akc_data(group_names)

Page not found: https://www.akc.org/dog-breeds/toy/page/3/
Page not found: https://www.akc.org/dog-breeds/non-sporting/page/3/
Page not found: https://www.akc.org/dog-breeds/miscellaneous-class/page/2/
Page not found: https://www.akc.org/dog-breeds/miscellaneous-class/page/3/
Page not found: https://www.akc.org/dog-breeds/hairless-dog-breeds/page/2/
Page not found: https://www.akc.org/dog-breeds/hairless-dog-breeds/page/3/
