<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1.-Create-a-list-of-dog-breed's-name-and-their-AKC-webpage" data-toc-modified-id="1.-Create-a-list-of-dog-breed's-name-and-their-AKC-webpage-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>1. Create a list of dog breed's name and their AKC webpage</a></span></li><li><span><a href="#2.-Get-information-of-each-specific-dog-breed" data-toc-modified-id="2.-Get-information-of-each-specific-dog-breed-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>2. Get information of each specific dog breed</a></span></li></ul></div>

## 1. Create a list of dog breed's name and their AKC webpage

In [1]:
# Requirements (use pip to install)
# requests, csv, pandas, bs4, selenium==4.0.0, webdriver-manager

In [2]:
import requests  # for getting html
import time, os, csv
import pandas as pd  
from selenium import webdriver  # for rendering webpage with javascript 
from selenium.webdriver.chrome.service import Service  # webdriver
from webdriver_manager.chrome import ChromeDriverManager  # webddriver
from bs4 import BeautifulSoup  # for web parsing

In [5]:
# Get AKC main page to a string
req = requests.get('https://www.akc.org')
akc_main = req.text


In [6]:
# Convert webpage string to Beutifulsoup object to find the name and link of each breed
soup_main = BeautifulSoup(akc_main, 'html.parser')
breeds = soup_main.find_all('option')  # All breeds' info have the <option> tag
breed_page = {}  # Dictionary key = breed's name, value = breed's page
for dog in breeds[:284]:  # There are total of 283 different types of breeds
    page = dog.get('value')
    name = dog.text
    if name != 'Select A Breed':  # Ignore first element 
        breed_page[name] = page
for name, page in breed_page.items():
    print(name, page)

Affenpinscher https://www.akc.org/dog-breeds/affenpinscher/
Afghan Hound https://www.akc.org/dog-breeds/afghan-hound/
Airedale Terrier https://www.akc.org/dog-breeds/airedale-terrier/
Akita https://www.akc.org/dog-breeds/akita/
Alaskan Klee Kai https://www.akc.org/dog-breeds/alaskan-klee-kai/
Alaskan Malamute https://www.akc.org/dog-breeds/alaskan-malamute/
American Bulldog https://www.akc.org/dog-breeds/american-bulldog/
American English Coonhound https://www.akc.org/dog-breeds/american-english-coonhound/
American Eskimo Dog https://www.akc.org/dog-breeds/american-eskimo-dog/
American Foxhound https://www.akc.org/dog-breeds/american-foxhound/
American Hairless Terrier https://www.akc.org/dog-breeds/american-hairless-terrier/
American Leopard Hound https://www.akc.org/dog-breeds/american-leopard-hound/
American Staffordshire Terrier https://www.akc.org/dog-breeds/american-staffordshire-terrier/
American Water Spaniel https://www.akc.org/dog-breeds/american-water-spaniel/
Anatolian Shep

## 2. Get information of each specific dog breed

In [7]:
# Create an empty dataframe to store data
traits = ['Affectionate With Family', 'Good With Young Children', 'Good With Other Dogs', 
          'Shedding Level', 'Coat Grooming Frequency', 'Drooling Level', 'Coat Type', 'Coat Length', 
          'Openness To Strangers', 'Playfulness Level', 'Watchdog/Protective Nature', 'Adaptability Level', 
          'Trainability Level', 'Energy Level', 'Barking Level', 'Mental Stimulation Needs']
headers = ['Name', 'Height', 'Weight', 'Life Expectancy'] + traits
df_all_breeds = pd.DataFrame(columns = headers)
#print(df_all_breeds)

Empty DataFrame
Columns: [Name, Height, Weight, Life Expectancy, Affectionate With Family, Good With Young Children, Good With Other Dogs, Shedding Level, Coat Grooming Frequency, Drooling Level, Coat Type, Coat Length, Openness To Strangers, Playfulness Level, Watchdog/Protective Nature, Adaptability Level, Trainability Level, Energy Level, Barking Level, Mental Stimulation Needs]
Index: []


In [8]:
# Create an empty csv file with headers to store data
file = 'Sample_data.csv' 
if not os.path.isfile(file):
    with open(file, 'w', newline='') as fileout:
        writer = csv.writer(fileout)
        writer.writerow(headers)

In [9]:
# Function to find height, weight, life expectancy
def findAttributes(page):  #page needs to be a string
    res = []
    for attr in ['Height: ', 'Weight: ', 'Life Expectancy: ']:
        s = page.find(attr)
        e = page.find('<', s)
        res.append(page[s + len(attr):e])
    return res

In [20]:
# Check if the file already has some data
df = pd.read_csv(file, usecols=['Name'])
dogs = df['Name'].values.tolist()
#print(dogs)

# Iterate all breads and perform web scrape 
# Use selenium to scrape Javascript website in Chrome and extract complete HTML
count = 0
for dog, webpage in breed_page.items():
    if dog in dogs:  # skip if we already have this breed's info
        continue
    count += 1
    start_time = time.time()
    res = [dog, ]
    
    options = webdriver.ChromeOptions()  # create a option for chrome
    options.headless = True  # use headless browser
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options = options)
    #driver = webdriver.Chrome(options = options)
    driver.get(webpage) # extract complete HTML to driver.page_source
    time.sleep(0.5)
    
    # Get Height, Weight and Life Expectancy
    soup = BeautifulSoup(driver.page_source, 'lxml')
    attributes = findAttributes(str(soup))
    res.extend(attributes)

    # Find trait scores
    tag = soup.find_all('div', class_ = "accordion__header__content")
    for e in tag:
        # For coat type and coat length, search for check mark
        check_mark = e.find('div', class_ = 'breed-trait-score__choice--selected')
        if check_mark:
            type_ = check_mark.find('span')
            if type_:
                res.append(type_.text)
            continue
            
        # For other traits, search for score-unit--filled (score bars)
        s = e.find_all('div', class_= 'breed-trait-score__score-unit breed-trait-score__score-unit--filled')
        score = len(s)
        res.append(score)
        
    # Handle errors (can also use autofill instead)
    if len(res) < 20:
        print(res)
        print('Invalid data for {}'.format(dog))
        break
        
    # Append new data to dataframe    
    new_data = pd.Series(res, index = df_all_breeds.columns)
    df_all_breeds = df_all_breeds.append(new_data, ignore_index=True)
    
    # Write data to csv file
    with open(file, 'a', newline = '') as fileout:
        writer = csv.writer(fileout)
        writer.writerow(res)

    print('No.{} {} completed after {} sec'.format(count, dog, round(time.time() - start_time, 3)))

driver.quit()




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/cathychen/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No.1 Affenpinscher completed after 28.229 sec



[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/cathychen/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No.2 Afghan Hound completed after 25.89 sec



[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/cathychen/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No.3 Airedale Terrier completed after 26.803 sec



[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/cathychen/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No.4 Akita completed after 93.707 sec



[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/cathychen/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No.5 Yorkshire Terrier completed after 24.619 sec


In [None]:
df_all_breeds

In [1]:
ls

Sample_data.csv    Web_Crawler.ipynb


In [16]:
df = pd.read_csv('Sample_data.csv')

In [17]:
df

Unnamed: 0,Name,Height,Weight,Life Expectancy,Affectionate With Family,Good With Young Children,Good With Other Dogs,Shedding Level,Coat Grooming Frequency,Drooling Level,Coat Type,Coat Length,Openness To Strangers,Playfulness Level,Watchdog/Protective Nature,Adaptability Level,Trainability Level,Energy Level,Barking Level,Mental Stimulation Needs
0,Affenpinscher,9-11.5 inches,7-10 pounds,12-15 years,3,3,3,3,3,1,Wiry,Short,5,3,3,4,3,3,3,3
1,Afghan Hound,25-27 inches,50-60 pounds,12-15 years,3,3,3,1,4,1,Silky,Long,3,3,3,3,1,4,3,3
2,Airedale Terrier,23 inches,50-70 pounds,11-14 years,3,3,3,1,3,1,Wiry,Short,3,3,5,3,3,3,3,3
3,Akita,"26-28 inches (male), 24-26 inches (female)","100-130 pounds (male), 70-100 pounds (female)",10-13 years,3,3,1,3,3,1,Double,Medium,2,3,5,3,3,4,2,3
4,Alaskan Klee Kai,"13 inches and under (Toy), 13-15 inches (Minia...","6-12 lbs (Toy), 10-18 lbs (Miniature), 16-25 l...",13-16 years,3,3,5,3,3,1,Double,Medium,1,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,Wirehaired Vizsla,"23-25 inches (male), 21.5-23 inches (female)","55-65 pounds (male), 45-55 pounds (female)",12-14 years,5,5,3,3,1,2,Wiry,Short,5,5,3,4,5,5,3,5
279,Working Kelpie,19-25 inches,28-60 pounds,12-15 years,5,5,3,3,1,2,Smooth,Short,5,5,4,4,5,5,3,5
280,Xoloitzcuintli,"10-14 inches (toy), 14-18 inches (miniature), ...","10-15 pounds (toy), 15-30 pounds (miniature), ...",13-18 years,5,3,3,1,1,1,Hairless,Short,3,4,3,4,4,4,3,4
281,Yakutian Laika,21-23 inches,40-55 pounds,10-12 years,5,5,4,4,2,3,Double,Medium,1,4,5,4,3,5,4,5
