In [1]:
import re
import requests

from bs4 import BeautifulSoup
from bs4.element import Tag

import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
page = requests.get('https://www.akc.org/dog-breeds/')

In [3]:
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
breed_select = soup.find('select', id='breed-search')

In [5]:
tags = [
    tag for tag in breed_select.children if type(tag) is Tag
]

breeds = {
    breed.get_text(): breed['value'] for breed in tags if breed['value']
}

In [10]:
def get_description(breed_soup):
    try:
        first_part = breed_soup.find(
                'div', class_='breed-info__content-wrap'
        ).get_text().strip()
    except:
        first_part = ''
    
    try:
        second_part = breed_soup.find(
                'div', class_='breed-hero__footer'
        ).get_text().strip()
    except:
        second_part = ''
    
    description = ' '.join([first_part, second_part])
    description = description.replace(
        '\n', '').replace('\u200b', '').replace('\xa0', ' ')
    return description

def get_temperament(breed_soup):
    first_part = 'attribute-list__description attribute-list__text '
    second_part = 'attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'
    class_ = first_part + second_part
    return breed_soup.find(
        'span', class_=class_
    ).get_text()

def get_popularity(popularity_span):
    pop_text = popularity_span.get_text()
    return {'popularity': pop_text.split()[1]}

def simple_interval(text, var, mul):
    values = text.split()[0].split('-')
    numbers = [float(value) * mul for value in values]
    if len(numbers) == 1:
        numbers = numbers * 2
    return {
        'min_{}_male'.format(var): numbers[0],
        'max_{}_male'.format(var): numbers[1],
        'min_{}_female'.format(var): numbers[0],
        'max_{}_female'.format(var): numbers[1]
    }

def regex_interval(text, category, unit, mul):
    reg = re.compile('(\d+\.?\d*-?\d*\.?\d*) {} \({}\)'.format(unit, category))
    result = reg.search(text)
    values = result[1].split('-')
    numbers = [float(value) * mul for value in values]
    if len(numbers) == 1:
        numbers = numbers * 2
    return numbers

def gender_interval(text, var, unit, mul):
    try:
        numbers_male = regex_interval(text, 'male', unit, mul)
        numbers_female = regex_interval(text, 'female', unit, mul)
    except:
        numbers_male = regex_interval(text, 'males', unit, mul)
        numbers_female = regex_interval(text, 'females', unit, mul)
    return {
        'min_{}_male'.format(var): numbers_male[0],
        'max_{}_male'.format(var): numbers_male[1],
        'min_{}_female'.format(var): numbers_female[0],
        'max_{}_female'.format(var): numbers_female[1]
    }

def size_interval(text, var, unit, mul):
    numbers = []
    try:
        numbers.extend(regex_interval(text, 'small', unit, mul))
    except:
        pass
    try:
        numbers.extend(regex_interval(text, 'medium', unit, mul))
    except:
        pass
    try:
        numbers.extend(regex_interval(text, 'large', unit, mul))
    except:
        pass
    try:
        numbers.extend(regex_interval(text, 'toy', unit, mul))
    except:
        pass
    try:
        numbers.extend(regex_interval(text, 'miniature', unit, mul))
    except:
        pass
    try:
        numbers.extend(regex_interval(text, 'standard', unit, mul))
    except:
        pass
    
    return {
        'min_{}_male'.format(var): min(numbers),
        'max_{}_male'.format(var): max(numbers),
        'min_{}_female'.format(var): min(numbers),
        'max_{}_female'.format(var): max(numbers)
    }


def general_regex(text, var, mul=1):
    reg = re.compile('(\d+\.?\d*)')
    results = reg.findall(text)
    numbers = [float(value) * mul for value in results]
    if len(numbers) == 1:
        numbers = numbers * 2
    elif len(numbers) == 0:
        numbers = [0, 0]
    return {
        'min_{}'.format(var): min(numbers),
        'max_{}'.format(var): max(numbers)
    }

def get_height(height_span):
    ht_text = height_span.get_text().replace(
        'up to ', '0-'
    ).replace('under ', '0-')  
    return general_regex(ht_text, 'height', 2.54)
    
#     if '(' not in ht_text:
#         return simple_interval(ht_text, 'height', 2.54)
#     elif 'female' in ht_text:
#         return gender_interval(ht_text, 'height', 'inches', 2.54)
#     else:
#         return size_interval(ht_text, 'height', 'inches', 2.54)

def get_weight(weight_span):
    wt_text = weight_span.get_text().replace(
        'up to ', '0-'
    ).replace('under ', '0-')
    return general_regex(wt_text, 'weight', 0.45359237) 
#     if '(' not in wt_text:
#         return simple_interval(wt_text, 'weight', 0.45359237)
#     elif 'female' in wt_text:
#         return gender_interval(wt_text, 'weight', 'pounds', 0.45359237)
#     else:
#         return size_interval(wt_text, 'weight', 'pounds', 0.45359237)

def get_expectancy(expectancy_span):
    exp_text = expectancy_span.get_text()
    return general_regex(exp_text, 'expectancy') 
#     values = exp_text.split()[0].split('-')
#     numbers = [int(value.replace('+', '')) for value in values]
#     if len(numbers) == 1:
#         numbers = numbers * 2
#     return {
#         'min_expectancy': numbers[0],
#         'max_expectancy': numbers[1],
#     }

def get_group(group_span):
    return {'group': group_span.get_text()}

attr_function = {
    'AKC Breed Popularity': get_popularity,
    'Height': get_height,
    'Weight': get_weight,
    'Life Expectancy': get_expectancy,
    'Group': get_group
}


def get_bar_charts(breed_soup):
    titles = breed_soup.find_all(
        'h4', class_='bar-graph__title'
    )
    
    values = breed_soup.find_all(
        'div', class_='bar-graph__section'
    )
    
    categories = breed_soup.find_all(
        'div', class_='bar-graph__text'
    )
    
    bar_dict = {}
    
    for (title, value, category) in zip (titles, values, categories):
        t = title.get_text().lower().replace(' ', '_')
        t = t[t.find('/') + 1:]
        bar_dict[t + '_value'] = float(
            value['style'].split()[1].split('%')[0]
        ) / 100
        bar_dict[t + '_category'] = category.get_text()
    
    return bar_dict


def get_breed_info(url):  
    print(url)
    breed_page = requests.get(url)
    breed_soup = BeautifulSoup(breed_page.content, 'html.parser')
    
    breed_info = {}
    breed_info['description'] = get_description(breed_soup)    
    breed_info['temperament'] = get_temperament(breed_soup)
    
    breed_attr_terms = breed_soup.find_all(
        'span', class_='attribute-list__term attribute-list__text'
    )[1:]
    breed_attr_values = breed_soup.find_all(
        'span', class_='attribute-list__description attribute-list__text'
    )
    
    for term_span, value_span in zip (breed_attr_terms, breed_attr_values):
        term = term_span.get_text().replace(':', '')
        breed_info.update(attr_function[term](value_span))
        
    breed_info.update(get_bar_charts(breed_soup))
    return breed_info
    
    

In [11]:
data = {
    breed: get_breed_info(url) for breed, url in tqdm(breeds.items())
}

HBox(children=(IntProgress(value=0, max=277), HTML(value='')))

https://www.akc.org/dog-breeds/affenpinscher/
https://www.akc.org/dog-breeds/afghan-hound/
https://www.akc.org/dog-breeds/airedale-terrier/
https://www.akc.org/dog-breeds/akita/
https://www.akc.org/dog-breeds/alaskan-malamute/
https://www.akc.org/dog-breeds/american-bulldog/
https://www.akc.org/dog-breeds/american-english-coonhound/
https://www.akc.org/dog-breeds/american-eskimo-dog/
https://www.akc.org/dog-breeds/american-foxhound/
https://www.akc.org/dog-breeds/american-hairless-terrier/
https://www.akc.org/dog-breeds/american-leopard-hound/
https://www.akc.org/dog-breeds/american-staffordshire-terrier/
https://www.akc.org/dog-breeds/american-water-spaniel/
https://www.akc.org/dog-breeds/anatolian-shepherd-dog/
https://www.akc.org/dog-breeds/appenzeller-sennenhund/
https://www.akc.org/dog-breeds/australian-cattle-dog/
https://www.akc.org/dog-breeds/australian-kelpie/
https://www.akc.org/dog-breeds/australian-shepherd/
https://www.akc.org/dog-breeds/australian-stump-tail-cattle-dog/
h

https://www.akc.org/dog-breeds/mountain-cur/
https://www.akc.org/dog-breeds/mudi/
https://www.akc.org/dog-breeds/neapolitan-mastiff/
https://www.akc.org/dog-breeds/nederlandse-kooikerhondje/
https://www.akc.org/dog-breeds/newfoundland/
https://www.akc.org/dog-breeds/norfolk-terrier/
https://www.akc.org/dog-breeds/norrbottenspets/
https://www.akc.org/dog-breeds/norwegian-buhund/
https://www.akc.org/dog-breeds/norwegian-elkhound/
https://www.akc.org/dog-breeds/norwegian-lundehund/
https://www.akc.org/dog-breeds/norwich-terrier/
https://www.akc.org/dog-breeds/nova-scotia-duck-tolling-retriever/
https://www.akc.org/dog-breeds/old-english-sheepdog/
https://www.akc.org/dog-breeds/otterhound/
https://www.akc.org/dog-breeds/papillon/
https://www.akc.org/dog-breeds/parson-russell-terrier/
https://www.akc.org/dog-breeds/pekingese/
https://www.akc.org/dog-breeds/pembroke-welsh-corgi/
https://www.akc.org/dog-breeds/perro-de-presa-canario/
https://www.akc.org/dog-breeds/peruvian-inca-orchid/
https:

AttributeError: 'NoneType' object has no attribute 'get_text'