# Preparation

In [None]:
import csv
import json
import random
import re
import time

from curl_cffi import requests
from numpy import mean

In [None]:
def random_delay(min: int = 5, max: int = 10, log: bool = False, indents: int = 0) -> None:
    """Sleep for a random duration drawn uniformly between ``min`` and ``max`` seconds.

    Args:
        min: One bound of the delay, in seconds.
        max: The other bound of the delay, in seconds.
        log: When truthy, print the chosen delay (without a trailing newline)
            before sleeping.
        indents: Number of tab characters placed in front of the log message.
    """
    # NOTE: `min` / `max` shadow the builtins inside this function; the names
    # are kept because callers may pass them as keyword arguments.
    delay = random.uniform(min, max)

    if log:
        print("\t" * indents + f"--- (random delay: {delay:.2f} seconds) ---", end="")

    time.sleep(delay)

In [None]:
def find_value_recursively__with_the_recursion_level(data: list|dict, sought_value, level: int) -> list:
    if type(data) == list:
        data = dict(enumerate(data))
    elif type(data).__name__ != "dict":
        print("Is not dict: ", data)
        return []

    # Check if the value is at the current level
    for key, value in data.items():
        if value == sought_value:
            print("\t"*level + f"Found value at: data[{key}]")
            return [key]

    # Check if a further recursion is possible
    iterable_values_exist = False
    for key, value in data.items():
        try:
            iter(value)
            if not iterable_values_exist:
                iterable_values_exist = True
        except TypeError:
            pass
    if not iterable_values_exist:
        print("\t"*level + f"No iterable values in {data}")
        return []

    # Do the recursion to find the value at a deeper level
    for key, value in data.items():
        keys_path = [key]
        print("\t"*level + f"Searching values at: data[{key}]")
        keys_recursively = find_value_recursively__with_the_recursion_level(data[key], sought_value, level + 1)
        if keys_recursively:
            res = keys_path + keys_recursively
            print("\t"*level + f"Returning the current key path: {res}")
            return res

    print("\t"*level + "Didn't find the value")
    return []

def find_value_recursively(data: dict, sought_value) -> list:
    """Search ``data`` for ``sought_value``, starting the trace at recursion level 0.

    Thin entry point around ``find_value_recursively__with_the_recursion_level``;
    the key path returned by that function is passed through unchanged.
    """
    return find_value_recursively__with_the_recursion_level(
        data=data,
        sought_value=sought_value,
        level=0,
    )

# Scraping the data

In [None]:
def get_all_breeds(breeds_url: str = "https://www.akc.org/dog-breeds/") -> list:
    """Download the breeds overview page and return the breed selector options.

    The page is fetched with a session impersonating Chrome 110. The first line
    containing ``&quot;menu_panels`` is expected to embed HTML-escaped JSON,
    which is un-escaped, parsed, and drilled into down to the breed selector.

    Args:
        breeds_url: URL of the page listing all breeds.

    Returns:
        The ``options`` entry of the breed selector. Callers in this notebook
        iterate it and read the ``'text'`` and ``'value'`` keys of each item.

    Raises:
        ValueError: If no line with the embedded menu JSON could be found or
            extracted from the page.
    """
    with requests.Session(impersonate="chrome110") as session:
        response = session.get(breeds_url)
        response.raise_for_status()

    result_data = None
    for line in response.text.splitlines():
        if "&quot;menu_panels" in line:
            # Group 2 is roughly everything from the first "{" up to the closing `"></div>`
            match = re.search('([^{]+)(.*)("></div>)', line)
            if match is not None:
                # Undo the HTML escaping of quotes and drop all backslashes
                result_data = match.group(2).replace('&quot;', '"').replace('\\', '')
            break

    if result_data is None:
        # Fail with a clear message instead of an UnboundLocalError / AttributeError
        raise ValueError(f"Could not locate the embedded 'menu_panels' JSON in {breeds_url}")

    dogs_data_unformatted = json.loads(result_data)
    dogs_data = dogs_data_unformatted['menu_panels'][0]['columns'][1]['nav_tab_section'][0]['0'][1]['breed_explorer']['breed_select']['options']

    return dogs_data

In [None]:
def get_breed_data(breed_url: str = "https://www.akc.org/dog-breeds/affenpinscher") -> dict:
    """Download a single breed page and return its embedded ``breed_data`` JSON.

    The page is fetched with a session impersonating Chrome 110. The first line
    containing ``&quot;traits&quot;`` is un-escaped, and the text between
    ``"breed_data":`` and ``,"wp_breed_api_url"`` is parsed as JSON.

    Args:
        breed_url: URL of the breed page.

    Returns:
        The parsed ``breed_data`` structure. Callers in this notebook read its
        ``'standards'`` and ``'traits'`` entries.

    Raises:
        ValueError: If the ``breed_data`` payload could not be found or
            extracted from the page.
    """
    with requests.Session(impersonate="chrome110") as session:
        response_breed = session.get(breed_url)
        response_breed.raise_for_status()

    result_breed_data = None
    for line in response_breed.text.splitlines():
        if "&quot;traits&quot;" in line:
            # Undo the HTML escaping of quotes and the JSON escaping of slashes
            line_formatted = line.replace('&quot;', '"').replace(r'\/', '/')
            match = re.search('("breed_data":)(.*)(,"wp_breed_api_url")', line_formatted)
            if match is not None:
                result_breed_data = match.group(2)
            break

    if result_breed_data is None:
        # Fail with a clear message instead of an UnboundLocalError / AttributeError
        raise ValueError(f"Could not locate the embedded 'breed_data' JSON in {breed_url}")

    return json.loads(result_breed_data)

In [None]:
# Scrape every breed page and collect its size and traits into BREEDS_DICT,
# keyed by breed name, then dump the raw result to a JSON file.
ALL_BREEDS = get_all_breeds()
BREEDS_DICT = {}

# Give up on a breed page after this many failed requests instead of retrying forever
MAX_FETCH_ATTEMPTS = 10
RETRY_PAUSE_SECONDS = 5

for breed in ALL_BREEDS:
    breed_name = breed['text']
    breed_url = breed['value']
    print(f"{breed_name}... ", end = "")

    # Pause before every request except the very first one
    if BREEDS_DICT:
        random_delay(log = True)

    exceptions_count = 0
    while True:
        try:
            breed_data = get_breed_data(breed_url)
            break
        except requests.RequestsError as e:
            print("\n", e, end = "")
            exceptions_count += 1
            if exceptions_count >= MAX_FETCH_ATTEMPTS:
                # A persistently failing URL would otherwise keep this loop spinning forever
                raise
            time.sleep(RETRY_PAUSE_SECONDS)

    # Both 'standards' and 'traits' are mappings of which only the first entry
    # is used; an empty mapping is treated as "no data" rather than an error.
    first_standard = next(iter(breed_data['standards'].values()), {})
    BREEDS_DICT[breed_name] = {'size': first_standard.get('size')}

    first_traits_entry = next(iter(breed_data['traits'].values()), {})
    breed_traits = first_traits_entry.get('traits')
    if breed_traits:
        BREEDS_DICT[breed_name].update(breed_traits)

    # Start a fresh line if error messages were printed for this breed
    if exceptions_count:
        print()
    print(" success!")

# save all the data collected
filename = "breeds-raw-data.json"
with open(filename, "w") as raw_data_file:
    # The context manager guarantees the file is flushed and closed
    json.dump(BREEDS_DICT, raw_data_file, indent = 4)
print(f"\nRaw data for {len(BREEDS_DICT)} breeds saved successfully into the file:", filename)

# Formatting the data

In [71]:
# Flatten the raw per-breed data into one dict per breed (name, URL and one
# plain value per trait) and collect the set of trait names encountered.
BREED_TRAITS_SCORES_DICTS = []
TRAIT_NAMES = set()

# Reuse the breed list fetched by the scraping cell: this avoids a second
# network request and guarantees the names match the keys of BREEDS_DICT.
for breed in ALL_BREEDS:
    breed_name = breed['text']
    breed_url = breed['value']

    traits_dict = BREEDS_DICT[breed_name]
    breed_traits_scores_dict = {'breed_name': breed_name,
                                'breed_url': breed_url}

    for trait_name, trait_values in traits_dict.items():
        trait_final_value = 'Unspecified'

        if trait_name == 'size':
            if trait_values:
                trait_final_value = trait_values

                # Keep only the word when the size has the form "&lt;p&gt;Word&lt;/p&gt;"
                if isinstance(trait_values, str):
                    size_match = re.fullmatch(r"&lt;p&gt;([A-Za-z]+)&lt;\/p&gt;", trait_values)
                    if size_match:
                        trait_final_value = size_match.group(1)

        elif trait_name in ('coat_type', 'coat_length'):
            # A missing/empty selection or a first entry of "0" counts as unspecified
            selected = trait_values.get('selected') if isinstance(trait_values, dict) else None
            if selected and selected[0] != "0":
                trait_final_value = selected[0]

        elif isinstance(trait_values, dict) and 'score' in trait_values:
            trait_final_value = trait_values['score']

        else:
            # Entries without a score are not exported
            continue

        breed_traits_scores_dict[trait_name] = trait_final_value
        TRAIT_NAMES.add(trait_name)

    BREED_TRAITS_SCORES_DICTS.append(breed_traits_scores_dict)

print(f"Formatted data for {len(BREED_TRAITS_SCORES_DICTS)} breeds.")

Formatted data for 292 breeds.


In [67]:
import openpyxl
from openpyxl.styles import Font

# Write the formatted breed data into an Excel workbook: one row per breed,
# one column per trait, with the breed name rendered as a hyperlink.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = "Breeds"

# TRAIT_NAMES is a set, whose iteration order can differ between kernel
# sessions; sorting keeps the column layout stable across runs.
headers = ['name'] + sorted(column for column in TRAIT_NAMES if column != 'breed_url')
sheet.append(headers)

for breed_dict in BREED_TRAITS_SCORES_DICTS:
    # Double any embedded quotes so they cannot terminate the formula's string literals
    link_target = str(breed_dict["breed_url"]).replace('"', '""')
    link_label = str(breed_dict["breed_name"]).replace('"', '""')

    row = [f'=HYPERLINK("{link_target}", "{link_label}")']
    row.extend(breed_dict.get(col) for col in headers[1:])
    sheet.append(row)

# Bold the header row and the first (breed name) column
bold_font = Font(bold=True)
for cell in sheet[1]:
    cell.font = bold_font
for first_column_cells in sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=1, max_col=1):
    for cell in first_column_cells:
        cell.font = bold_font

# Keep the header row and the name column visible while scrolling
sheet.freeze_panes = "B2"

filename = "breeds-formatted.xlsx"
workbook.save(filename)
print(filename, "- saved successfully!")

breeds-formatted.xlsx - saved successfully!
