In [1]:
import re
import requests
import os
from datetime import datetime

from bs4 import BeautifulSoup
from bs4.element import Tag

import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
def get_description(breed_soup):
    """Return the cleaned text of the breed page's overview section.

    Looks up the ``div`` with id ``lister-collapseOverview`` and class
    ``collapse show`` and returns its text with newlines and zero-width
    spaces removed and non-breaking spaces turned into regular spaces.

    Args:
        breed_soup: Parsed breed page; any object exposing a
            BeautifulSoup-style ``find(name, attrs)`` method.

    Returns:
        str: The cleaned overview text, or ``''`` when the overview
        element is not present on the page.
    """
    overview = breed_soup.find(
        'div', {'id': 'lister-collapseOverview', 'class': 'collapse show'}
    )
    # A missing element is the only expected failure, so test for it
    # explicitly. A bare ``except`` here would also swallow
    # KeyboardInterrupt and hide genuine bugs during a long scrape.
    if overview is None:
        return ''
    description = overview.get_text().strip()
    # Removing weird characters
    # Probably not exhaustive
    return description.replace(
        '\n', '').replace('\u200b', '').replace('\xa0', ' ')

In [5]:
def _get_section_paragraph(breed_soup, section_id):
    """Return the cleaned first paragraph of one collapsible page section.

    Walks ``div#<section_id>`` -> ``div.card-body`` -> first ``p`` and
    returns that paragraph's text with newlines and zero-width spaces
    removed and non-breaking spaces turned into regular spaces.

    Args:
        breed_soup: Parsed breed page; any object exposing a
            BeautifulSoup-style ``find(name, attrs)`` method.
        section_id (str): The ``id`` attribute of the section's outer div.

    Returns:
        str: The cleaned paragraph text, or ``''`` when any element along
        the path is missing.
    """
    section = breed_soup.find('div', {'id': section_id})
    # Each lookup may come back as None; stop at the first gap instead of
    # relying on a bare ``except`` (which would also swallow
    # KeyboardInterrupt and unrelated errors).
    body = section.find('div', {'class': 'card-body'}) if section is not None else None
    paragraph = body.find('p') if body is not None else None
    if paragraph is None:
        return ''
    text = paragraph.get_text().strip()
    return text.replace('\n', '').replace('\u200b', '').replace('\xa0', ' ')


def get_personality(breed_soup):
    """Return the first paragraph of the page's personality section.

    Args:
        breed_soup: Parsed breed page.

    Returns:
        str: Cleaned text from ``div#lister-collapsePersonality``, or ``''``
        when the section (or its paragraph) is absent.
    """
    return _get_section_paragraph(breed_soup, 'lister-collapsePersonality')


def get_history(breed_soup):
    """Return the first paragraph of the page's history section.

    Args:
        breed_soup: Parsed breed page.

    Returns:
        str: Cleaned text from ``div#lister-collapseHistory``, or ``''``
        when the section (or its paragraph) is absent.
    """
    return _get_section_paragraph(breed_soup, 'lister-collapseHistory')

In [6]:
def fetch_table_data(breed_soup):
    """Collect the two-column rows of the breed facts table into a dict.

    The table is the first ``table`` element whose class is
    ``table table-bordered``. Only rows with exactly two ``td`` cells are
    used: the first cell's stripped text is the key and the second cell's
    stripped text is the value. A later row with the same key replaces an
    earlier one.

    Args:
        breed_soup: Parsed breed page.

    Returns:
        dict: Mapping of row label to row value; empty when the table is
        not found.
    """
    facts_table = breed_soup.find('table', {'class': 'table table-bordered'})
    if not facts_table:
        return {}

    # Lazily pair each row with its cells so rows are visited one at a time.
    cells_per_row = (tr.find_all('td') for tr in facts_table.find_all('tr'))
    return {
        cells[0].get_text(strip=True): cells[1].get_text(strip=True)
        for cells in cells_per_row
        if len(cells) == 2
    }

In [7]:
class Breed:
    """Scraped information for a single breed page.

    Constructing an instance downloads ``url`` and parses it immediately.

    Attributes:
        url: The breed page URL that was fetched.
        breed_info: Dict with the keys ``description``, ``personality`` and
            ``history``, plus one entry per row returned by
            ``fetch_table_data``. Table rows are merged last, so a table
            label equal to one of the three fixed keys would replace it.
    """

    def __init__(self, url, timeout=30):
        """Fetch and parse the breed page.

        Args:
            url (str): Address of the breed page.
            timeout (float): Seconds to wait for the server before
                ``requests`` gives up. Without a timeout a stalled
                connection would block the whole scrape indefinitely.

        Raises:
            requests.exceptions.RequestException: If the page cannot be
                fetched (including on timeout).
        """
        self.url = url
        breed_page = requests.get(url, timeout=timeout)
        breed_soup = BeautifulSoup(breed_page.content, 'html.parser')
        self.breed_info = {}
        self.breed_info['description'] = get_description(breed_soup)
        self.breed_info['personality'] = get_personality(breed_soup)
        self.breed_info['history'] = get_history(breed_soup)
        self.breed_info.update(fetch_table_data(breed_soup))

    def get_info(self):
        """Return the dict of scraped fields (the instance's own dict, not a copy)."""
        return self.breed_info

In [8]:
def fetch_breed_urls(base_url, timeout=30):
    """Download the breed index page and map breed names to page URLs.

    Every ``a`` element inside a ``div`` with class ``card-body`` that
    contains an ``li`` is treated as a breed link; the ``li`` text is the
    breed name.

    Args:
        base_url (str): Address of the breed index page.
        timeout (float): Seconds to wait for the server before
            ``requests`` gives up instead of hanging indefinitely.

    Returns:
        dict: Breed name -> absolute URL of that breed's page. When the
        same name appears more than once, the last link wins.

    Raises:
        requests.exceptions.RequestException: If the index page cannot be
            fetched (including on timeout).
    """
    from urllib.parse import urljoin

    response = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    breed_urls = {}

    for card in soup.find_all('div', {'class': 'card-body'}):
        for link in card.find_all('a'):
            li_tag = link.find('li')
            href = link.get('href')
            # Skip anchors that are not breed entries or carry no target,
            # rather than raising KeyError on a missing href.
            if li_tag is None or not href:
                continue
            breed_name = li_tag.get_text(strip=True)
            # urljoin resolves root-relative hrefs against the index page's
            # host and leaves already-absolute hrefs untouched, where plain
            # string concatenation with a fixed host would corrupt them.
            breed_urls[breed_name] = urljoin(base_url, href)

    return breed_urls

In [9]:
def get_data():
    """Scrape every breed listed on the yappy.com UK breed index.

    Fetches the index page for the breed links, then builds one ``Breed``
    per link (each of which downloads its own page), showing a tqdm
    progress bar over the links.

    Returns:
        dict: Breed name -> dict of scraped fields for that breed.
    """
    index_url = 'https://www.yappy.com/uk/dogs/breeds'
    links_by_name = fetch_breed_urls(index_url)
    return {
        name: Breed(link).get_info()
        for name, link in tqdm(links_by_name.items())
    }

In [10]:
# Runs the full scrape: one request for the breed index plus one request
# per breed page, so this cell is slow and needs network access.
breed_dict = get_data()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for breed_name, url in tqdm(breed_urls.items()):


  0%|          | 0/343 [00:00<?, ?it/s]

In [11]:
# orient='index' turns each top-level key of breed_dict (the breed name)
# into a row label and each scraped field into a column.
breed_df = pd.DataFrame.from_dict(
    breed_dict, orient='index'
)
breed_df

Unnamed: 0,description,personality,history,Kennel Club Group,Lifespan,Height (at the withers),Weight,Coat,Colour,Eye colour,Common health issues,Other Names
Affenpinscher,These dogs have very unique faces and are said...,These little monkeys are known to be extremely...,The German translation for Affenpinscher is 'm...,Toy,11 - 14 Years,Males and Females 23cm – 30cm,Males and Females 3kg – 6kg,"The Coat is Rough, Short and Dense, as well as...",Black,Dark,"Patella Subluxation, Legg-Calves-Perthes disea...","Affen, Affie, Monkey Dog, Monkey Mutt"
Afghan Hound,"These dogs are extremely glamorous to look at,...",These sleek creatures are affectionate as pupp...,The Afghan Hound is supposedly one of the olde...,Hound,11 - 13 Years,"Males 68cm - 74cm, Females 63cm - 69cm","Males 23kg - 27kg, Females 20kg - 25kg",Thick And Soft to the Touch. Fine in Texture,"Black, Black & Brindle, Black & Cream, Black &...",Dark,"Hip dysplasia, Generalised demodicosis (mange)...","Afghan, Tazi, Baluchi Hound"
Airedale Terrier,These dogs are one of the larger breeds of Ter...,These happy chappy's thrive most when there is...,These fluffy guys are often referred to as 'Ki...,Terrier,11 - 12 years,"Males 56cm - 61cm, Females 56cm - 58cm","Males 23kg - 29kg, Females 18kg - 20kg","Outer Coat is Dense and Wiry, Lying Close to t...","Black & Gold, Black & Tan",Dark,"Hip dysplasia, Elbow dysplasia, Autoimmune thy...",Airedale
Akita,These dogs are known for their intelligent yet...,What we really love about this breed here at Y...,These beautiful creatures have gotten pretty c...,Utility,10 – 15 Years,"Males 64cm – 70 cm, Females 58cm – 64cm","Males 34kg – 54kg, Female 34kg – 50kg",Two coat types; Long and Short Coat. The long ...,"White, Brindle, Fawn, Red Fawn, Tan, Pinto, Se...",Dark,"Mycrocytosis, Autoimmune Hypothyroiditis, Akit...","Akita Inu, American Akita, Japanese Akita, Gre..."
Alaskan Klee Kai,"Small, smart, and energetic, this dog is a rel...","This breed is an intelligent, high-activity do...","It was during the mid-1970s, on a trip to Okla...",,12 - 16 years,Males and Females 25cm - 43cm,Males and Females 3.5kg - 10.5kg,"Double coated, the undercoat is soft and short...","Sable, Silver, Red, Brown, Grey, Black","Blue, Brown, Amber","Juvenile cataracts, liver disease, cardiac iss...","AKK, Klee Kai, Miniature Alaskan Husky, Mini H..."
...,...,...,...,...,...,...,...,...,...,...,...,...
Working Cocker Spaniel,Working Cocker Spaniels are an incredibly hard...,"Training them is so rewarding, as they are ass...","Prior to the 1600s, all breeds of Spaniel were...",Gundog,12 - 15 Years,Males and Females 46cm – 56cm,Males and Females 12kg – 16kg,"Straight, Silky","Chocolate, Tan, Black and White","Blue, Brown, Black","Hip Dysplasia, Atopy, Ear Problems, Eye Problems",
Yorkie Russell,"The Jorkie, or 'Yorkie Russell' is a cross bet...","Just like their parents, Jorkie's are incredib...",While the recent designer pooch has little sta...,,11 - 15 years,Males and Females 20cm - 38cm,Males and Females 2kg - 8kg,Medium/Long Coat that is Mildly Dense and Stra...,"Silver, Fawn, Brown, Blue, White, Black",Brown,"Portacaval Shunt, Progressive Retinal Atrophy,...",Jorkie
Yorkipoo,"A relatively new 'designer' cross breed, the Y...",Yorkie-Poo's are very affectionate doggies and...,"Yorkipoos are a newer breed, with their histor...",,10 – 13 years,Male and Female 10cm – 35cm,Male and Female 3kg – 7kg,"Long, Silky soft coat","Silver, Cream and Brown.",Brown,"Patellar Luxation, Epilepsy, Hypothyroidism, A...","Yoodle, Yorkapoo, Yorkerpoo, Yorkiepoopoo, Yor..."
Yorkshire Terrier,The Yorkshire Terrier first came about during ...,"Although they might be small, this breed is fu...",The Yorkshire Terrier got its name from its pl...,Toy,13-16 years,Males and Females 20cm - 23cm,Males and Females 2kg -3kg,"Long, Silky, Perfectly Straight Single Coat",Blue and Tan,Brown,Patellar Luxation; Progressive Retinal Atrophy...,Yorkie


In [12]:

# Make sure the output directory exists first: to_csv does not create
# missing parent directories, and failing here would lose the whole scrape.
os.makedirs('data', exist_ok=True)
# NOTE(review): the file name says "akc" but the data is scraped from
# yappy.com; consider renaming. Path left unchanged so existing readers of
# this file keep working.
breed_df.to_csv('data/akc-data-latest.csv')

In [13]:
# include='all' summarises every column, not only numeric ones; for these
# text columns the summary is count / unique / top / freq. A count lower
# than the number of rows means that field is missing for some breeds.
breed_df.describe(include='all')

Unnamed: 0,description,personality,history,Kennel Club Group,Lifespan,Height (at the withers),Weight,Coat,Colour,Eye colour,Common health issues,Other Names
count,343,343.0,343.0,211,340,340,340,340,340,335,326,280
unique,343,336.0,335.0,16,115,298,308,333,318,87,307,278
top,These dogs have very unique faces and are said...,,,Pastoral,12 - 15 Years,Males and Females 25cm - 30cm,Males and Females 4.5kg - 9kg,"Coats are Long, with an Outer and Undercoat.",White,Brown,Hip Dysplasia,"Akita Inu, American Akita, Japanese Akita, Gre..."
freq,1,8.0,9.0,32,32,5,3,4,7,121,10,2
