# Dog Breeds - Data Science project

# Step 1 : Data Collection

In this step, we collect information about dog breeds.
We crawl the breed profile pages on dogtime.com and scrape each breed's vital statistics and characteristic ratings.

In [11]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

In [12]:
URL = 'https://dogtime.com/dog-breeds/profiles/'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would simply leave `results` empty.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Breed entries are collected as the <a class="list-item-title"> links;
# each one carries the breed name as text and the profile URL as href.
results = soup.find_all('a', class_='list-item-title')

In [13]:
# Number of breed links found on the index page
len(results)

389

In [14]:
# Profile-page link of the first breed (the anchor's href attribute)
results[0]['href']

'https://dogtime.com/dog-breeds/afador'

In [15]:
# Name of the first breed (the anchor's text)
results[0].text

'Afador'

In [16]:
# Breed names, in the order the links appear in `results`
dog_breed = [anchor.text for anchor in results]

# Profile links, index-aligned with `dog_breed`
URL_dog_breed = [anchor['href'] for anchor in results]

In [17]:
# Two-column table: one row per breed with its name and profile link
df = pd.DataFrame(
    data={
        'Dog Breed': dog_breed,
        'URL dog breed': URL_dog_breed,
    }
)

df

Unnamed: 0,Dog Breed,URL dog breed
0,Afador,https://dogtime.com/dog-breeds/afador
1,Affenhuahua,https://dogtime.com/dog-breeds/affenhuahua
2,Affenpinscher,https://dogtime.com/dog-breeds/affenpinscher
3,Afghan Hound,https://dogtime.com/dog-breeds/afghan-hound
4,Airedale Terrier,https://dogtime.com/dog-breeds/airedale-terrier
...,...,...
384,Whoodle,https://dogtime.com/dog-breeds/whoodle
385,Wirehaired Pointing Griffon,https://dogtime.com/dog-breeds/wirehaired-poin...
386,Xoloitzcuintli,https://dogtime.com/dog-breeds/xoloitzuintli
387,Yorkipoo,https://dogtime.com/dog-breeds/yorkipoo


In [18]:
def get_num_star(string):
    """Return the first digit found in ``str(string)`` as an int.

    The argument is converted with ``str()`` first, so any object can be
    passed. Raises ``IndexError`` when the text contains no digit at all.
    """
    digits = []
    for char in str(string):
        if char.isdigit():
            digits.append(int(char))
    # Only the first digit matters; indexing raises if none was found.
    return digits[0]

In [19]:
def get_and_clean_box_data(soup):
    """Split the text of a vital-stat box into ``[label, value]``.

    Args:
        soup: an object exposing a ``.text`` string (the caller passes a
            BeautifulSoup tag for one ``vital-stat-box``).

    Returns:
        list[str]: ``[label, value]`` when the text contains a colon,
        otherwise a single-element list holding the whole text.

    The text is split on the FIRST colon only. Splitting on every colon
    would break a value that itself contains ``:`` into three or more
    pieces, and the caller keeps a box only when it gets exactly two.
    """
    data = soup.text.split(':', 1)

    return data

In [20]:
# Function that gets a breed URL and adds the breed information to the data dict
def load_breed_data(breed_url, data, expected_count=35):
    """Scrape one breed profile page and append its values to ``data``.

    The page's vital-stat boxes ("label:value" text) and its star-rated
    characteristic groups are read. Any parameter name not seen before is
    added to the module-level ``params`` list and given an empty list in
    ``data``. Finally one value per parameter is appended to the matching
    list in ``data``.

    Args:
        breed_url: URL of the breed profile page to download.
        data: dict mapping parameter name -> list of values; mutated in place.
        expected_count: number of parameters a complete page is expected to
            yield. When the page yields a different number, every known
            parameter that is missing is filled with ``np.nan`` and a short
            diagnostic is printed.

    Notes:
        * Relies on a module-level list named ``params`` that must exist
          before the first call.
        * Names and ratings are paired purely by position, so an
          ``IndexError`` is raised if fewer ratings than names were collected.
    """
    page = requests.get(breed_url)
    soup_page = BeautifulSoup(page.content, 'html.parser')

    content = soup_page.find_all('div', class_='breed-characteristics-ratings-wrapper paws')

    rating = []
    self_params = []

    def register(name):
        # First time a parameter is seen: remember it and open its column.
        if name not in params:
            params.append(name)
            data[name] = []

    ## Get Vital box params
    box_soup = soup_page.find('div', class_='breed-vital-stats-wrapper')

    # Iterate over the boxes themselves. len(box_soup) is the wrapper's
    # child-node count, which is unrelated to how many boxes find_all returns,
    # and a page without the wrapper gives None.
    if box_soup is not None:
        boxes = box_soup.find_all('div', class_='vital-stat-box')
    else:
        boxes = []

    for req in boxes:
        box_data = get_and_clean_box_data(req)

        register(box_data[0])

        # Only a clean [label, value] pair contributes a value for this breed.
        if len(box_data) == 2:
            self_params.append(box_data[0])
            rating.append(box_data[1])

    ## Get star-rated characteristic groups
    for el in content:
        group_name = el('h3')[0].text.strip()
        self_params.append(group_name)
        register(group_name)

        for element in el('div', class_='star'):
            rating.append(get_num_star(element))

        for element in el('div', class_='characteristic-title'):
            self_params.append(element.text)
            register(element.text)

    if len(rating) != expected_count or len(self_params) != expected_count:
        # Incomplete (or unexpected) page: pad every known-but-missing
        # parameter with NaN so this breed still gets a value for it.
        for parameter in params:
            if parameter not in self_params:
                self_params.append(parameter)
                rating.append(np.nan)
                print(parameter)

        print('---------------')
        print(len(rating))
        print(len(self_params))
        print('---------------' + '  ' + breed_url + '  ' + '---------------')

    for i, name in enumerate(self_params):
        data[name].append(rating[i])



In [21]:
# Registry of every parameter name seen so far (filled by load_breed_data)
params = []

# Column store: parameter name -> list with one value per breed
breed_data = {'breed name': []}

for i, (name, url) in enumerate(zip(dog_breed, URL_dog_breed)):
    breed_data['breed name'].append(name)
    load_breed_data(url, breed_data)
    print(str(i) + ' ' + name)

0 Afador
1 Affenhuahua
2 Affenpinscher
3 Afghan Hound
4 Airedale Terrier
5 Akbash
6 Akita
7 Akita Chow
8 Akita Pit
9 Akita Shepherd
10 Alaskan Klee Kai
11 Alaskan Malamute
12 American Bulldog
Weight

---------------
36
36
---------------  https://dogtime.com/dog-breeds/american-english-coonhound  ---------------
13 American English Coonhound
14 American Eskimo Dog
15 American Foxhound
16 American Hairless Terrier
17 American Leopard Hound
18 American Pit Bull Terrier
19 American Pugabull
20 American Staffordshire Terrier
21 American Water Spaniel
22 Anatolian Shepherd Dog
23 Appenzeller Sennenhunde
24 Auggie
25 Aussiedoodle
26 Aussiepom
27 Australian Cattle Dog
28 Australian Kelpie
29 Australian Retriever
30 Australian Shepherd
31 Australian Shepherd Husky
32 Australian Shepherd Lab Mix
33 Australian Shepherd Pit Bull Mix
34 Australian Stumpy Tail Cattle Dog
35 Australian Terrier
36 Azawakh
37 Barbet
38 Basenji
39 Bassador
40 Basset Fauve de Bretagne
41 Basset Hound
42 Basset Retriever

In [23]:
# Inspect the collected data: a dict with one list of values per parameter name
breed_data

{'breed name': ['Afador',
  'Affenhuahua',
  'Affenpinscher',
  'Afghan Hound',
  'Airedale Terrier',
  'Akbash',
  'Akita',
  'Akita Chow',
  'Akita Pit',
  'Akita Shepherd',
  'Alaskan Klee Kai',
  'Alaskan Malamute',
  'American Bulldog',
  'American English Coonhound',
  'American Eskimo Dog',
  'American Foxhound',
  'American Hairless Terrier',
  'American Leopard Hound',
  'American Pit Bull Terrier',
  'American Pugabull',
  'American Staffordshire Terrier',
  'American Water Spaniel',
  'Anatolian Shepherd Dog',
  'Appenzeller Sennenhunde',
  'Auggie',
  'Aussiedoodle',
  'Aussiepom',
  'Australian Cattle Dog',
  'Australian Kelpie',
  'Australian Retriever',
  'Australian Shepherd',
  'Australian Shepherd Husky',
  'Australian Shepherd Lab Mix',
  'Australian Shepherd Pit Bull Mix',
  'Australian Stumpy Tail Cattle Dog',
  'Australian Terrier',
  'Azawakh',
  'Barbet',
  'Basenji',
  'Bassador',
  'Basset Fauve de Bretagne',
  'Basset Hound',
  'Basset Retriever',
  'Bavarian

In [37]:
# Drop the spurious '' key (created when a vital-stat box has an empty label)
# if it exists. Passing a default to pop() makes the cell safe to re-run
# without a bare `except:` that would also hide unrelated errors.
breed_data.pop('', None)
df = pd.DataFrame(data=breed_data)

df

Unnamed: 0,breed name,Dog Breed Group,Height,Weight,Life Span,Adaptability,Adapts Well To Apartment Living,Good For Novice Owners,Sensitivity Level,Tolerates Being Alone,...,Intelligence,Potential For Mouthiness,Prey Drive,Tendency To Bark Or Howl,Wanderlust Potential,Physical Needs,Energy Level,Intensity,Exercise Needs,Potential For Playfulness
0,Afador,Mixed Breed Dogs,20 to 29 inches,50 to 75 pounds,10 to 12 years,2,1,1,3,3,...,5,4,4.0,4.0,4,4,4,4,4,3
1,Affenhuahua,Mixed Breed Dogs,6 to 12 inches,4 to 12 pounds,13 to 18 years,3,4,4,4,1,...,3,4,2.0,4.0,2,3,4,3,3,3
2,Affenpinscher,Companion Dogs,9 to 11 inches tall at the shoulder,7 to 9 pounds,12 to 14 years,3,5,4,3,1,...,4,4,3.0,2.0,2,4,4,3,3,4
3,Afghan Hound,Hound Dogs,24 to 26 inches tall at the shoulder,50 to 60 pounds,10 to 12 years,4,5,3,5,2,...,4,3,5.0,2.0,5,4,5,2,4,4
4,Airedale Terrier,Terrier Dogs,21 to 23 inches tall at the shoulder,40 to 65 pounds,10 to 13 years,2,1,2,3,2,...,5,5,5.0,4.0,4,5,5,3,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,Whoodle,Mixed Breed Dogs,12 to 20 inches,20 to 45 pounds,12 to 15 years,3,4,1,3,1,...,4,1,2.0,1.0,3,4,4,4,4,4
385,Wirehaired Pointing Griffon,Sporting Dogs,20 to 24 inches tall at the shoulder,50 to 60 pounds,10 to 14 years,3,1,3,4,1,...,5,3,4.0,4.0,4,4,5,3,4,5
386,Xoloitzcuintli,Companion Dogs,"1 foot, 6 inches to 1 foot, 11 inches tall at ...",10 to 50 pounds,14 to 20 years,3,5,1,5,1,...,5,3,5.0,5.0,5,3,3,3,3,3
387,Yorkipoo,Hybrid Dogs,7 to 15 inches tall at the shoulder,3 to 14 pounds,10 to 15 years,4,5,5,4,3,...,4,3,3.0,5.0,2,4,5,3,3,4


In [38]:
# Column dtypes and non-null counts of the scraped table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389 entries, 0 to 388
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   breed name                       389 non-null    object 
 1   Dog Breed Group                  389 non-null    object 
 2   Height                           388 non-null    object 
 3   Weight                           383 non-null    object 
 4   Life Span                        389 non-null    object 
 5   Adaptability                     389 non-null    int64  
 6   Adapts Well To Apartment Living  389 non-null    int64  
 7   Good For Novice Owners           389 non-null    int64  
 8   Sensitivity Level                389 non-null    int64  
 9   Tolerates Being Alone            389 non-null    int64  
 10  Tolerates Cold Weather           389 non-null    int64  
 11  Tolerates Hot Weather            389 non-null    int64  
 12  All Around Friendlines

In [39]:
# Number of missing values per column
df.isnull().sum()

breed name                         0
Dog Breed Group                    0
Height                             1
Weight                             6
Life Span                          0
Adaptability                       0
Adapts Well To Apartment Living    0
Good For Novice Owners             0
Sensitivity Level                  0
Tolerates Being Alone              0
Tolerates Cold Weather             0
Tolerates Hot Weather              0
All Around Friendliness            0
Affectionate With Family           0
Kid-Friendly                       0
Dog Friendly                       0
Friendly Toward Strangers          0
Health And Grooming Needs          0
Amount Of Shedding                 0
Drooling Potential                 1
Easy To Groom                      0
General Health                     0
Potential For Weight Gain          0
Size                               0
Trainability                       0
Easy To Train                      0
Intelligence                       0
P

In [41]:
# Export data to a csv file in the current working directory.
# NOTE: with the default settings to_csv also writes the row index as an
# extra, unnamed first column.
df.to_csv("dog_breeds_data.csv")