In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata

In [13]:
# Typographic characters that have no Unicode decomposition. Without this
# mapping the ASCII encode step in normalize() would delete them outright,
# turning e.g. "6.6\u201311.0 pounds" into the misleading "6.611.0 pounds".
_ASCII_FALLBACKS = str.maketrans({
    "\u2010": "-",   # hyphen
    "\u2011": "-",   # non-breaking hyphen
    "\u2012": "-",   # figure dash
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2212": "-",   # minus sign
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u00a0": " ",   # no-break space
})


def normalize(s):
    """Return an ASCII-only version of ``s``.

    Accented letters are reduced to their base letter (``"é"`` -> ``"e"``),
    and common typographic dashes, quotes and the no-break space are replaced
    by their plain ASCII counterparts. Any remaining non-ASCII character is
    dropped.

    Args:
        s: The text to convert.

    Returns:
        A ``str`` containing only ASCII characters.
    """
    # Swap punctuation first: these characters do not decompose, so NFD alone
    # would leave them to be discarded by the "ignore" error handler.
    replaced = s.translate(_ASCII_FALLBACKS)
    # NFD splits an accented letter into base letter + combining mark; the
    # combining mark is non-ASCII and is therefore removed by the encode step.
    decomposed = unicodedata.normalize("NFD", replaced)
    return decomposed.encode("ascii", "ignore").decode("ascii")


In [2]:
# National Geographic fact pages to scrape, one per penguin species.
urls = [
    "https://www.nationalgeographic.com/animals/birds/facts/{}-penguin".format(species)
    for species in ("gentoo", "adelie", "chinstrap")
]

In [3]:
def get_html(url, timeout=10):
    """Download ``url`` and return the page parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The response body parsed with Python's built-in ``html.parser``.

    Raises:
        Exception: If the response status code is anything other than 200.
            Errors raised by ``requests.get`` itself (connection failure,
            timeout) are not caught here and propagate to the caller.
    """
    res = requests.get(url, timeout=timeout)
    if res.status_code == 200:
        # Name the parser explicitly: when it is omitted, bs4 picks whichever
        # parser happens to be installed and emits a warning, so the same
        # notebook can parse a page differently on another machine.
        return BeautifulSoup(res.text, "html.parser")
    # Include the URL and status so a failure inside a multi-URL loop can be
    # traced to the page that caused it.
    raise Exception("Bad Request: {} returned HTTP {}".format(url, res.status_code))

In [4]:
def _record_fact(facts, node):
    """Copy the first ``dt``/``dd`` text pair under ``node`` into ``facts``.

    Does nothing when ``node`` is ``None`` or when either tag is absent, so a
    page that lacks a section simply contributes no entry.
    """
    if node is None:
        return
    label = node.select_one("dt")
    value = node.select_one("dd")
    if label is None or value is None:
        return
    facts[label.get_text()] = value.get_text()


def get_facts(html):
    """Extract the "fast facts" label/value pairs from a parsed species page.

    Three parts of the page are read: every ``div.FastFacts__TopFacts__Data``
    element, the ``dl`` under ``div.FastFacts__BottomFacts__Status``, and the
    ``dl`` under ``div.FastFacts__BottomFacts__Data``. From each, the text of
    the first ``dt`` becomes the key and the text of the first ``dd`` becomes
    the value.

    Args:
        html: A parsed document supporting ``select`` and ``select_one``.

    Returns:
        The collected pairs after being passed through ``clean_dict``.
        Sections that are missing from the page are skipped rather than
        raising, so the result may contain fewer keys for some pages.
    """
    facts = {}
    for entry in html.select("div.FastFacts__TopFacts__Data"):
        _record_fact(facts, entry)

    # select_one returns None when nothing matches; _record_fact tolerates that.
    _record_fact(facts, html.select_one("div.FastFacts__BottomFacts__Status > div >dl"))
    _record_fact(facts, html.select_one("div.FastFacts__BottomFacts__Data > dl"))

    return clean_dict(facts)

In [5]:
def clean_dict(d):
    """Return a copy of ``d`` with tidied keys and values.

    Keys have ``:`` and ``?`` removed, surrounding whitespace stripped,
    spaces turned into underscores, and are lower-cased. Values are stripped,
    lower-cased and then passed through ``normalize``.
    """
    def tidy_key(raw_key):
        without_marks = raw_key.replace(":", "").replace("?", "")
        return without_marks.strip().replace(" ", "_").lower()

    return {
        tidy_key(field): normalize(d[field].strip().lower())
        for field in d
    }

In [6]:
# Scrape every page in `urls` and collect one record per page.
data = []
for url in urls:
    html = get_html(url)
    facts = get_facts(html)
    # Store the page address with the facts so each row can be traced back.
    data.append(dict(facts, source_url=url))

# Build the table from the list of per-page records.
df = pd.DataFrame(data)

In [7]:
# Show the assembled table as this cell's output.
df

Unnamed: 0,common_name,scientific_name,type,diet,group_name,average_life_span_in_the_wild,size,weight,iucn_red_list_status,current_population_trend,source_url
0,gentoo penguin,pygoscelis papua,birds,carnivore,colony,15 to 20 years,30 inches,12 pounds,near threatened,decreasing,https://www.nationalgeographic.com/animals/bir...
1,adélie penguin,pygoscelis adeliae,birds,carnivore,colony,11 to 20 years,27.5 inches,8.5 to 12 pounds,near threatened,increasing,https://www.nationalgeographic.com/animals/bir...
2,chinstrap penguin,pygoscelis antarcticus,,carnivore,colony,,28 inches,6.6–11.0 pounds,least concern,decreasing,https://www.nationalgeographic.com/animals/bir...


In [8]:
# Save the table to species.csv; index=False leaves the row index out of the file.
df.to_csv("species.csv", index=False)