In [None]:
!pip3 install python-dotenv

In [19]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from dotenv import load_dotenv
import os
from tqdm import tqdm

In [10]:
from glob import glob
path = "data/bird_images/valid"
files = glob(f'{path}/**/*.jpg', recursive=True)

In [None]:
from PIL import Image  
import PIL

path = "data/bird_images/valid_flattened/"
new_filenames = []
for file in files:
    file_components = file.split("/")
    if int(file_components[-1][0]) > 2: continue
    new_filename = f"{file_components[-2]}-{file_components[-1][0]}"
    new_filenames.append(new_filename)

with open('data/bird_icon_names.txt', 'w+') as f:
    for items in new_filenames:
        f.write('%s\n' %items)
    print("File written successfully")
f.close()

In [None]:
newsize = (185, 185)
for file in files:
    img = Image.open(file)
    img.resize(newsize)
    file_components = file.split("/")
    if int(file_components[-1][0]) > 2: continue
    new_filename = f"{file_components[-2]}-{file_components[-1]}"
    img.save(f"{path}{new_filename}")

# Scraping ebird

In [None]:
df = pd.read_csv('data/ebird data.txt', sep="\t", header=0)

In [None]:
df.columns

In [None]:
df["COUNTY CODE"][:10]

In [None]:
df.COUNTY[:10]

In [11]:
BASE_URL = "https://ebird.org/checklist/S177363513"

In [13]:
r = requests.get(BASE_URL)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html5lib')

In [57]:
def build_media_data_for_checklist(soup):
    species = soup.findAll('li', attrs = {'data-observation':True})
    
    species_json = []
    for s in species:
        species_info = s.find('div', attrs={'class':'Observation-species'})
        species_soup = BeautifulSoup(str(s))
        # print(species_soup.prettify())
        species_link = species_info.find('a')
        href = ""
        if species_link:
            href = species_info.find('a')['href']
        name = species_info.find('span').text
        media_id_elements = s.findAll('div', attrs = {'data-media-commonname':name})
        if not len(media_id_elements): 
            s_json = {'href':href, 'name':name, 'media_ids':[]}
            species_json.append(s_json)
            continue
        media_ids = []
        for e in media_id_elements:
            media_ids.append(e["data-media-id"])
        s_json = {'href':href, 'name':name, 'media_ids':media_ids}
        species_json.append(s_json)
        
    return species_json

In [14]:
species_json = build_media_data_for_checklist(soup)
species_json

[{'href': '/species/wemhar1',
  'name': 'Western Marsh Harrier',
  'media_ids': ['619630301']},
 {'href': '/species/gstlar1',
  'name': 'Greater Short-toed Lark',
  'media_ids': ['619630341', '619630342', '619630343', '619631014']},
 {'href': '/species/banswa',
  'name': 'Bank Swallow',
  'media_ids': ['619630365', '619630366']}]

# Getting recent data nearby

In [68]:
def get_checklists(lat, lng, back, dist, headers):
    get_url = f"https://api.ebird.org/v2/data/obs/geo/recent?lat={lat}&lng={lng}&back={back}&dist={dist}"
    res = requests.get(get_url, headers=headers)
    data = res.json()
    checklists = [{"subId":ob["subId"], "lat":ob["lat"], "lng":ob["lng"]} for ob in data]
    checklists_unique = list({sub['subId']:sub for sub in checklists}.values())

    for checklist in tqdm(checklists_unique[:3]):
        checklist["observations"] = get_checklist(checklist["subId"])
        
    return checklists_unique

In [69]:
load_dotenv()
access_token = os.getenv("EBIRD_API_ACCESS_TOKEN")

headers =  {"Content-Type":"application/json", "x-ebirdapitoken": access_token}

lat = 40.0150
lng = -105.2705
back = 30
dist = 20

checklists = get_checklists(lat, lng, back, dist, headers)

100%|█████████████████████████████████████████████| 3/3 [00:02<00:00,  1.15it/s]


In [70]:
checklists[0]

{'subId': 'S183102872',
 'lat': 39.9579448,
 'lng': -105.3385663,
 'observations': [{'href': '/species/rethaw',
   'name': 'Red-tailed Hawk',
   'media_ids': []},
  {'href': '/species/wewpew', 'name': 'Western Wood-Pewee', 'media_ids': []},
  {'href': '/species/plsvir', 'name': 'Plumbeous Vireo', 'media_ids': []},
  {'href': '/species/stejay', 'name': "Steller's Jay", 'media_ids': []},
  {'href': '/species/bkbmag1', 'name': 'Black-billed Magpie', 'media_ids': []},
  {'href': '/species/houwre', 'name': 'House Wren', 'media_ids': []},
  {'href': '/species/wesblu', 'name': 'Western Bluebird', 'media_ids': []},
  {'href': '/species/chispa', 'name': 'Chipping Sparrow', 'media_ids': []},
  {'href': '/species/vesspa', 'name': 'Vesper Sparrow', 'media_ids': []}]}

In [5]:
len(data)

215

In [19]:
data[0]

{'speciesCode': 'rethaw',
 'comName': 'Red-tailed Hawk',
 'sciName': 'Buteo jamaicensis',
 'locId': 'L554723',
 'locName': 'Walker Ranch--Meyers Gulch',
 'obsDt': '2024-06-22 11:26',
 'howMany': 2,
 'lat': 39.9579448,
 'lng': -105.3385663,
 'obsValid': True,
 'obsReviewed': False,
 'locationPrivate': False,
 'subId': 'S183102872'}

In [6]:
subIds = [ob["subId"] for ob in data]

In [22]:
len(set(subIds))

111

In [7]:
subIds = list(set(subIds))

In [51]:
subIds[110]

'S181994102'

In [8]:
def get_checklist(subId):
    BASE_URL = f"https://ebird.org/checklist/{subId}"
    r = requests.get(BASE_URL)
    r.raise_for_status()
    
    soup = BeautifulSoup(r.content, 'html5lib')
    return build_media_data_for_checklist(soup)

In [58]:
checklist = get_checklist(subIds[110])

In [59]:
checklist

[{'href': '/species/wiltur',
  'name': 'Wild Turkey',
  'media_ids': ['620517016']},
 {'href': '/species/moudov', 'name': 'Mourning Dove', 'media_ids': []},
 {'href': '/species/brthum',
  'name': 'Broad-tailed Hummingbird',
  'media_ids': []},
 {'href': '/species/turvul', 'name': 'Turkey Vulture', 'media_ids': []},
 {'href': '/species/rethaw', 'name': 'Red-tailed Hawk', 'media_ids': []},
 {'href': '/species/lewwoo',
  'name': "Lewis's Woodpecker",
  'media_ids': ['620517065', '620517066']},
 {'href': '/species/norfli', 'name': 'Northern Flicker', 'media_ids': []},
 {'href': '/species/wewpew', 'name': 'Western Wood-Pewee', 'media_ids': []},
 {'href': '/species/wesfly',
  'name': 'Western Flycatcher (Cordilleran)',
  'media_ids': ['620517121']},
 {'href': '/species/weskin',
  'name': 'Western Kingbird',
  'media_ids': ['620517130']},
 {'href': '/species/plsvir',
  'name': 'Plumbeous Vireo',
  'media_ids': ['620517136', '620517137']},
 {'href': '/species/stejay', 'name': "Steller's Jay", 

This shows that all observations in a checklist share the same lat lng stats

In [49]:
import pprint

count = 0
for subId in subIds:
    if count > 2:
        break
    obs = [ob for ob in data if ob["subId"] == subId]
    if len(obs) > 2:
        pprint.pp([(ob['lat'], ob['lng']) for ob in obs])
        print()
        count += 1

[(39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126),
 (39.982874, -105.269126)]

[(40.020217, -105.434369),
 (40.020217, -105.434369),
 (40.020217, -105.434369),
 (40.020217, -105.434369),
 (40.020217, -105.434369),
 (40.020217, -105.434369),
 (40.020217, -105.434369)]

[(39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381),
 (39.8860638, -105.2598381)]



In [29]:
checklists = []
for subId in tqdm(subIds):
    checklist = get_checklist(subId)
    if len(checklist) > 0:
        checklists.append(checklist)

len(checklists)

100%|█████████████████████████████████████████| 111/111 [01:33<00:00,  1.18it/s]


25

In [32]:
subIds[0]

'S182678437'

In [None]:
checklist = get_checklist(subIds[110])

# Getting recent data for region

In [None]:
load_dotenv()
access_token = os.getenv("EBIRD_API_ACCESS_TOKEN")

headers =  {"Content-Type":"application/json", "x-ebirdapitoken": access_token}
region_code = "US-CO-013"
code2 = "AL"

get_url = "https://api.ebird.org/v2/data/obs/{}/recent".format(region_code)
res = requests.get(get_url, headers=headers)
res.json()

In [None]:
res_json = res.json()
print(len(res_json))

In [None]:
max_res = 100
get_checklists_url = "https://api.ebird.org/v2/product/lists/{}?maxResults={}".format(region_code, max_res)
res = requests.get(get_checklists_url, headers=headers, verify=False)
res.json()
res_json = res.json()
print(len(res_json))

In [None]:
res_json[:-10]

# Getting species intro

In [22]:
from selenium import webdriver
load_dotenv()
access_token = os.getenv("EBIRD_API_ACCESS_TOKEN")
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')

path = "https://ebird.org/explore"
driver = webdriver.Chrome(options=options)
# driver = webdriver.Chrome()
# driver.get(path)

In [13]:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

species_name = "WHITE TAILED TROPIC"

input_e = driver.find_element(By.ID, "species")
input_e.send_keys(species_name)
WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "Suggest-suggestion-0"))).click()

species_desc_e = driver.find_element(By.CLASS_NAME, "u-stack-sm")
print(species_desc_e.text)

A medium-sized white seabird with black marks on the wings and a yellow beak. The long white tail streamers are visible from a great distance. Nests on coastal and inland cliffs on the main islands. Often seen flying over canyons and along cliffs. At sea usually flies high above the water. Calls are loud clucks and squawks. Smaller and more graceful in flight than Red-tailed Tropicbird.


# Produce the bird names file

In [11]:
new_bird_names = []
for file in files:
    file_components = file.split("/")
    if int(file_components[-1][0]) > 1: continue
    new_bird_name = file_components[-2]
    new_bird_names.append(new_bird_name)

with open('data/bird_names.txt', 'w+') as f:
    for items in new_bird_names:
        f.write('%s\n' %items)
    print("File written successfully")
f.close()

File written successfully


In [14]:
def get_desc(species_name, path, driver):
    driver.get(path)

    input_e = driver.find_element(By.ID, "species")
    input_e.send_keys(species_name)
    try:
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "Suggest-suggestion-0"))).click()
        species_desc_e = driver.find_element(By.CLASS_NAME, "u-stack-sm")
        return {"status":200, "desc": species_desc_e.text}
    except Exception as e:
        return {"status":400, "error": str(e)}

In [None]:
from tqdm import tqdm

file1 = open('data/bird_names.txt', 'r')
lines = file1.readlines()

failed_names = []
name_desc_pairs = {}
for line in tqdm(lines):
    res = get_desc(line, path, driver)
    if res["status"] == 200:
        name_desc_pairs[line] = res["desc"]
    elif res["status"] == 400:
        print(f"{line} failed: {res["error"]}")
        failed_names.append(line)

print("finished grabbing data..")
print("start writing files")
print("writing failed name file")

# post processing
with open('data/failed_names.txt', 'w+') as f:
    for items in tqdm(failed_names):
        f.write('%s\n' %items)
    print("Failed name file written successfully")
f.close()

import json

name_desc_pairs_obj = json.dumps(name_desc_pairs, indent=4)

print("writing name desc json file")
with open("data/name_desc_pairs.json", "w") as f:
    f.write(name_desc_pairs_obj)
    print("Name desc pair file written successfully")

f.close()

In [17]:
f = open('data/name_desc_pairs.json')

data = json.load(f)

f.close()

new_data = {}
for attribute, value in data.items():
    new_data[attribute[:-1]] = value

data_obj = json.dumps(new_data, indent=4)

print("writing name desc json file")
with open("data/name_desc_pairs.json", "w") as f:
    f.write(data_obj)
    print("Name desc pair file written successfully")

f.close()

writing name desc json file
Name desc pair file written successfully


In [18]:
file1 = open('data/failed_names.txt', 'r')
lines = file1.readlines()

failed_name_desc_pairs = {}
for line in lines:
    failed_name_desc_pairs[line[:-1]]=""

data_obj = json.dumps(failed_name_desc_pairs, indent=4)

print("writing failed name desc json file")
with open("data/failed_name_desc_pairs.json", "w") as f:
    f.write(data_obj)
    print("failed name desc pair file written successfully")

f.close()

writing failed name desc json file
failed name desc pair file written successfully


In [2]:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_desc_gsearch(species_name, path, driver):
    driver.get(path)

    input_e = driver.find_element(By.CLASS_NAME, "gLFyf")
    input_e.send_keys(species_name + " ebird")
    input_e.send_keys(Keys.ENTER)
    try:
        e = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='https://ebird.org/species/']")))
        driver.get(e.get_attribute('href'))
        species_desc_e = driver.find_element(By.CLASS_NAME, "u-stack-sm")
        return {"status":200, "desc": species_desc_e.text}
    except Exception as e:
        return {"status":400, "error": str(e)}

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')

driver = webdriver.Chrome(options=options)

In [7]:
path = "https://www.google.com/"
species_name = "WALL CREAPER"

print(get_desc_gsearch(species_name, path, driver))

{'status': 200, 'desc': 'Es inconfundible pero difícil de detectar en las paredes rocosas. Espectacular en vuelo, con alas anchas negras, rosadas y con manchas blancas, asemejándose a una mariposa gigante. Usualmente se encuentra en las paredes rocosas de alta montaña, pero también se reproduce en abruptas gargantas en altitudes más bajas e incluso se puede encontrar en edificios altos en invierno. El macho reproductor tiene una garganta negra; los machos en invierno, así como las hembras tienen la garganta blanca.'}


In [17]:
import json

file = open('data/failed_name_desc_pairs.json', 'r')
data = json.load(file)
keys = data.keys()

with open('data/failed_names.txt', 'w+') as f:
    for key in keys:
        f.write(f"{key}\n")

In [8]:
from tqdm import tqdm

file = open('data/gsearch_failed_names.txt', 'r')
lines = file.readlines()

gsearch_failed_names = []
gsearch_name_desc_pairs = {}
for line in tqdm(lines):
    res = get_desc_gsearch(line[:-1], path, driver)
    if res["status"] == 200:
        gsearch_name_desc_pairs[line[:-1]] = res["desc"]
    elif res["status"] == 400:
        print(f"{line[:-1]} failed: {res["error"]}")
        gsearch_failed_names.append(line[:-1])

print("finished grabbing data..")
print("start writing files")
print("writing failed name file")

# post processing
with open('data/gsearch_failed_names.txt', 'w+') as f:
    for items in tqdm(gsearch_failed_names):
        f.write('%s\n' %items)
    print("gsearch failed name file written successfully")
f.close()

import json

gsearch_name_desc_pairs_obj = json.dumps(gsearch_name_desc_pairs, indent=4)

print("writing gsearch name desc json file")
with open("data/gsearch_name_desc_pairs.json", "w") as f:
    f.write(gsearch_name_desc_pairs_obj)
    print("Name desc pair file written successfully")

f.close()

100%|███████████████████████████████████████████| 14/14 [00:24<00:00,  1.77s/it]


LOONEY BIRDS failed: Message: 
Stacktrace:
0   chromedriver                        0x0000000100faba20 chromedriver + 4389408
1   chromedriver                        0x0000000100fa432c chromedriver + 4358956
2   chromedriver                        0x0000000100bc0afc chromedriver + 281340
3   chromedriver                        0x0000000100c032c8 chromedriver + 553672
4   chromedriver                        0x0000000100c3bcec chromedriver + 785644
5   chromedriver                        0x0000000100bf7ed0 chromedriver + 507600
6   chromedriver                        0x0000000100bf88a8 chromedriver + 510120
7   chromedriver                        0x0000000100f733a4 chromedriver + 4158372
8   chromedriver                        0x0000000100f77e08 chromedriver + 4177416
9   chromedriver                        0x0000000100f59064 chromedriver + 4051044
10  chromedriver                        0x0000000100f786f4 chromedriver + 4179700
11  chromedriver                        0x0000000100f4c064 c

100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 34379.54it/s]

gsearch failed name file written successfully
writing gsearch name desc json file
Name desc pair file written successfully





In [10]:
file = open('data/name_desc_pairs.json', 'r')
data = json.load(file)
keys = data.keys()

for key in keys:
    with open(f'data/bird_descriptions/{key}.txt', 'w+') as f:
        f.write(f"{data[key]}")
        f.close()