## Use PollenLibrary.com to create a metadata table for pollen counting and organizing

In [None]:
# Generate Latin Name Metadata table from pollenlibrary.com

## Scrape Webpages and store html pages in raw

# Site root; the letter-search path and the species-page hrefs are appended to it.
WEBSITE_URL = 'https://www.pollenlibrary.com'

## pollens
import requests
import json
import re
from bs4 import BeautifulSoup
import datetime
from dateutil.parser import parse
import string
import random

def clean_string(s):
    '''
    Remove every non-alphanumeric character from a string.

    Runs of non-word characters and underscores (whitespace, punctuation,
    brackets, "_") are deleted, so only letters and digits remain.

    Args:
        s: Value to clean. Anything that is not a str is passed through.

    Returns:
        The cleaned string, or s itself when it is not a str.
    '''
    if not isinstance(s, str):
        return s
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    return re.sub(r"[\W_]+", "", s)

def re_braces(s):
    '''
    Remove bracketed segments, brackets included, from a string.

    A segment starts at "(" or "[" and ends at the nearest following ")" or
    "]" on the same line (the match is non-greedy and "." does not cross
    newlines).

    Args:
        s: Value to process. Anything that is not a str is passed through.

    Returns:
        The string without its bracketed segments, or s itself when it is
        not a str. Surrounding whitespace is left in place.
    '''
    if not isinstance(s, str):
        return s
    # Raw string: "\(" etc. inside a plain literal are invalid escape sequences.
    return re.sub(r"[\(\[].*?[\)\]]", "", s)

def name_filter(s):
    '''
    Normalise a taxonomy name into a lower-case alphanumeric key fragment.

    Bracketed text is dropped first, then every remaining non-alphanumeric
    character is removed and the result is lower-cased.

    Args:
        s: Name to normalise. Non-string values are converted with str(),
            so None becomes 'none'.

    Returns:
        The normalised string.
    '''
    # Brackets have to be handled before clean_string(): it deletes the
    # bracket characters themselves, after which re_braces() can no longer
    # find anything and the bracketed text would stay in the key.
    s = re_braces(s)
    s = clean_string(s)
    s = str(s)
    return s.replace(' ', '').lower()
def get_webpage(url, timeout=30):
    '''
    Download a page and return its body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server. requests applies no timeout
            unless one is given, so a stalled server would otherwise block
            the scrape indefinitely.

    Returns:
        The response text when the server answers with status 200,
        otherwise None (other status code, timeout or connection failure).
    '''
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Report network failures the same way as a bad status: no page.
        print(f'ERR GET {url}: {err}')
        return None
    if res.status_code == 200:
        return res.text
    return None


def get_webpage_soup(html, html_attr=None, attr_key_val=None):
    '''
    Parse HTML text, optionally narrowing the result to one element.

    Args:
        html: Markup to parse; a falsy value (None, '') yields None.
        html_attr: Optional tag name handed to find().
        attr_key_val: Optional attribute filter handed to find() together
            with html_attr.

    Returns:
        None for falsy html; the parsed document when html_attr is falsy;
        otherwise whatever find(html_attr, attr_key_val) returns, which is
        None when nothing matches.
    '''
    if not html:
        return None

    document = BeautifulSoup(html, 'html.parser')
    if not html_attr:
        return document
    return document.find(html_attr, attr_key_val)

def get_all_pollen_types_for_letter(letter:str):
    '''
    Return the species-page URLs linked from the search page of one letter.

    Args:
        letter: Letter appended to the /SEARCH/LETTER/ path.

    Returns:
        A list of absolute URLs, one per anchor whose href contains
        '/Specie/'. Empty when the page could not be fetched or has no
        such links.
    '''
    html = get_webpage(f"{WEBSITE_URL}/SEARCH/LETTER/{letter}")
    soup = get_webpage_soup(html)
    if soup is None:
        # Nothing was downloaded, so there is nothing to list.
        return []

    # href=True skips anchors that have no href attribute; for those
    # a.get('href') is None and the substring test would raise TypeError.
    return [
        f"{WEBSITE_URL}{a['href']}"
        for a in soup.find_all('a', href=True)
        if '/Specie/' in a['href']
    ]

def get_subpage_content(url:str):
    '''
    Scrape the content part of the sub page.

    Args:
        url: Address of the species page.

    Returns:
        A dict with
        'url': the requested address,
        'content': the markup of <div id="content"> as a string, or None
            when the page could not be fetched or has no such div,
        'scraped_at': datetime.datetime.now() at the time of the call.
    '''
    soup = get_webpage_soup(get_webpage(url))
    content = soup.find('div', {'id': 'content'}) if soup is not None else None
    return {
        'url': url,
        # str(None) would store the literal text 'None' as page content.
        'content': str(content) if content is not None else None,
        'scraped_at': datetime.datetime.now()
    }

class PollenTaxonomyInfo:
    '''
    Record holding the taxonomy and allergy metadata of one species.

    Each constructor argument is stored unchanged on an attribute of the
    same name. The extra attribute id is built from family, genus and
    species, each passed through name_filter() and joined with '_'.
    '''
    def __init__(
        self,
        friendly_name,
        family,
        genus,
        species,
        latin_name,
        description,
        sub_group_description,
        allergenicity,
        allergy_level,
        pollination_season_text,
        wiki_link=None,
        kingdom=None,
        division=None,
        class_=None,
        order=None
        ):
        attributes = {
            'friendly_name': friendly_name,
            'family': family,
            'genus': genus,
            'species': species,
            'latin_name': latin_name,
            'description': description,
            'sub_group_description': sub_group_description,
            'allergenicity': allergenicity,
            'allergy_level': allergy_level,
            'pollination_season_text': pollination_season_text,
            'wiki_link': wiki_link,
            'kingdom': kingdom,
            'division': division,
            'class_': class_,
            'order': order,
        }
        # Assigned in parameter order, so __dict__ lists them in that order.
        for attr_name, attr_value in attributes.items():
            setattr(self, attr_name, attr_value)

        key_parts = (self.family, self.genus, self.species)
        self.id = "_".join(name_filter(part) for part in key_parts)

# Phrases looked for in the allergenicity sentence, checked in this order.
_ALLERGY_PHRASES = (
    ('no allergy', 0),
    ('mild allergen', 1),
    ('moderate allergen', 2),
    ('severe allergen', 3),
)


def _allergy_level(allergenicity):
    '''Map the allergenicity sentence to 0-3, or -1 when it is unknown.'''
    if allergenicity is None:
        return -1
    sentence = allergenicity.lower()
    for phrase, level in _ALLERGY_PHRASES:
        if phrase in sentence:
            return level
    return -1


def _div_text(soup, div_id, label, part=None):
    '''Return the text of the div with this id, or one ': '-separated part of it.'''
    try:
        text = soup.find('div', {'id': div_id}).text
        return text if part is None else text.split(': ')[part]
    except (AttributeError, IndexError):
        # Div missing, or fewer ': ' separators than expected.
        print(f'ERR SOUP {label}')
        return None


def _wikipedia_link(soup):
    '''Return the first wikipedia.org href in the genus-links div, or None.'''
    links_div = soup.find('div', {'id': 'main_lib_content_genusLinks'})
    if links_div is None:
        return None
    # href=True skips anchors without an href attribute.
    hrefs = (a['href'] for a in links_div.find_all('a', href=True))
    return next((href for href in hrefs if 'wikipedia.org' in href), None)


def parse_subpage_content(content:str):
    '''
    Use BeautifulSoup to pull out content metadata.

    Reads family, genus and species from the taxonomy div, the descriptive
    texts from their own divs, and follows the Wikipedia link (when there is
    one) through wiki_subparse(). Family, genus, species and latin name from
    Wikipedia take precedence over the values found on the page itself.

    Args:
        content: Markup of a species page's content div.

    Returns:
        The attribute dict of a PollenTaxonomyInfo built from the page.

    Raises:
        AttributeError: when content is empty or the Family/Genus/Species
            markup is missing; these fields are required.
    '''
    soup = get_webpage_soup(content)
    taxonomy_info = soup.find('div',{'id':'main_lib_content_taxonomy'})
    ### taxonomy sub groups
    family = taxonomy_info.find('span', class_='headerGrn2', string='Family: ').find_next('a').text.lower()
    genus = taxonomy_info.find('span', class_='headerGrn2', string='Genus: ').find_next('a').text
    species_group = taxonomy_info.find('span', class_='headerGrn2', string='Species: ').find_next('i').text
    friendly_name = re_braces(species_group).strip()

    # The species text looks like "Friendly Name (Genus species)"; either the
    # parentheses or the second word may be absent.
    latin_match = re.search(r'\((.*?)\)', species_group)
    latin_name = latin_match.group(1).strip() if latin_match else None
    latin_words = latin_name.split() if latin_name else []
    species = latin_words[1] if len(latin_words) > 1 else None

    description = _div_text(soup, 'main_lib_content_divNativity', 'Description')
    allergenicity = _div_text(soup, 'main_lib_content_divAllergenicity', 'allergenicity', part=1)
    allergy_level = _allergy_level(allergenicity)
    pollination = _div_text(soup, 'main_lib_content_divPollination', 'pollination', part=2)
    plant_group = _div_text(soup, 'main_lib_content_divPlantGroup', 'plant_group')

    wiki_link = _wikipedia_link(soup)
    wiki_data = wiki_subparse(wiki_url=wiki_link)

    def prefer_wiki(key, fallback):
        # Wikipedia's value wins whenever it was found.
        value = wiki_data[key]
        return value if value is not None else fallback

    obj = PollenTaxonomyInfo(
            friendly_name,
            family = prefer_wiki('family', family),
            genus = prefer_wiki('genus', genus),
            species = prefer_wiki('species', species),
            latin_name = prefer_wiki('latin_name', latin_name),
            description=description,
            sub_group_description=plant_group,
            allergenicity=allergenicity,
            allergy_level=allergy_level,
            pollination_season_text = pollination,
            wiki_link=wiki_link,
            kingdom = wiki_data['kingdom'],
            division = wiki_data['division'],
            class_ = wiki_data['class'],
            order = wiki_data['order'],
    )

    return obj.__dict__


def wiki_subparse(wiki_url:str):
    '''
    Determine full taxonomy tree from wiki page.

    For each rank the first table cell whose text contains the rank name is
    located and the text of its next sibling cell is taken as the value. For
    'Species' only the second space-separated word of that text is kept. The
    latin name is the first <i> following the span with class 'binomial'.

    Args:
        wiki_url: Address of the Wikipedia article, or None.

    Returns:
        A dict with the keys kingdom, division, class, order, family, genus,
        species and latin_name. A value stays None when it was not found,
        when wiki_url is None, or when the page could not be fetched.
    '''
    taxonomy_tree = {
        'kingdom':None,
        'division':None,
        'class':None,
        'order':None,
        'family':None,
        'genus':None,
        'species':None,
        'latin_name':None
    }
    if wiki_url is None:
        return taxonomy_tree

    soup = get_webpage_soup(get_webpage(wiki_url))
    if soup is None:
        print(f'ERR WIKI fetch {wiki_url}')
        return taxonomy_tree

    # Each rank is resolved on its own, so one malformed row no longer
    # discards the ranks after it or the latin name.
    cells = soup.find_all('td')
    for category in ('Kingdom', 'Division', 'Class', 'Order', 'Family', 'Genus', 'Species'):
        for td in cells:
            if category not in td.text:
                continue
            value_cell = td.find_next_sibling('td')
            if value_cell is None:
                continue
            value = value_cell.text.strip().replace('.\xa0',' ')
            if category == 'Species':
                words = value.split(' ')
                if len(words) < 2:
                    continue
                value = words[1]
            taxonomy_tree[category.lower()] = value
            break

    binomial = soup.find('span', class_='binomial')
    italic = binomial.find_next('i') if binomial is not None else None
    if italic is not None:
        taxonomy_tree['latin_name'] = italic.text.strip()
    else:
        print(f'ERR WIKI latin_name {wiki_url}')
    return taxonomy_tree


def runner(sample_size=None):
    '''
    Scrape every species page reachable from the A-Z search pages.

    Args:
        sample_size: When given, at most this many randomly chosen species
            pages are scraped per letter (useful for a quick trial run).
            None scrapes everything.

    Returns:
        A list with one dict per successfully parsed species page: the
        fields from parse_subpage_content() plus 'scrape_url' and
        'scraped_at'. Pages that fail to download or parse are reported
        with print() and skipped.
    '''
    taxonomy_objs = []
    for letter in string.ascii_uppercase:
        try:
            letter_pollen_types = get_all_pollen_types_for_letter(letter)
        except Exception as err:
            print(f'ERR LETTER {letter}: {err}')
            continue
        if sample_size is not None and len(letter_pollen_types) > sample_size:
            letter_pollen_types = random.sample(letter_pollen_types, sample_size)
        for pollen_type_url in letter_pollen_types:
            try:
                content_obj = get_subpage_content(pollen_type_url)
                pollen_info_obj = parse_subpage_content(content_obj['content'])
            except Exception as err:
                # A single bad page must not throw away the pages already
                # collected during a long scrape.
                print(f'ERR PAGE {pollen_type_url}: {err}')
                continue
            pollen_info_obj['scrape_url'] = content_obj['url']
            pollen_info_obj['scraped_at'] = content_obj['scraped_at']
            taxonomy_objs.append(pollen_info_obj)

    return taxonomy_objs

In [None]:
# Runs the whole scrape over the network. Despite the name, runner()
# returns a list holding one dict per species.
taxonomy_dict = runner()

'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_next'
'NoneType' object has no attribute 'find_next'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_next'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_next'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_next'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribut

In [None]:
import pandas as pd
# Tabulate the scraped records: one row per species dict from runner().
df = pd.DataFrame(taxonomy_dict)

In [None]:
#df.to_json('../gdrive/MyDrive/datalake-raw/raw/data/pollenlibrary/full.json',orient='records')

In [None]:
# Family names taken from Wikipedia keep their original capitalisation while
# those read from the site are already lower-cased; make the column uniform.
df['family'] = df['family'].str.lower()
df

Unnamed: 0,friendly_name,family,genus,species,latin_name,description,sub_group_description,allergenicity,allergy_level,pollination_season_text,wiki_link,kingdom,division,class_,order,id,scrape_url,scraped_at
0,Aaron's-Beard,hypericaceae,Hypericum,calycinum,Hypericum calycinum,These plants are from foreign areas (those tha...,Angiosperm - Flowering Dicot: Plants in this g...,No allergy has been reported for Aaron's-Beard...,0,Summer to Fall.,http://en.wikipedia.org/wiki/Hypericum_calycinum,Plantae,,,Malpighiales,hypericaceae_hypericum_calycinum,https://www.pollenlibrary.com/Specie/Hypericum...,2023-06-22 18:53:38.433883
1,Alaska-Cedar,cupressaceae,CallitropsisOerst.,nootkatensis,Callitropsis nootkatensis,This species is native to North America north ...,Gymnosperm: Any plant such as a conifer whose ...,Alaska-Cedar (Chamaecyparis nootkatensis) is a...,2,Spring.,http://en.wikipedia.org/wiki/Chamaecyparis_noo...,Plantae,Pinophyta,Pinopsida,Cupressales,cupressaceae_callitropsisoerst_nootkatensis,https://www.pollenlibrary.com/Specie/Chamaecyp...,2023-06-22 18:53:39.035442
2,Alder-Leaf Mountain-Mahogany,rosaceae,Cercocarpus,montanus,Cercocarpus montanus,This species is native to North America north ...,Angiosperm - Flowering Dicot: Plants in this g...,Alder-Leaf Mountain-Mahogany (Cercocarpus mont...,1,all year long.,http://en.wikipedia.org/wiki/Cercocarpus_montanus,Plantae,,,Rosales,rosaceae_cercocarpus_montanus,https://www.pollenlibrary.com/Specie/Cercocarp...,2023-06-22 18:53:39.688714
3,Alfalfa,fabaceae,Medicago,sativa,Medicago sativa,These plants are from foreign areas (those tha...,Angiosperm - Flowering Dicot: Plants in this g...,Alfalfa (Medicago sativa) is a moderate allerg...,2,Spring to Fall.,http://en.wikipedia.org/wiki/Medicago_sativa,Plantae,,,Fabales,fabaceae_medicago_sativa,https://www.pollenlibrary.com/Specie/Medicago+...,2023-06-22 18:53:40.255144
4,Allegheny-Chinkapin,fagaceae,Castanea,pumila,Castanea pumila,This species is native to North America north ...,Angiosperm - Flowering Dicot: Plants in this g...,Allegheny-Chinkapin (Castanea pumila) is a mil...,1,Winter to Summer.,http://en.wikipedia.org/wiki/Castanea_pumila,Plantae,,,Fagales,fagaceae_castanea_pumila,https://www.pollenlibrary.com/Specie/Castanea+...,2023-06-22 18:53:41.171279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,Yellow Flat Sedge,cyperaceae,Cyperus,flavescens,Cyperus flavescens,This species is native to North America north ...,Angiosperm - Flowering Monocot: Plants in this...,Yellow Flat Sedge (Cyperus flavescens) is a mi...,1,Summer to Winter.,http://en.wikipedia.org/wiki/Cyperus_flavescens,Plantae,,,Poales,cyperaceae_cyperus_flavescens,https://www.pollenlibrary.com/Specie/Cyperus+f...,2023-06-22 19:04:51.652761
1001,Yellow Indian Grass,poaceae,Sorghastrum,nutans,Sorghastrum nutans,This species is native to North America north ...,Angiosperm - Flowering Monocot: Plants in this...,Yellow Indian Grass (Sorghastrum nutans) is a ...,1,Summer to Fall.,http://en.wikipedia.org/wiki/Sorghastrum_nutans,Plantae,,,Poales,poaceae_sorghastrum_nutans,https://www.pollenlibrary.com/Specie/Sorghastr...,2023-06-22 19:04:52.219600
1002,Yellow Sweet-Clover,fabaceae,Melilotus,officinalis,Melilotus officinalis,These plants are from foreign areas (those tha...,Angiosperm - Flowering Dicot: Plants in this g...,Yellow Sweet-Clover (Melilotus officinalis) is...,1,Spring to Fall.,http://en.wikipedia.org/wiki/Melilotus_officin...,Plantae,,,Fabales,fabaceae_melilotus_officinalis,https://www.pollenlibrary.com/Specie/Melilotus...,2023-06-22 19:04:52.809843
1003,Yerba-de-Pasmo,asteraceae,Baccharis,pteronioides,Baccharis pteronioides,This species is native to North America north ...,Angiosperm - Flowering Dicot: Plants in this g...,Yerba-de-Pasmo (Baccharis pteronioides) is a s...,3,Spring to Summer.,http://en.wikipedia.org/wiki/Baccharis_pteroni...,Plantae,,,Asterales,asteraceae_baccharis_pteronioides,https://www.pollenlibrary.com/Specie/Baccharis...,2023-06-22 19:04:53.397975
