# WikiAves Data Scrape

## Software needed before running the following scripts:
1. Create a clone of the ArcGIS Pro Python Environment (See https://pro.arcgis.com/en/pro-app/arcpy/get-started/what-is-conda.htm)
2. Activate the cloned ArcGIS Pro Python Environment in ArcGIS Pro
3. Install **beautifulsoup4** using the Python Package Manager

## Starting Jupyter Notebook from Python Command Prompt
```
(arcgispro-py3-clone) C:\Users\whitacrej\AppData\Local\ESRI\conda\envs\arcgispro-py3-clone>r:

(arcgispro-py3-clone) R:\>cd PARC\GIS\Projects\PARC_Brazil

(arcgispro-py3-clone) R:\PARC\GIS\Projects\PARC_Brazil>jupyter notebook
```

In [None]:
from bs4 import BeautifulSoup
import concurrent.futures as cf
import csv
from datetime import datetime
import os
import pandas as pd
import re
import requests as req

In [None]:
def create_id_lists(start, stop, records_per):
    '''Split the inclusive ID span start..stop into consecutive batches.

    Parameters:
        start: First record ID (inclusive).
        stop: Last record ID (inclusive).
        records_per: Maximum number of IDs per batch; must be at least 1.

    Returns:
        A list of range objects that together cover every ID from start
        through stop exactly once, in ascending order. Every batch holds
        records_per IDs except possibly the last, which holds the remainder.
        The list is empty when start > stop.

    Raises:
        ValueError: If records_per is less than 1 (the batches could never
            advance towards stop).
    '''
    if records_per < 1:
        raise ValueError('records_per must be a positive integer')

    # Cap every batch at stop + 1: the final batch never overshoots, and a
    # batch that ends exactly on ``stop`` does not cause that ID to be
    # emitted a second time in an extra one-element batch.
    return [range(first, min(first + records_per, stop + 1))
            for first in range(start, stop + 1, records_per)]

def wikiaves_scrape(i, timeout=30):
    '''Scrape one WikiAves media record and return its fields as a dict.

    The media details page for record ``i`` is requested first. Unless that
    page contains an "alert alert-danger" box, the record's EXIF page is
    requested as well and its entries are merged in.

    Parameters:
        i: WikiAves media record ID; interpolated into both request URLs.
        timeout: Seconds to wait on each HTTP request before ``requests``
            gives up. Defaults to 30.

    Returns:
        A dict mapping each field label to its text. 'Codigo', 'NomeComum'
        and 'Especie' come from dedicated page elements; all other keys are
        whatever "label: value" lines the two pages contain. Returns None
        when the details page shows an alert box or no fields were found.

    Raises:
        Nothing is caught here, so any error raised by ``requests``
        (connection failure, timeout) propagates to the caller.
    '''
    def clean(tag):
        # Element text, trimmed, with non-breaking spaces turned into spaces
        return tag.text.strip().replace('\xa0', ' ')

    def label_value(tag):
        # Split on the FIRST ': ' only, so a value that itself contains
        # ': ' is kept whole instead of being cut at its second separator
        return clean(tag).split(': ', 1)

    fields = []

    # Get general details
    url = f'https://www.wikiaves.com.br/_midia_detalhes.php?m={i}'
    soup = BeautifulSoup(req.get(url, timeout=timeout).text, 'html.parser')

    # An alert box means there is nothing to collect for this ID
    # (presumably the site's "record not found" page - TODO confirm)
    if soup.find_all('div', attrs={'class': 'alert alert-danger'}):
        return None

    wa_id = [['Codigo', div.text.strip()]
             for div in soup.find_all('div', attrs={'class': 'wa-recordid'})]
    wa_name = [['NomeComum', a.text.strip()]
               for a in soup.find_all('a', attrs={'class': 'wa-id m-link font-poppins'})]
    wa_sp = [['Especie', t.text.strip()]
             for a in soup.find_all('a', attrs={'class': 'm-link'})
             for t in a.find_all('i')]
    # Detail rows are the divs without class or id inside the details list;
    # the bare 'Local de Observação:' row carries no value and is skipped
    wa_data = [label_value(t)
               for div in soup.find_all('div', attrs={'class': 'wa-lista-detalhes'})
               for t in div.find_all('div', attrs={'class': '', 'id': ''})
               if clean(t) != 'Local de Observação:']
    fields.extend(wa_id + wa_name + wa_sp + wa_data)

    # Get EXIF details
    url = f'https://www.wikiaves.com.br/getExifInfo.php?f={i}'
    soup = BeautifulSoup(req.get(url, timeout=timeout).text, 'html.parser')
    # Skip EXIF entries that exactly repeat a pair already collected above
    wa_exif = [pair for pair in map(label_value, soup.find_all('div'))
               if pair not in fields]
    fields.extend(wa_exif)

    # Clean up data
    if not fields:
        return None
    # Entries without a ': ' separator have no value and are dropped
    return {item[0]: item[1] for item in fields if len(item) > 1}


In [None]:
# WikiAves totals on 2020-03-20 @ 1500 EDT
#   Photos:         2,901,429
#   Sounds:           178,136
#   Total:          3,079,565
#   Last Record ID: 3725764

''' Paramters '''
# Span of record IDs to scrape and the number of IDs handled per CSV file
start, stop = 1, 3_754_000  # or other numbers
records_per = 2_000  # or 5000

# Output CSV file name prefix and the folder the CSV files are written to
out_folder = r'R:\PARC\GIS\Projects\PARC_Brazil\RawData\WikiAves_WebScrapes\WikiAves_202004'
out_csv_name = 'WikiAves_Scrape'

In [None]:
# Split the configured ID span into batches (one CSV file per batch)
ids_list = create_id_lists(start, stop, records_per)

# Show the batches, then how many CSV files they will produce
print(ids_list, f'CSV Files to be created: {len(ids_list)}', sep='\n')

In [None]:
# Threaded WikiAves Scrape
# Scrapes each batch of IDs with a thread pool and writes one CSV per batch.

start_time = datetime.now()

# Ensure the destination exists before any scraping starts, so a missing
# folder cannot discard a fully scraped batch when its CSV is written
os.makedirs(out_folder, exist_ok=True)

for ids in ids_list:
    i_start_time = datetime.now()
    with cf.ThreadPoolExecutor() as pool:
        # Collect the results while the pool is still open; falsy results
        # (IDs for which wikiaves_scrape returned nothing) are dropped
        data = [row for row in pool.map(wikiaves_scrape, ids) if row]

    # Build the output CSV path (ID span + timestamp) and write the batch
    data_frame = pd.DataFrame(data)
    csv_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    out_csv = os.path.join(out_folder, f'{out_csv_name}_{ids[0]}-{ids[-1]}_{csv_time}.csv')
    data_frame.to_csv(out_csv, index=False)

    i_end_time = datetime.now()
    i_time = str(i_end_time - i_start_time)

    print(f'CSV Created in {i_time}: {out_csv}')

end_time = datetime.now()
total_time = str(end_time - start_time)

print(f'Total Time: {total_time}')

# Non-threaded code (for comparison only) - Do Not Run!

In [None]:
# Non-threaded (for comparison only...)
# Same per-batch workflow as the threaded cell, but IDs are scraped one at a time.

start_time = datetime.now()

for ids in ids_list:
    i_start_time = datetime.now()

    # Scrape every ID of the batch sequentially, keeping the results in order
    data = [wikiaves_scrape(i) for i in ids]

    # Drop the IDs that produced nothing and tabulate the rest
    rows = filter(None, data)
    data_frame = pd.DataFrame(rows)

    # Name the CSV after the batch's ID span plus a timestamp, then write it
    csv_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_name = f'{out_csv_name}_{ids[0]}-{ids[-1]}_{csv_time}.csv'
    out_csv = os.path.join(out_folder, csv_name)
    data_frame.to_csv(out_csv, index=False)

    # Report how long this batch took
    i_end_time = datetime.now()
    i_time = str(i_end_time - i_start_time)
    print(f'CSV Created in {i_time}: {out_csv}')

# Report the duration of the whole run
end_time = datetime.now()
total_time = str(end_time - start_time)
print(f'Total Time: {total_time}')