In [79]:
from bs4 import BeautifulSoup
import requests
#from requests_html import AsyncHTMLSession 
import pandas as pd
import numpy as np 
import re

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pandarallel import pandarallel

import time

In [3]:
pandarallel.initialize(nb_workers=8)

options = Options()
options.add_argument("--headless")
#options.add_argument("start-maximized")
caps = webdriver.DesiredCapabilities().FIREFOX
caps["marionette"] = True

#binary = FirefoxBinary('/home/alex/repos/statics/geckodriver')
driver = webdriver.Firefox(options=options, capabilities=caps, executable_path='../statics/geckodriver')
#driver.set_window_size(1920, 1080)
#driver = webdriver.Chrome('../statics/chromedriver')#executable_path='../statics/chromedriver'

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [111]:
def generated_spatial_features_dataframe(path,limit=999999):
    key_value_link_list = []
    
    with open(path,'r') as f:
        urls = f.read()
        urls = urls.split('\n')
    
    for url in urls[:limit]:
        key = url.split(':')[2]
        key_value_link_list = key_value_link_list + fetch_group(url,key)
    
    return pd.DataFrame(key_value_link_list,columns=['key','value','osm_wiki_url'])
    

def get(url):
    #loop = asyncio.get_running_loop()
    r = requests.get(url)
    assert r.ok, (r.status_code,url) 
    return r.text

def make_selenium_soup(url):
    #necessary to render javascript some of the tag tables are produced by javascript
    driver.get(url)
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.wikitable.taginfo-taglist")))
        soup = BeautifulSoup(driver.page_source, 'lxml') 
    except:
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.wikitable")))
            soup = BeautifulSoup(driver.page_source, 'lxml') 
        except Exception as e:
            soup = e
    
    return soup   
                                                


def make_soup(url):
    text = get(url)
    soup = BeautifulSoup(text, 'lxml')
    
    return soup

def fetch_group(url,key):
    soup = make_selenium_soup(url)
    tmp_list = []
    key_found = False
    # some of the tag sites do not conform to the schema of the others
    # so they nee to be handeled with special care
    if key not in ['healthcare','industrial','water']:
        for tbody in soup.find_all('tbody'):
            for tr in tbody.find_all('tr'):
                for td in tr.find_all('td'):
                    title = td.text
                    if scrub_text(title) == key:
                        key_found = scrub_text(title)
                        continue
                    if key_found:
                        value = scrub_text(td.text)
                        try:
                            link = td.a['href']
                        except:
                            link = None
                            
                        tmp_list.append((key_found,value,fix_link(link)))
                        key_found = False
    
    elif key in ['healthcare','industrial']:
        for tbody in soup.find_all('tbody'):
            for tr in tbody.find_all('tr'):
                for td in tr.find_all('td'):
                    title = td.text
                    if (scrub_text(title)[:len(key)] == key) and ('=' in title):
                        value = scrub_text(title.split('=')[1])
                        key_found = scrub_text(title.split('=')[0])
                        try:
                            link = td.find(href=True)['href']
                        except:
                            link = None
                        
                        tmp_list.append((key_found,value,fix_link(link)))
    
    elif key in ['water']:
        for tbody in soup.find_all('tbody'):
            for tr in tbody.find_all('tr'):
                for td in tr.find_all('td'):
                    for tt in td.find_all('tt'):
                        title = tt.text
                        if (title.split('=')[0] == key) and ('=' in title):
                            value = scrub_text(title.split('=')[1])
                            key_found = scrub_text(title.split('=')[0])
                            
                            try:
                                link = tt.find(href=True)['href']
                            except:
                                link = None
                            
                            tmp_list.append((key_found,value,fix_link(link)))  
    
    return tmp_list


def get_wikidata(url):
    wikidata_url = None
    
    try:
        soup = make_soup(url)
        wikidata_url = soup.find('tr', {'class' : 'd_wikidata content'}).a['href']
    
    except Exception as e:
        wikidata_url = e
    
    return wikidata_url

def fix_link(link):
    
    if link:
        if not link[:4] == 'http':
            link = f"https://wiki.openstreetmap.org{link}"            
    
    return link


def get_tag_info(url):
    
    tag_info = None
    
    try:
        soup = make_soup(url)
        iframe_url = soup.find('tr', {'class' : 'd_taginfo content'}).iframe['src']
        soup = make_soup(f'https:{iframe_url}')
        #tag_info = soup.find('div',{'class':'content'}).find_all()
        tag_info = list(soup.find_all('td'))
        # lots of cleaning and filtering is going on here
        # first we only want those rows where the title contains 'in database'
        # second we replace \u202f with nothing and then turn our numbers into ints
        tag_info = [int(str(td.contents[0]).replace('\u202f','')) for td in tag_info if 'in database' in  str(td.get('title'))]
        tag_info = list(zip(['nodes','ways','relations'],tag_info))
    
    except Exception as e:
        tag_info = e

    return tag_info
    

def get_wikipedia_urls(url):
    
    temp_dict = {}
    soup = make_soup(url)
    wikibase = soup.find("div",{'class','wikibase-sitelinklistview'})
    wikidata_desc = soup.find('div',{'class','wikibase-entitytermsview-heading-description'})
    
    #here we also fetch the short description of the elemen on wikidata

    if wikibase:
        for li in wikibase.find_all('li'):
            link = li.find('a')['href']
            lang = li.findAll('span')[3]['lang']
            temp_dict[lang]=link
    if wikidata_desc:
        temp_dict['wikidata_desc'] = scrub_text(wikidata_desc.text)
        
    if not temp_dict:
        # later on None is easier to handle
        temp_dict = None
    
    return temp_dict

def scrub_text(text):
    # Drop footnote superscripts in brackets
    text = re.sub(r'\[.*?\]+', '', text)
    # Replace all non word charcters with a white space
    text = re.sub('\W', ' ',text)
    # trim all white spaces >2 to 1 
    text = re.sub('\s{2,}',' ',text)
    #remove whitspace(s) at either end of the string
    text = text.strip()
    
    return text.lower()


def plain_text_from_wiki(url):
    if url:
        soup = make_soup(url)
        text = ''
        for paragraph in soup.find_all('p'):
            text += paragraph.text
        plain_text = scrub_text(text)
    else:
        plain_text = None
    
    return plain_text


def get_wikidata_key(data):
    # the idea of this function is to get the wikidata url for a given key
    # this function is only invocted if the key=value combinations did not
    # produce a useable wikidata url as an backup we might want to use just the
    # wikidata url of the key
    
    key,wikidata_url = data
    try:
        if not 'https://www.wikidata.org/wiki/' in wikidata_url:    
            url = f'https://wiki.openstreetmap.org/wiki/Key:{key}'
            wikidata_url = get_wikidata(url)
    except:
        # sometimes wikidata_url isnt a string so we catch this here
        url = f'https://wiki.openstreetmap.org/wiki/Key:{key}'
        wikidata_url = get_wikidata(url)
      
    # this needs a second stage because not all keys have working wikidata url
    # so I compiled a list of edge cases that we load here and try to match 
    if not 'https://www.wikidata.org/wiki/' in wikidata_url:
        with open('./osm/osm_key_edgecases.txt','r') as f:
            special_key_cases = f.read()
            special_key_cases = special_key_cases.split('\n')
            special_key_cases =dict([(case.split(',')[0],case.split(',')[1]) for case in special_key_cases])
            wikidata_url = special_key_cases.get(key)
            
    return wikidata_url

In [17]:
kv_df = generated_spatial_features_dataframe('./osm/osm_groups.txt')

In [19]:
kv_df['wikidata_url'] = kv_df['osm_wiki_url'].parallel_apply(get_wikidata)

In [20]:
kv_df['tag_counts'] = kv_df['osm_wiki_url'].parallel_apply(get_tag_info)

In [72]:
#load edge cases here
#/osm_wikidata_edgecases.txt
with open('./osm/osm_key_value_edgecases.txt','r') as f:
    special_cases = f.read()
    special_cases = special_cases.split('\n')

for special_case in special_cases:
    key,value,wikidata_url = special_case.split(',')
    kv_df.loc[(kv_df['key'] == key) & (kv_df['value']==value), 'wikidata_url'] = wikidata_url

In [51]:
# if edge cases can not resolve a wikidata url for a given key,value pair
# the wikidata url of Key should be taken
kv_df['wikidata_url'] = kv_df[['key','wikidata_url']].apply(get_wikidata_key,axis=1)

In [59]:
# lastly I fix the information=trail_blaze;information=route_marker key values
# because they are kind of broken
kv_df = kv_df.drop(kv_df.loc[(kv_df['key'] == 'information') & (kv_df['value']=='trail_blaze route_marker')].index)

for value in ['trail_blaze','route_marker']:
    kv_df.append({'key':'information', 
                  'value':value, 
                  'osm_wiki_url':f'https://wiki.openstreetmap.org/wiki/Tag:information%3D{value}', 
                  'wikidata_url':'https://www.wikidata.org/wiki/Q1042490', 
                  "tag_counts":[]}, ignore_index=True)

In [67]:
pd.set_option('display.max_colwidth', None)
for row in list(kv_df[~kv_df['wikidata_url'].str.contains('https://www.wikidata.org/wiki/',na=False)].iterrows())[:]:
    print(row[1])
pd.set_option('display.max_colwidth', 50)

key                                                             route
value                                                             bus
osm_wiki_url    https://wiki.openstreetmap.org/wiki/Tag%3Aroute%3Dbus
wikidata_url                                                     None
tag_counts           [(nodes, 72), (ways, 1266), (relations, 206737)]
Name: 947, dtype: object
key                                                                    route
value                                                             trolleybus
osm_wiki_url    https://wiki.openstreetmap.org/wiki/Tag%3Aroute%3Dtrolleybus
wikidata_url                                                            None
tag_counts                       [(nodes, 0), (ways, 54), (relations, 3042)]
Name: 948, dtype: object
key                                                                 route
value                                                             minibus
osm_wiki_url    https://wiki.openstreetmap.org/wiki/Tag%3Aroute%3Dm

In [74]:
#some wikidata_urls contain an except these need to be filtered out
kv_df = kv_df.loc[kv_df['wikidata_url'].astype(str).str.contains('http')]
kv_df['wikipedia_dict'] = kv_df['wikidata_url'].parallel_apply(get_wikipedia_urls)

In [108]:
# now we just select the english 'en' wikipedia entries and 'wikidata_desc' by first unpacking the 'wikipedia_dict'
kv_df_just_eng = pd.concat([kv_df.drop(['wikipedia_dict'], axis=1), kv_df['wikipedia_dict'].apply(pd.Series)], axis=1)[['key','value','en','wikidata_desc']]
# and drop those rows that have niether 'en' nor 'wikipedia_dict'
kv_df_just_eng = kv_df_just_eng.drop(kv_df_just_eng.loc[kv_df_just_eng['en'].isnull() & kv_df_just_eng['wikidata_desc'].isnull()].index)
# also we replac all pandas nan with None because handling them is easier
kv_df_just_eng = kv_df_just_eng.where(pd.notnull(kv_df_just_eng), None)

In [112]:
# lastly we fetch the text
kv_df_just_eng['en_text'] = kv_df_just_eng['en'].parallel_apply(plain_text_from_wiki)

In [113]:
# and save the dataframe
kv_df_just_eng.to_pickle('./kv_df_just_eng.pickle')

In [114]:
kv_df_just_eng

Unnamed: 0,key,value,en,wikidata_desc,en_text
0,amenity,bar,https://en.wikipedia.org/wiki/Bar,establishment serving alcoholic beverages for ...,a bar is a long raised narrow table or bench d...
1,amenity,bbq,,structure for open air cooking,
2,amenity,biergarten,https://en.wikipedia.org/wiki/Beer_garden,outdoor area in which beer other drinks and lo...,a beer garden a loan translation from the germ...
3,amenity,cafe,https://en.wikipedia.org/wiki/Coffeehouse,establishment that serves coffee and tea,a coffeehouse coffee shop or café is an establ...
4,amenity,drinking_water,,place for doing something,
...,...,...,...,...,...
1145,water,reflecting_pool,https://en.wikipedia.org/wiki/Reflecting_pool,water feature found in gardens parks and at me...,a reflecting pool or reflection pool is a wate...
1146,water,moat,https://en.wikipedia.org/wiki/Moat,dry or watery ditch surrounding a fortificatio...,a moat is a deep broad ditch either dry or fil...
1147,water,pond,https://en.wikipedia.org/wiki/Pond,body of standing water either natural or man m...,a pond is an area filled with water either nat...
1148,water,wastewater,https://en.wikipedia.org/wiki/Wastewater,water that has been affected by human use,wastewater or waste water is any water that ha...
