In [325]:
from bs4 import BeautifulSoup
import requests
#from requests_html import AsyncHTMLSession 
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import time

In [418]:
# Launch the Firefox instance that every scraping helper in this notebook
# reaches through the module-level name `driver`.
options = Options()
# Uncomment to run without a visible browser window.
#options.add_argument("--headless")
options.add_argument("start-maximized")
# DesiredCapabilities.FIREFOX is a class-level dict shared by the whole
# process; copy it so that setting a key here does not modify the shared
# template.
caps = webdriver.DesiredCapabilities().FIREFOX.copy()
caps["marionette"] = True

#binary = FirefoxBinary('/home/alex/repos/statics/geckodriver')
driver = webdriver.Firefox(options=options, capabilities=caps, executable_path='../statics/geckodriver')
# Use a fixed 1920x1080 window.
driver.set_window_size(1920, 1080)
#driver = webdriver.Chrome('../statics/chromedriver')#executable_path='../statics/chromedriver'

In [454]:
def generated_spatial_features_dataframe(path,limit=999999):
    """Scrape every OSM wiki key page listed in a text file into one DataFrame.

    Parameters
    ----------
    path : str
        Text file with one wiki URL per line. The key is the text after the
        second ':' of the URL, e.g.
        ``https://wiki.openstreetmap.org/wiki/Key:water`` -> ``water``.
    limit : int, optional
        Maximum number of (non-blank) URLs to process.

    Returns
    -------
    pandas.DataFrame
        One row per scraped tag with the columns ``key``, ``value`` and
        ``osm_wiki_url``.

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two ':' characters.
    """
    with open(path,'r') as f:
        # Blank lines (typically the one produced by a trailing newline) carry
        # no URL and would make the key lookup below fail, so drop them here.
        urls = [line.strip() for line in f.read().split('\n') if line.strip()]

    key_value_link_list = []
    for url in urls[:limit]:
        key = url.split(':')[2]
        # extend() appends in place instead of rebuilding the list per URL.
        key_value_link_list.extend(fetch_group(url,key))

    return pd.DataFrame(key_value_link_list,columns=['key','value','osm_wiki_url'])
    

def get(url):
    """Load `url` in the shared browser and return the resulting page source."""
    driver.get(url)
    page_html = driver.page_source
    return page_html

def make_selenium_soup(url):
    """Load `url` in the shared browser and parse the rendered page.

    Some of the tag tables are produced by JavaScript, so the page is loaded
    through Selenium and the function waits (up to 5 seconds per selector)
    for a tag table to appear: first the taginfo tag list, then any
    ``table.wikitable``.

    Returns
    -------
    BeautifulSoup
        The parsed page source. If neither table shows up in time a warning
        is printed and the page is parsed as it is, so callers always get an
        object they can search (it then simply contains no matching tables).
    """
    driver.get(url)

    selectors = ("table.wikitable.taginfo-taglist", "table.wikitable")
    for selector in selectors:
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
            break
        except Exception:
            # Not present within the timeout: fall through to the next,
            # less specific selector.
            continue
    else:
        print(f"no wikitable found on {url}; parsing the page as rendered")

    return BeautifulSoup(driver.page_source, 'lxml')
                                                


def make_soup(url):
    """Fetch `url` via get() and return its page source parsed with lxml."""
    return BeautifulSoup(get(url), 'lxml')

def _osm_wiki_url(href):
    """Return the absolute OSM wiki URL for a site-relative href (None if there is no href)."""
    if not href:
        return None
    return f"https://wiki.openstreetmap.org{href}"


def _first_href(tag):
    """Return the href of the first descendant of `tag` that has one, else None."""
    linked = tag.find(href=True)
    return linked['href'] if linked is not None else None


def _iter_cells(soup):
    """Yield each <td> reached by walking tbody -> tr -> td, in document order."""
    for tbody in soup.find_all('tbody'):
        for tr in tbody.find_all('tr'):
            yield from tr.find_all('td')


def fetch_group(url,key):
    """Scrape the (key, value, wiki url) triples from one OSM wiki key page.

    Parameters
    ----------
    url : str
        Address of the key page; it is rendered through make_selenium_soup.
    key : str
        The OSM key the page describes. It selects the parsing strategy:

        * ``healthcare`` / ``industrial``: a cell whose text starts with the
          key and contains ``=`` holds ``key=value``.
        * ``water``: the same, but read from the ``<tt>`` elements inside
          each cell.
        * any other key: a cell whose scrubbed text equals the key is
          followed by the cell that holds the value and its link.

    Returns
    -------
    list of tuple
        ``(key, value, osm_wiki_url)``; ``osm_wiki_url`` is None when the
        cell carries no link.
    """
    soup = make_selenium_soup(url)
    tmp_list = []

    # some of the tag sites do not conform to the schema of the others
    # so they need to be handled with special care
    if key in ['healthcare','industrial']:
        for td in _iter_cells(soup):
            title = td.text
            if scrub_text(title).startswith(key) and '=' in title:
                value = scrub_text(title.split('=')[1])
                tmp_list.append((key,value,_osm_wiki_url(_first_href(td))))

    elif key in ['water']:
        for td in _iter_cells(soup):
            for tt in td.find_all('tt'):
                title = tt.text
                # Only <tt> elements that really hold "water=<value>" produce
                # a row; everything else in the cell is skipped.
                if scrub_text(title).startswith(key) and '=' in title:
                    value = scrub_text(title.split('=')[1])
                    tmp_list.append((key,value,_osm_wiki_url(_first_href(tt))))

    else:
        key_found = False
        for td in _iter_cells(soup):
            if scrub_text(td.text) == key:
                # The next cell holds the value that belongs to this key.
                key_found = True
                continue
            if key_found:
                key_found = False
                value = scrub_text(td.text)
                anchor = td.a
                href = anchor.get('href') if anchor is not None else None
                if href is None:
                    # Report value cells without a link so they can be checked by hand.
                    print(key,value)
                tmp_list.append((key,value,_osm_wiki_url(href)))

    return tmp_list


def get_wikidata(url):
    """Return the Wikidata link shown on an OSM wiki tag page.

    The link is the href of the first anchor inside the
    ``<tr class="d_wikidata content">`` row. Any exception raised while
    loading or parsing the page is caught and returned in place of the URL.
    """
    try:
        page = make_soup(url)
        wikidata_row = page.find('tr', {'class' : 'd_wikidata content'})
        return wikidata_row.a['href']
    except Exception as err:
        return err

def get_tag_info(url):
    """Return the usage counts embedded in an OSM wiki tag page.

    The tag page embeds an iframe inside its ``<tr class="d_taginfo content">``
    row; that iframe is loaded and the numbers of all cells whose title
    contains 'in database' are paired with 'nodes', 'ways' and 'relations'.

    Returns a list of (label, count) tuples, or the caught exception object
    if anything goes wrong while loading or parsing.
    """
    try:
        wiki_page = make_soup(url)
        iframe_url = wiki_page.find('tr', {'class' : 'd_taginfo content'}).iframe['src']
        taginfo_page = make_soup(f'https:{iframe_url}')

        counts = []
        for cell in taginfo_page.find_all('td'):
            # only cells whose title mentions 'in database' carry a count
            if 'in database' not in str(cell.get('title')):
                continue
            # strip the narrow no-break spaces (\u202f) before converting to int
            counts.append(int(str(cell.contents[0]).replace('\u202f','')))

        return list(zip(['nodes','ways','relations'],counts))
    except Exception as err:
        return err
    

def get_wikipedia_urls(url):
    """Collect the per-language sitelinks listed on a Wikidata item page.

    Parameters
    ----------
    url : str
        Address of the Wikidata page.

    Returns
    -------
    dict or None
        Maps the ``lang`` attribute of each sitelink entry to the entry's
        link. None if the page has no ``wikibase-sitelinklistview`` block.
        Entries without a link or without a language attribute are skipped.
    """
    soup = make_soup(url)
    # The attribute filter has to be a dict mapping attribute name to value.
    wikibase = soup.find("div", {'class': 'wikibase-sitelinklistview'})

    # if the page does not contain a class=wikibase-sitelinklistview we ignore it
    if wikibase is None:
        return None

    links_by_lang = {}
    for li in wikibase.find_all('li'):
        anchor = li.find('a', href=True)
        spans = li.find_all('span')
        # The language code is read from the fourth <span> of the entry.
        if anchor is None or len(spans) < 4 or not spans[3].has_attr('lang'):
            continue
        links_by_lang[spans[3]['lang']] = anchor['href']

    return links_by_lang

def scrub_text(text):
    """Normalise text to lower-case words separated by single spaces.

    Bracketed footnote markers are removed, every non-word character becomes
    a space, runs of whitespace are collapsed and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw text, e.g. the text of a table cell.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    # Drop footnote markers in brackets, e.g. "[1]"
    text = re.sub(r'\[.*?\]+', '', text)
    # Replace every non-word character (punctuation, '=', newlines, ...) with a space
    text = re.sub(r'\W', ' ', text)
    # Collapse runs of two or more whitespace characters into a single space
    text = re.sub(r'\s{2,}', ' ', text)
    # Remove whitespace at either end of the string
    return text.strip().lower()


def plain_text_from_wiki(url):
    """Return the scrubbed text of all <p> elements of the page at `url`.

    The paragraph texts are concatenated without a separator and then passed
    through scrub_text.
    """
    page = make_soup(url)
    raw_text = ''.join(paragraph.text for paragraph in page.find_all('p'))
    return scrub_text(raw_text)

In [6]:
# Scrape every key page listed in ./osm_groups.txt into one DataFrame with
# the columns key, value and osm_wiki_url.
kv_df = generated_spatial_features_dataframe('./osm_groups.txt')

In [73]:
# Look up the Wikidata link on each tag's wiki page. get_wikidata returns the
# caught exception object instead of a URL for rows where the lookup fails.
kv_df['wikidata_url'] = kv_df['osm_wiki_url'].apply(get_wikidata)

In [79]:
# Fetch the node/way/relation counts for each tag. get_tag_info returns the
# caught exception object for rows where the lookup fails.
kv_df['tag_counts'] = kv_df['osm_wiki_url'].apply(get_tag_info)    

In [113]:
# Print, at unlimited column width, the rows whose wikidata_url is not a
# Wikidata link, skipping the first 85 of them; afterwards the column width
# is set back to 50.
pd.set_option('display.max_colwidth', None)
for row in kv_df.loc[~kv_df['wikidata_url'].str.contains('https://www.wikidata.org/wiki/',na=False)].iloc[85:].iterrows():
    # row is an (index label, Series) pair; only the Series is printed
    print(row[1])
pd.set_option('display.max_colwidth', 50)

key                                                                                                                                                                                                                                                                                                                                                                            railway
value                                                                                                                                                                                                                                                                                                                                                                        abandoned
osm_wiki_url                                                                                                                                                                                                                                              

In [456]:
# Debug cell: render a single key page and keep its soup in `s` so the cells
# below can inspect the table structure by hand.
with open('./osm_groups.txt','r') as f:
    urls = f.read().split('\n')

for url in urls[:1]:
    # The URL from the file is overridden with the page under investigation.
    # Other pages that needed special handling:
    #https://wiki.openstreetmap.org/wiki/Key:industrial,https://wiki.openstreetmap.org/wiki/Key:water
    url = 'https://wiki.openstreetmap.org/wiki/Key:water'
    print(url)
    #key = url.split(':')[2]
    # make_selenium_soup is the helper defined in this notebook that renders
    # a page and returns its soup.
    s = make_selenium_soup(url)
    #print(fetch_group(url,key)[:3])

    print("####################")

https://wiki.openstreetmap.org/wiki/Key:water
####################


In [467]:
# Drill into the first table: third row, first cell, second <tt>, and show
# the first element inside it that carries an href.
s.find_all('tbody')[0].find_all('tr')[2].find_all('td')[0].find_all('tt')[1].find(href=True) #.find(href=True)['href']

<a href="/wiki/Tag:water%3Dlake" title="Tag:water=lake"><bdi>lake</bdi></a>

In [412]:
# Scrubbed text of the first cell in the third row of the first table.
# scrub_text already trims both ends, so the trailing .strip() changes nothing.
scrub_text(s.find_all('tbody')[0].find_all('tr')[2].find('td').text).strip()

'amenity'

In [None]:
# There are some edge cases where a wikidata_url should be added but,
# because of the structure of the OSM wiki, is not (for example building:yes),
# so we load the list of edge cases and add them to the dataframe.


# If the edge-case list cannot resolve a wikidata url for a given key/value pair,
# the wikidata url of the key itself should be used instead.

In [None]:
# Some wikidata_url entries hold an exception object instead of a URL; keep
# only the rows whose value contains 'http'.
# .copy() makes the filtered frame independent of the old one, so the column
# assignment below does not write into a slice (no SettingWithCopyWarning).
kv_df = kv_df.loc[kv_df['wikidata_url'].astype(str).str.contains('http')].copy()
kv_df['wikipedia_dict'] = kv_df['wikidata_url'].apply(get_wikipedia_urls)
kv_df

In [None]:
# Expand each wikipedia_dict into one column per language code, keep only the
# English ('en') link next to key and value, and drop every row that is
# missing any of the three.
kv_df_just_eng = (
    pd.concat(
        [kv_df.drop(columns=['wikipedia_dict']), kv_df['wikipedia_dict'].apply(pd.Series)],
        axis=1,
    )
    .loc[:, ['key','value','en']]
    .dropna()
)

In [None]:
# Lastly, fetch the text: load each English Wikipedia link and store the
# scrubbed text of its paragraphs in a new en_text column.
kv_df_just_eng['en_text'] = kv_df_just_eng['en'].apply(plain_text_from_wiki)

In [None]:
# Save the dataframe as a pickle file in the notebook's directory.
kv_df_just_eng.to_pickle('./kv_df_just_eng.pickle')

In [None]:
# Display the final key / value / en / en_text table.
kv_df_just_eng

In [None]:
# Fetch one Wikidata page with requests (no browser) and dump its raw HTML.
# NOTE(review): `small_df` is not defined in any cell visible here — presumably
# a subset of kv_df left over from an earlier session; confirm before running
# on a fresh kernel.
response = requests.get(small_df.loc[3]['wikidata_url'])
print(response.text)

In [None]:
# Parse the fetched Wikidata page and print the language code of every
# sitelink entry.
soup = BeautifulSoup(response.text, 'html.parser')
# The attribute filter has to be a dict mapping attribute name to value.
wikibase = soup.find("div", {'class': 'wikibase-sitelinklistview'})
if wikibase is None:
    print('no wikibase-sitelinklistview block on this page')
else:
    for li in wikibase.find_all('li'):
        # the language code sits on the fourth <span> of each entry
        print(li.find_all('span')[3]['lang'])
    
    