In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [78]:
def generated_spatial_features_dataframe(path,limit=999999):
    """Build a DataFrame of OSM tags scraped from a list of wiki pages.

    Parameters
    ----------
    path : str
        Text file holding one URL per line. Blank lines are ignored.
    limit : int, optional
        Maximum number of (non-blank) URLs to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per tuple returned by ``fetch_group``, with the columns
        ``key``, ``value`` and ``osm_wiki_url``. The frame is empty (but
        still has these columns) when no URL is processed.
    """
    with open(path,'r') as f:
        # Skip blank lines such as the one produced by a trailing newline;
        # an empty string is not a URL that can be requested.
        urls = [line.strip() for line in f if line.strip()]

    key_value_link_list = []
    for url in urls[:limit]:
        # extend() grows the list in place instead of copying it every time
        key_value_link_list.extend(fetch_group(url))

    return pd.DataFrame(key_value_link_list,columns=['key','value','osm_wiki_url'])
    

def make_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``lxml`` parser. The HTTP status
        code is not checked, so error pages are parsed like any other page.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

def fetch_group(url):
    """Collect the OSM tags linked from the table cells of a wiki page.

    Every ``<td>`` inside a ``<tbody>`` is inspected. When the first link in
    the cell has a title of the form ``Tag:<key>=<value>``, the tag is
    recorded together with the absolute URL of the linked page.

    Parameters
    ----------
    url : str
        Address of the page to scan.

    Returns
    -------
    list of tuple
        ``(key, value, osm_wiki_url)`` tuples in document order.
    """
    prefix = 'Tag:'
    soup = make_soup(url)
    tmp_list = []

    for tbody in soup.find_all('tbody'):
        for tr in tbody.find_all('tr'):
            for td in tr.find_all('td'):
                link = td.a
                if link is None:
                    # cell without any link
                    continue

                title = link.get('title')
                href = link.get('href')
                if not title or href is None:
                    continue

                # The prefix is sliced off below, so it has to sit at the very
                # start of the title; a mere substring match would yield
                # mangled keys for titles with something in front of 'Tag:'.
                if not title.startswith(prefix):
                    continue

                # Split on the first '=' only, so values that contain '='
                # themselves are kept instead of being dropped.
                key, separator, value = title[len(prefix):].partition('=')
                if not separator:
                    # no '=' at all: not a key=value tag
                    continue

                tmp_list.append((key,value,f"https://wiki.openstreetmap.org{href}"))

    return tmp_list

def get_wikidata(url):
    """Return the Wikidata link shown on the page at ``url``.

    The page is parsed and the ``href`` of the first link inside the
    ``<tr class="d_wikidata content">`` row is returned.

    Errors are not raised: whenever anything in the lookup fails, the
    exception object itself is returned in place of the link.
    """
    try:
        page = make_soup(url)
        wikidata_row = page.find('tr', {'class' : 'd_wikidata content'})
        return wikidata_row.a['href']
    except Exception as error:
        # hand the failure back to the caller instead of raising it
        return error

def get_tag_info(url):
    """Return the taginfo counts linked from the page at ``url``.

    The ``src`` of the iframe inside the ``<tr class="d_taginfo content">``
    row is fetched (with ``https:`` prepended). Every ``<td>`` of that second
    page whose ``title`` contains 'in database' is converted to an int and
    the numbers are paired, in order, with 'nodes', 'ways' and 'relations'.
    ``zip`` stops at the shorter input, so at most three pairs are returned.

    Errors are not raised: whenever anything in the lookup fails, the
    exception object itself is returned in place of the list.
    """
    try:
        wiki_page = make_soup(url)
        taginfo_row = wiki_page.find('tr', {'class' : 'd_taginfo content'})
        iframe_src = taginfo_row.iframe['src']
        taginfo_page = make_soup(f'https:{iframe_src}')

        counts = []
        for cell in taginfo_page.find_all('td'):
            # only cells whose title contains 'in database' carry a count
            if 'in database' not in str(cell.get('title')):
                continue
            # strip the U+202F separators so the number can be parsed as int
            number_text = str(cell.contents[0]).replace('\u202f','')
            counts.append(int(number_text))

        return list(zip(['nodes','ways','relations'], counts))
    except Exception as error:
        # hand the failure back to the caller instead of raising it
        return error
    

def get_wikipedia_urls(url):
    """Map language codes to the sitelinks listed on the page at ``url``.

    Only the first ``<div class="wikibase-sitelinklistview">`` of the page is
    read. For each ``<li>`` in it, the ``href`` of the first link is stored
    under the ``lang`` attribute of the fourth ``<span>``.

    Parameters
    ----------
    url : str
        Address of the page to scan.

    Returns
    -------
    dict or None
        ``{lang: link}``; ``None`` when the page has no such div.
    """
    soup = make_soup(url)
    # attrs has to be a dict. A set such as {'class', 'name'} is treated by
    # BeautifulSoup as a collection of class names, which would also match
    # <div class="class">.
    wikibase = soup.find('div', {'class': 'wikibase-sitelinklistview'})

    # if the page does not contain a class=wikibase-sitelinklistview we ignore it
    if wikibase is None:
        return None

    temp_dict = {}
    for li in wikibase.find_all('li'):
        anchor = li.find('a')
        spans = li.find_all('span')
        # skip list items that lack the link or the fourth <span>
        if anchor is None or len(spans) < 4:
            continue

        link = anchor.get('href')
        lang = spans[3].get('lang')
        if link is None or lang is None:
            continue

        temp_dict[lang] = link

    return temp_dict

def scrub_text(text):
    """Reduce ``text`` to lower-case words separated by single spaces.

    Steps, in order:
      1. remove bracketed footnote markers such as "[1]" (the match does not
         span line breaks),
      2. replace every non-word character with a space,
      3. collapse runs of two or more whitespace characters into one space,
      4. lower-case the result.

    Leading and trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep the regex backslashes intact; the non-raw '\W' and '\s'
    # are invalid escape sequences and trigger a warning on newer Pythons.
    # Drop footnote superscripts in brackets
    text = re.sub(r'\[.*?\]+', '', text)
    # Replace all non-word characters with a space
    text = re.sub(r'\W', ' ', text)
    # Collapse runs of two or more whitespace characters into one space
    text = re.sub(r'\s{2,}', ' ', text)

    return text.lower()


def plain_text_from_wiki(url):
    """Return the scrubbed text of all ``<p>`` elements on the page at ``url``.

    The paragraph texts are concatenated in document order without any
    separator and the result is passed through ``scrub_text``.
    """
    page = make_soup(url)
    raw_text = ''.join(paragraph.text for paragraph in page.find_all('p'))
    return scrub_text(raw_text)

In [6]:
# Scrape every page listed in ./osm_groups.txt into one DataFrame with the
# columns key, value and osm_wiki_url (one HTTP request per listed URL).
kv_df = generated_spatial_features_dataframe('./osm_groups.txt')

In [73]:
# Look up the Wikidata link for every tag page. get_wikidata returns the
# exception object instead of a link when a lookup fails, so this column can
# hold a mix of strings and exceptions.
kv_df['wikidata_url'] = kv_df['osm_wiki_url'].apply(get_wikidata)

In [79]:
# Fetch the taginfo counts for every tag page. get_tag_info returns the
# exception object instead of the list of counts when a lookup fails.
kv_df['tag_counts'] = kv_df['osm_wiki_url'].apply(get_tag_info)    

In [113]:
# Print the rows whose wikidata_url is not a Wikidata link (na=False makes
# non-string entries, e.g. stored exceptions, count as "not a link").
# option_context lifts the column-width limit only inside the block and then
# restores whatever value was set before, even if printing raises.
# NOTE(review): the offset 85 is hard-coded; presumably the earlier rows had
# already been inspected - confirm before re-running.
with pd.option_context('display.max_colwidth', None):
    for row in list(kv_df[~kv_df['wikidata_url'].str.contains('https://www.wikidata.org/wiki/',na=False)].iterrows())[85:]:
        print(row[1])

key                                                                                                                                                                                                                                                                                                                                                                            railway
value                                                                                                                                                                                                                                                                                                                                                                        abandoned
osm_wiki_url                                                                                                                                                                                                                                              

In [121]:
# Show all rows whose key is exactly 'emergency'
kv_df[kv_df['key']=='emergency']

Unnamed: 0,key,value,osm_wiki_url,wikidata_url,tag_counts


In [None]:
# There are some edge cases where a wikidata_url should be added,
# but because of the OSM wiki structure it is not (for example building:yes),
# so we load the edge case list and add its entries to the dataframe.


# If the edge cases cannot resolve a wikidata URL for a given key/value pair,
# the wikidata URL of the key should be used instead.

In [None]:
# Some wikidata_url entries hold an exception object instead of a link
# (get_wikidata returns the exception on failure); these are filtered out.
# The type is checked first so that an exception whose message happens to
# contain 'http' is not mistaken for a link.
# .copy() detaches the filtered frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
kv_df = kv_df.loc[
    kv_df['wikidata_url']
    .map(lambda value: isinstance(value, str) and 'http' in value)
    .astype(bool)
].copy()
kv_df['wikipedia_dict'] = kv_df['wikidata_url'].apply(get_wikipedia_urls)
kv_df

In [None]:
# Keep only the English ('en') entry of each wikipedia_dict and drop the rows
# that have none. The link is looked up row by row, so rows whose
# wikipedia_dict is None simply get no 'en' value, and the cell still works
# when not a single row has an English entry (the result is then empty).
kv_df_just_eng = (
    kv_df[['key','value']]
    .assign(
        en=kv_df['wikipedia_dict'].map(
            lambda links: links.get('en') if isinstance(links, dict) else None
        )
    )
    .dropna()
)

In [None]:
# Lastly, fetch the page behind each 'en' link and store its scrubbed
# paragraph text (one HTTP request per row via plain_text_from_wiki).
kv_df_just_eng['en_text'] = kv_df_just_eng['en'].apply(plain_text_from_wiki)

In [None]:
# Save the DataFrame as a pickle file; the path is relative to the current
# working directory.
kv_df_just_eng.to_pickle('./kv_df_just_eng.pickle')

In [None]:
# Display kv_df_just_eng as the cell output
kv_df_just_eng

In [None]:
# Scratch cell: fetch the page behind one wikidata_url and dump its raw HTML.
# NOTE(review): `small_df` is not defined anywhere in the visible part of this
# notebook - presumably a subset of kv_df from a deleted cell; confirm or
# replace it, otherwise this cell fails on a fresh kernel.
response = requests.get(small_df.loc[3]['wikidata_url'])
print(response.text)

In [None]:
# Scratch cell: print the language code of every sitelink in `response`.
soup = BeautifulSoup(response.text, 'html.parser')
# attrs has to be a dict. A set such as {'class', 'name'} is treated by
# BeautifulSoup as a collection of class names, which would also match
# <div class="class">.
wikibase = soup.find("div", {'class': 'wikibase-sitelinklistview'})
# the div is missing on pages without sitelinks; print nothing in that case
if wikibase is not None:
    for li in wikibase.find_all('li'):
        spans = li.find_all('span')
        # the language code is read from the fourth <span> of each item
        if len(spans) > 3 and spans[3].has_attr('lang'):
            print(spans[3]['lang'])
    
    