In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [14]:
def generated_spatial_features_dataframe(path,limit=999999):
    """Build a DataFrame of OSM tags scraped from the wiki pages listed in a file.

    Parameters
    ----------
    path : str
        Text file containing one OSM wiki group URL per line.
    limit : int, optional
        Maximum number of URLs to scrape, counted after blank lines have
        been discarded.

    Returns
    -------
    pandas.DataFrame
        One row per tag returned by ``fetch_group``, with the columns
        ``key``, ``value`` and ``osm_wiki_url``.
    """
    with open(path,'r') as f:
        # Skip blank / whitespace-only lines: a file that ends with a newline
        # would otherwise contribute an empty "URL" that cannot be requested.
        urls = [line.strip() for line in f.read().split('\n') if line.strip()]

    key_value_link_list = []
    for url in urls[:limit]:
        # extend() grows the list in place instead of rebuilding it with `+`
        # on every iteration.
        key_value_link_list.extend(fetch_group(url))

    return pd.DataFrame(key_value_link_list,columns=['key','value','osm_wiki_url'])
    

def make_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup (lxml).

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled server raises an error
        instead of blocking the scrape indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from ``response.text``.  The HTTP status code
        is not checked, so error pages are parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

def fetch_group(url):
    """Collect the ``key=value`` tags linked from an OSM wiki group page.

    Every table cell (``tbody`` -> ``tr`` -> ``td``) is inspected.  When the
    first link in a cell has a ``title`` containing ``Tag:``, the text after
    that marker is split at the first ``=`` into key and value.

    Parameters
    ----------
    url : str
        Address of the OSM wiki page to scan.

    Returns
    -------
    list of tuple
        ``(key, value, osm_wiki_url)`` for each matching cell, where
        ``osm_wiki_url`` is ``https://wiki.openstreetmap.org`` followed by
        the link's ``href``.
    """
    soup = make_soup(url)
    tmp_list = []

    for tbody in soup.find_all('tbody'):
        for tr in tbody.find_all('tr'):
            for td in tr.find_all('td'):
                anchor = td.a
                if anchor is None:
                    # Cell without a link: nothing to extract.
                    continue

                title = anchor.get('title')
                href = anchor.get('href')
                if title is None or href is None or 'Tag:' not in title:
                    continue

                # Take the text after the "Tag:" marker wherever it occurs, so
                # a prefixed title is not mis-sliced by a fixed offset.
                # Splitting at the first '=' keeps values that themselves
                # contain '='.
                key, sep, value = title.split('Tag:', 1)[1].partition('=')
                if not sep:
                    # "Tag:" title without a key=value pair.
                    continue

                tmp_list.append((key,value,f"https://wiki.openstreetmap.org{href}"))

    return tmp_list

def get_wikidata(url):
    """Return the Wikidata link shown on an OSM wiki tag page.

    The page is fetched with ``make_soup`` and the ``href`` of the first link
    inside the ``tr`` whose class is ``d_wikidata content`` is returned.

    This function does not raise for ordinary errors: if the download or the
    lookup fails, the caught exception object itself is returned in place of
    the URL.
    """
    try:
        row = make_soup(url).find('tr', {'class' : 'd_wikidata content'})
        return row.a['href']
    except Exception as err:
        return err

def get_wikipedia_urls(url):
    """Map language codes to the site links listed on a Wikidata item page.

    Parameters
    ----------
    url : str
        Address of the Wikidata page.  Anything that is not a non-empty
        string (for example the exception object ``get_wikidata`` returns
        when its lookup fails) is treated as "no page".

    Returns
    -------
    dict or None
        ``{lang: link}`` built from the ``li`` items of the first ``div``
        with class ``wikibase-sitelinklistview``.  ``None`` when ``url`` is
        unusable or the page has no such ``div``.
    """
    if not isinstance(url, str) or not url:
        return None

    soup = make_soup(url)
    # attrs must be a dict; a set literal here would be read as a list of
    # alternative CSS classes and would also match class="class".
    wikibase = soup.find("div", {'class': 'wikibase-sitelinklistview'})

    # if the page does not contain a class=wikibase-sitelinklistview we ignore it
    if wikibase is None:
        return None

    temp_dict = {}
    for li in wikibase.find_all('li'):
        anchor = li.find('a')
        spans = li.find_all('span')
        # The language code is read from the fourth span of each item; skip
        # items that do not have that shape instead of raising.
        if anchor is None or len(spans) < 4:
            continue

        link = anchor.get('href')
        lang = spans[3].get('lang')
        if link is None or lang is None:
            continue

        temp_dict[lang] = link

    return temp_dict

def scrub_text(text):
    """Reduce scraped text to lowercase words separated by single spaces.

    Bracketed spans such as footnote markers (``[1]``) are removed, every
    non-word character becomes a space, and runs of whitespace are collapsed.
    The result is not stripped, so a single leading or trailing space may
    remain.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw strings throughout: '\W' and '\s' in a normal string literal are
    # invalid escape sequences and trigger a warning on recent Python versions.

    # Drop bracketed footnote markers, e.g. "[1]"
    text = re.sub(r'\[.*?\]+', '', text)
    # Replace every non-word character with a space
    text = re.sub(r'\W', ' ', text)
    # Collapse runs of two or more whitespace characters into one space
    text = re.sub(r'\s{2,}', ' ', text)

    return text.lower()


def plain_text_from_wiki(url):
    """Download a page and return the cleaned text of its ``<p>`` elements.

    Parameters
    ----------
    url : str
        Address of the page to download (fetched with ``make_soup``).

    Returns
    -------
    str
        Text of all paragraphs, passed through ``scrub_text``.
    """
    soup = make_soup(url)
    paragraphs = [paragraph.text for paragraph in soup.find_all('p')]
    # Join with a space so the last word of one paragraph cannot fuse with the
    # first word of the next; scrub_text collapses any surplus whitespace.
    # Empty paragraphs are skipped so they do not add a leading space.
    text = ' '.join(chunk for chunk in paragraphs if chunk)
    return scrub_text(text)



In [3]:
# Scrape the tag tables of the first 3 URLs listed in ./osm_groups.txt
# (the second argument is the `limit` on how many URLs are read).
kv_df = generated_spatial_features_dataframe('./osm_groups.txt',3)

In [4]:
# Take an independent copy of the first rows to experiment on.
# .loc slicing is label-based and inclusive, so [:5] keeps labels 0..5 (six rows).
small_df = kv_df.loc[:5].copy()
small_df

Unnamed: 0,key,value,osm_wiki_url
0,amenity,bar,https://wiki.openstreetmap.org/wiki/Tag:amenit...
1,amenity,bbq,https://wiki.openstreetmap.org/wiki/Tag:amenit...
2,amenity,biergarten,https://wiki.openstreetmap.org/wiki/Tag:amenit...
3,amenity,cafe,https://wiki.openstreetmap.org/wiki/Tag:amenit...
4,amenity,drinking water,https://wiki.openstreetmap.org/wiki/Tag:amenit...
5,amenity,fast food,https://wiki.openstreetmap.org/wiki/Tag:amenit...


In [5]:
# Look up the Wikidata link of every OSM tag page (get_wikidata downloads one
# page per row; on failure it stores the exception object instead of a URL).
small_df['wikidata_url'] = small_df['osm_wiki_url'].apply(get_wikidata)
# Follow each Wikidata link and collect its {language: link} mapping;
# get_wikipedia_urls returns None for pages without a site-link list.
small_df['wikipedia_dict'] = small_df['wikidata_url'].apply(get_wikipedia_urls)
small_df

Unnamed: 0,key,value,osm_wiki_url,wikidata_url,wikipedia_dict
0,amenity,bar,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q187456,{'ar': 'https://ar.wikipedia.org/wiki/%D8%A8%D...
1,amenity,bbq,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q853185,{'bg': 'https://bg.wikipedia.org/wiki/%D0%A1%D...
2,amenity,biergarten,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q857909,{'de': 'https://de.wikipedia.org/wiki/Biergart...
3,amenity,cafe,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q30022,{'af': 'https://af.wikipedia.org/wiki/Koffiewi...
4,amenity,drinking water,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://query.wikidata.org/#SELECT%20%3Fitem%2...,
5,amenity,fast food,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q1751429,{'ar': 'https://ar.wikipedia.org/wiki/%D9%85%D...


In [6]:
# Expand the per-language dicts into one column per language, then keep only
# key / value / 'en' and drop every row where any of the three is missing.
small_df_just_eng = (
    pd.concat(
        [
            small_df.drop(columns=['wikipedia_dict']),
            small_df['wikipedia_dict'].apply(pd.Series),
        ],
        axis=1,
    )[['key','value','en']]
    .dropna()
)

In [15]:
# Download each English article and store its scrubbed paragraph text
# (plain_text_from_wiki performs one request per row).
small_df_just_eng['en_text'] = small_df_just_eng['en'].apply(plain_text_from_wiki)

In [16]:
# Display the frame, now including the en_text column.
small_df_just_eng

Unnamed: 0,key,value,en,en_text
0,amenity,bar,https://en.wikipedia.org/wiki/Bar,a bar is a long raised narrow table or bench d...
1,amenity,bbq,https://en.wikipedia.org/wiki/Barbecue_grill,a barbecue grill is a device that cooks food b...
2,amenity,biergarten,https://en.wikipedia.org/wiki/Beer_garden,a beer garden a loan translation from the ger...
3,amenity,cafe,https://en.wikipedia.org/wiki/Coffeehouse,a coffeehouse coffee shop or café is an estab...
5,amenity,fast food,https://en.wikipedia.org/wiki/Fast_food_restau...,a fast food restaurant also known as a quick ...


In [None]:
# Scratch cell: fetch the Wikidata page stored for row label 3 and print its
# raw HTML (presumably to inspect the markup by eye).
response = requests.get(small_df.loc[3]['wikidata_url'])
print(response.text)

In [None]:
# Scratch cell: list the language code of every site link on the page held in
# `response` (assigned by the cell that requests a wikidata_url).
soup = BeautifulSoup(response.text, 'html.parser')
# attrs must be a dict; a set literal would be read as a list of alternative
# CSS classes and would also match class="class".
wikibase = soup.find("div", {'class': 'wikibase-sitelinklistview'})
if wikibase is not None:
    for li in wikibase.find_all('li'):
        # The language code is read from the fourth span of each item.
        print(li.find_all('span')[3]['lang'])
    
    

In [None]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook, so this
# cell presumably relies on leftover kernel state - confirm or remove.
print(data)

In [None]:
# Show the OSM wiki URL stored for row label 0.
small_df.loc[0]['osm_wiki_url']

In [None]:
# Print the body of whatever `response` currently holds. In the visible notebook
# it is only assigned by the cell that requests a wikidata_url.
print(response.text)

In [None]:
# The next two commented lines look like a pasted soup.find() usage example
# kept for reference - TODO confirm and remove if no longer needed.
# soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# Same lookup as get_wikidata: href of the first link inside the
# <tr class="d_wikidata content"> row of the notebook-level `soup`.
# Raises AttributeError when the page has no such row.
soup.find('tr', {'class' : 'd_wikidata content'}).a['href']


In [None]:
# Scratch cell: the same cell-by-cell table scan that fetch_group performs,
# run against the notebook-level `soup`.
key_value_link = []

for tbody in soup.find_all('tbody'):
    for tr in tbody.find_all('tr'):
        for td in tr.find_all('td'):
            anchor = td.a
            if anchor is None:
                # Cell without a link: nothing to extract.
                continue

            title = anchor.get('title')
            href = anchor.get('href')
            if title is None or href is None or 'Tag:' not in title:
                continue

            # Take the text after the "Tag:" marker wherever it occurs and
            # split at the first '=' so values containing '=' are kept.
            key, sep, value = title.split('Tag:', 1)[1].partition('=')
            if not sep:
                # "Tag:" title without a key=value pair.
                continue

            key_value_link.append((key,value,f"https://wiki.openstreetmap.org{href}"))
key_value_link

In [None]:
# Print every <tr> of the third <tbody> (index 2) in the notebook-level `soup`;
# raises IndexError when the page has fewer than three <tbody> elements.
for tr in soup.find_all('tbody')[2].find_all('tr'):
    print(tr)