In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [43]:
def generated_spatial_features_dataframe(path,limit=999999):
    key_value_link_list = []
    with open(path,'r') as f:
        urls = f.read()
        urls = urls.split('\n')
    
    for url in urls[:limit]:
        key_value_link_list = key_value_link_list + fetch_group(url)
    
    return pd.DataFrame(key_value_link_list,columns=['key','value','osm_wiki_url'])
    

def make_soup(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    return soup

def fetch_group(url):
    soup = make_soup(url)
    tmp_list = []

    for tbody in soup.find_all('tbody'):
        for tr in tbody.find_all('tr'):
            for td in tr.find_all('td'):
                found_key = None
                try:
                    title = td.a['title']
                    if 'Tag:' in title:
                        key,value = title[4:].split('=')
                        tmp_list.append((key,value,f"https://wiki.openstreetmap.org{td.a['href']}"))
                except:
                    pass
    
    return tmp_list

def get_wikidata(url):
    wikidata_url = None
    
    try:
        soup = make_soup(url)
        wikidata_url = soup.find('tr', {'class' : 'd_wikidata content'}).a['href']
    
    except Exception as e:
        wikidata_url = e
    
    return wikidata_url

def get_wikipedia_urls(url):
    
    temp_dict = {}
    soup = make_soup(url)
    wikibase = soup.find("div",{'class','wikibase-sitelinklistview'})
    
    # if the page does not contain a class=wikibase-sitelinklistview we ignore it
    if wikibase:
        for li in wikibase.find_all('li'):
            link = li.find('a')['href']
            lang = li.findAll('span')[3]['lang']
            temp_dict[lang]=link
    else:
        temp_dict = None
    
    return temp_dict

def scrub_text(text):
    # Drop footnote superscripts in brackets
    text = re.sub(r'\[.*?\]+', '', text)
    # Replace all non word charcters with a white space
    text = re.sub('\W', ' ',text)
    # trim all white spaces >2 to 1 
    text = re.sub('\s{2,}',' ',text)
    
    return text.lower()


def plain_text_from_wiki(url):
    soup = make_soup(url)
    text = ''
    for paragraph in soup.find_all('p'):
        text += paragraph.text
    plain_text = scrub_text(text)
    return plain_text



In [17]:
kv_df = generated_spatial_features_dataframe('./osm_groups.txt')

In [30]:
kv_df['wikidata_url'] = kv_df['osm_wiki_url'].apply(get_wikidata)

In [42]:
#some wikidata_urls contain an except these need to be filtered out
kv_df = kv_df.loc[kv_df['wikidata_url'].astype(str).str.contains('http')]
kv_df['wikipedia_dict'] = kv_df['wikidata_url'].apply(get_wikipedia_urls)
kv_df

https://www.wikidata.org/wiki/Q187456
https://www.wikidata.org/wiki/Q853185
https://www.wikidata.org/wiki/Q857909
https://www.wikidata.org/wiki/Q30022
https://query.wikidata.org/#SELECT%20%3Fitem%20%3FitemLabel%20%3Ftag%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP1282%20%3Ftag%20.%0A%20%20FILTER%28%3Ftag%3D%27%27%27Tag%3Aamenity%3Ddrinking_water%27%27%27%29%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%0A%20%20%20%20bd%3AserviceParam%20wikibase%3Alanguage%20%22%5BAUTO_LANGUAGE%5D%2Cen%22%0A%20%20%7D%0A%7D%20LIMIT%20200
https://www.wikidata.org/wiki/Q1751429
https://www.wikidata.org/wiki/Q1192284
https://query.wikidata.org/#SELECT%20%3Fitem%20%3FitemLabel%20%3Ftag%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP1282%20%3Ftag%20.%0A%20%20FILTER%28%3Ftag%3D%27%27%27Tag%3Aamenity%3Dice_cream%27%27%27%29%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%0A%20%20%20%20bd%3AserviceParam%20wikibase%3Alanguage%20%22%5BAUTO_LANGUAGE%5D%2Cen%22%0A%20%20%7D%0A%7D%20LIMIT%20200
https://query.wikidata.org/#SELECT%20%3Fitem%20%

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kv_df['wikipedia_dict'] = kv_df['wikidata_url'].apply(get_wikipedia_urls)


Unnamed: 0,key,value,osm_wiki_url,wikidata_url,wikipedia_dict
0,amenity,bar,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q187456,{'ar': 'https://ar.wikipedia.org/wiki/%D8%A8%D...
1,amenity,bbq,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q853185,{'bg': 'https://bg.wikipedia.org/wiki/%D0%A1%D...
2,amenity,biergarten,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q857909,{'de': 'https://de.wikipedia.org/wiki/Biergart...
3,amenity,cafe,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://www.wikidata.org/wiki/Q30022,{'af': 'https://af.wikipedia.org/wiki/Koffiewi...
4,amenity,drinking water,https://wiki.openstreetmap.org/wiki/Tag:amenit...,https://query.wikidata.org/#SELECT%20%3Fitem%2...,
...,...,...,...,...,...
467,building,container,https://wiki.openstreetmap.org/wiki/Tag:buildi...,https://www.wikidata.org/wiki/Q1128503,{'ar': 'https://ar.wikipedia.org/wiki/%D8%AD%D...
468,building,gatehouse,https://wiki.openstreetmap.org/wiki/Tag:buildi...,https://www.wikidata.org/wiki/Q277760,{'ca': 'https://ca.wikipedia.org/wiki/Porta_de...
469,building,roof,https://wiki.openstreetmap.org/wiki/Tag:buildi...,https://www.wikidata.org/wiki/Q47525110,{}
470,building,ruins,https://wiki.openstreetmap.org/wiki/Tag:buildi...,https://www.wikidata.org/wiki/Q109607,{'af': 'https://af.wikipedia.org/wiki/Ru%C3%AF...


In [44]:
# now we just select the english 'en' wikipedia entries and drop those rows that don't have an english wikipedia entry
kv_df_just_eng = pd.concat([kv_df.drop(['wikipedia_dict'], axis=1), kv_df['wikipedia_dict'].apply(pd.Series)], axis=1)[['key','value','en']].dropna()

In [45]:
# lastly we fetch the text
kv_df_just_eng['en_text'] = kv_df_just_eng['en'].apply(plain_text_from_wiki)

In [46]:
# and save the dataframe
kv_df_just_eng.to_pickle('./kv_df_just_eng.pickle')

In [48]:
kv_df_just_eng

Unnamed: 0,key,value,en,en_text
0,amenity,bar,https://en.wikipedia.org/wiki/Bar,a bar is a long raised narrow table or bench d...
1,amenity,bbq,https://en.wikipedia.org/wiki/Barbecue_grill,a barbecue grill is a device that cooks food b...
2,amenity,biergarten,https://en.wikipedia.org/wiki/Beer_garden,a beer garden a loan translation from the ger...
3,amenity,cafe,https://en.wikipedia.org/wiki/Coffeehouse,a coffeehouse coffee shop or café is an estab...
5,amenity,fast food,https://en.wikipedia.org/wiki/Fast_food_restau...,a fast food restaurant also known as a quick ...
...,...,...,...,...
463,building,water tower,https://en.wikipedia.org/wiki/Water_tower,a water tower is an elevated building supporti...
464,building,bunker,https://en.wikipedia.org/wiki/Bunker,a bunker is a defensive military fortificatio...
467,building,container,https://en.wikipedia.org/wiki/Shipping_contain...,shipping container architecture is a form of a...
468,building,gatehouse,https://en.wikipedia.org/wiki/Gatehouse,a gatehouse is an entry control point building...


In [None]:
response = requests.get(small_df.loc[3]['wikidata_url'])
print(response.text)

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')
soup.findAll("div", {"class": "stylelistrow"})
wikibase = soup.find("div",{'class','wikibase-sitelinklistview'})
for li in wikibase.find_all('li'):
    print(li.findAll('span')[3]['lang'])
#     link = li.find('a')['href']
#     lang = li.findAll('span')[1]['title']
    
    