In [None]:
import pyrosm
from fuzzywuzzy import fuzz
import numpy as np
import geopandas as gpd, pandas as pd
import json
from collections import Counter
# Path of the OSM extract handed to pyrosm. NOTE(review): 'OSM-filepath' looks
# like a placeholder that must be replaced with a real file path - confirm.
fp = 'OSM-filepath'
# Reader object; the following cells call get_pois() / get_buildings() on it.
osm = pyrosm.OSM(fp)
# Bare expression so the notebook shows the reader's repr as the cell output.
osm

In [None]:
# Pull the two feature layers that are later filtered by name and spatially
# joined against the Wikivoyage listings.
pois = osm.get_pois()
buildings = osm.get_buildings()

In [None]:
# Manually annotated listings; the 'gold' column is used for evaluation further down.
wv_annotated = pd.read_csv('Annotated-wikivoyage')

# Wikivoyage listings: keep only rows that have both coordinates, then turn
# them into WGS84 (EPSG:4326) point geometries.
wv = pd.read_csv('Wikivoyage-csv').dropna(subset=['longitude', 'latitude'])
wv = gpd.GeoDataFrame(
    wv,
    geometry=gpd.points_from_xy(wv['longitude'], wv['latitude']),
).set_crs('EPSG:4326')

In [None]:
# Keep only OSM features that carry a usable name. `astype('bool')` alone drops
# None and empty strings but keeps NaN (NaN is truthy), so missing names are
# excluded explicitly with `notna()` as well.
has_building_name = buildings['name'].notna() & buildings['name'].astype('bool')
has_poi_name = pois['name'].notna() & pois['name'].astype('bool')
named_buildings = buildings[has_building_name].copy()
named_pois = pois[has_poi_name].copy()

# Stack buildings and POIs into one candidate table. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` is the equivalent (original index labels
# are kept, columns are unioned).
named_locs = pd.concat([named_buildings, named_pois])

In the joined dataframe, `title` and `alt` come from Wikivoyage, while `name` and `tags` come from OSM.

In [None]:
# Attach the nearest named OSM feature to every Wikivoyage listing and keep only
# the columns needed for name matching. `.copy()` makes `joined` an independent
# frame, because later cells add score columns to it and assigning into a
# column subset can trigger pandas' chained-assignment warning.
# NOTE(review): both frames are in EPSG:4326 here, so "nearest" is measured in
# degrees; consider projecting to a metric CRS first - confirm.
joined = wv.sjoin_nearest(named_locs)[['article', 'addr:city', 'type', 'title', 'alt','name', 'tags', 'description']].copy()

In [None]:
# Display the spatial-join result for visual inspection.
joined

Collect frequent words. TODO: these should be loaded from a precomputed list instead of being recomputed here.

In [None]:
# Build `frequent_words`: tokens that occur so often in listing titles / OSM
# names that they carry little information for matching.

# A token is "frequent" when it occurs MORE than this many times.
WIKIVOYAGE_MIN_COUNT = 50   # across all Wikivoyage listing titles
OSM_MIN_COUNT = 3           # across the OSM names present in `joined`

all_listings_en = pd.read_csv('wikivoyage-listings-en')
all_listings_de = pd.read_csv('wikivoyage-listings-de')
all_listings_fr = pd.read_csv('wikivoyage-listings-fr')
all_listings_it = pd.read_csv('wikivoyage-listings-it')

# Titles of all four language editions, missing titles dropped.
list_of_names = []
for listings in (all_listings_en, all_listings_de, all_listings_fr, all_listings_it):
    list_of_names.extend(listings['title'].dropna().tolist())

# Count lower-cased, whitespace-separated tokens. `str()` guards against titles
# that read_csv parsed as numbers, which have no `.lower()`.
all_tokens = Counter()
for item in list_of_names:
    all_tokens.update(str(item).lower().split())
frequent_words = [word for word, count in all_tokens.items() if count > WIKIVOYAGE_MIN_COUNT]

# Same procedure for the OSM names that ended up in the join. Missing names are
# dropped first; previously a NaN name would crash on `.lower()`.
list_of_bbbike_names = joined['name'].dropna().tolist()
all_bbbike_tokens = Counter()
for item in list_of_bbbike_names:
    all_bbbike_tokens.update(str(item).lower().split())
frequent_words.extend(word for word, count in all_bbbike_tokens.items() if count > OSM_MIN_COUNT)

In [None]:
def remove_high_freq_words(text, high_freq):
    """Strip high-frequency words from ``text``.

    The text is split on whitespace; every token whose lower-cased form is not
    in ``high_freq`` is kept (lower-cased) and the kept tokens are re-joined
    with single spaces.

    Parameters
    ----------
    text : str
        The name or title to simplify.
    high_freq : container of str
        Lower-cased words to remove; only membership (``in``) is used.

    Returns
    -------
    str
        The simplified, lower-cased text, or the original ``text`` unchanged
        (case preserved) when nothing would be left after filtering.
    """
    kept = [token.lower() for token in text.split() if token.lower() not in high_freq]
    # A text made up solely of frequent words is returned untouched so there is
    # still something left to compare.
    return " ".join(kept) if kept else text

In [None]:
def _split_alt_titles(alt):
    """Split a Wikivoyage ``alt`` string on commas and colons into stripped, non-empty pieces."""
    pieces = alt.replace(',', ':').split(':')
    return [piece.strip() for piece in pieces if piece.strip()]


def fuzzywuzzy_with_alts(df, fuzzy, frequent_words, simplified=False):
    """Score every row of ``df`` by its best fuzzy match between title and name variants.

    For each row, all title variants (``title`` plus the alternates listed in
    ``alt``) are compared against all name variants (``name`` plus every value
    in the JSON ``tags`` whose key contains ``'name'``). The highest score of
    any title/name pair is the row's result.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``title``, ``alt``, ``name`` and ``tags``.
    fuzzy : str
        Which fuzzywuzzy scorer to use: ``'set'`` (token_set_ratio),
        ``'sort'`` (token_sort_ratio), ``'partial'`` (partial_ratio) or
        ``'ratio'`` (ratio).
    frequent_words : container of str
        Words removed from both sides when ``simplified`` is true.
    simplified : bool, default False
        If true, run every variant through ``remove_high_freq_words`` first.

    Returns
    -------
    list
        One best score per row, in row order. A row with no comparable
        title/name pair scores 0.

    Raises
    ------
    ValueError
        If ``fuzzy`` is not one of the supported scorer names.
    """
    scorers = {
        'set': fuzz.token_set_ratio,
        'sort': fuzz.token_sort_ratio,
        'partial': fuzz.partial_ratio,
        'ratio': fuzz.ratio,
    }
    if fuzzy not in scorers:
        # Previously an unknown value surfaced as an opaque `max()` error.
        raise ValueError(
            "unknown fuzzy method %r; expected one of %s" % (fuzzy, sorted(scorers))
        )
    scorer = scorers[fuzzy]

    list_of_best_scores = []
    for index, row in df.iterrows():
        # Title variants. The alternates are split on BOTH ',' and ':'; the
        # earlier code discarded the comma replacement and split only on ':'.
        all_titles = [row['title']]
        if isinstance(row['alt'], str):
            all_titles = _split_alt_titles(row['alt']) + all_titles

        # Name variants. `tags` is only parsed when it is an actual string, so
        # both None and NaN are treated as "no tags".
        all_names = [row['name']]
        if isinstance(row['tags'], str):
            tag_dict = json.loads(row['tags'])
            all_names = [v for k, v in tag_dict.items() if 'name' in k] + all_names

        # Missing values (None / NaN) cannot be compared; drop them up front.
        all_titles = [title for title in all_titles if isinstance(title, str)]
        all_names = [name for name in all_names if isinstance(name, str)]

        if simplified:
            all_names = [remove_high_freq_words(name, frequent_words) for name in all_names]
            all_titles = [remove_high_freq_words(title, frequent_words) for title in all_titles]

        scores = [scorer(name, title) for title in all_titles for name in all_names]
        list_of_best_scores.append(max(scores, default=0))
    return list_of_best_scores

In [None]:
# Add one score column per (scorer, simplification) combination: first the four
# plain scorers, then the same four on text with frequent words removed
# (columns prefixed with 'simplified_').
for use_simplified, column_prefix in ((False, ''), (True, 'simplified_')):
    for method in ('set', 'sort', 'partial', 'ratio'):
        joined[column_prefix + method] = fuzzywuzzy_with_alts(
            joined, method, frequent_words, use_simplified
        )

In [None]:
# Display the join result again, now including the eight score columns.
joined

In [None]:
# Compact view of the annotation file: the Wikivoyage title, the OSM name and
# the manual 'gold' label.
wv_annotated_small=wv_annotated[['title', 'name', 'gold']]
wv_annotated_small


In [None]:
# NOTE: this is an alias, not a copy - `joined_gold` and `joined` are the same
# object, so the 'gold' column added below also appears on `joined`.
joined_gold=joined
# Assigning a Series aligns on index labels, not on row position.
# NOTE(review): `wv_annotated` comes straight from read_csv (default 0..n-1
# index) while `joined` keeps the index of the filtered Wikivoyage frame;
# presumably the two line up row by row - verify, otherwise labels are
# attached to the wrong rows or become NaN.
joined_gold['gold']=wv_annotated['gold']
joined_gold

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
# Sweep the decision threshold from 1 to 99 and, for every score column, record
# precision, recall and F1 of the rule "score > threshold" against the gold
# labels. This replaces 24 hand-written accumulators with one data-driven loop;
# the thresholded predictions are computed once per (column, threshold) instead
# of once per metric.
score_columns = [
    'set', 'sort', 'partial', 'ratio',
    'simplified_set', 'simplified_sort', 'simplified_partial', 'simplified_ratio',
]
# Insertion order (precision, recall, fscore) is relied on when unpacking below.
metric_functions = {
    'precision': precision_score,
    'recall': recall_score,
    'fscore': f1_score,
}
threshold_curves = {
    column: {metric: [] for metric in metric_functions}
    for column in score_columns
}

gold_labels = wv_annotated['gold']
for threshold in range(1,100):
    for column in score_columns:
        # Boolean prediction: a pair counts as a match when it scores strictly above the threshold.
        predicted_match = joined_gold[column] > threshold
        for metric, metric_function in metric_functions.items():
            threshold_curves[column][metric].append(metric_function(gold_labels, predicted_match))

# Expose the curves under the individual names used by the plotting and
# reporting cells. Index i of each list belongs to threshold i + 1.
set_precision, set_recall, set_fscore = threshold_curves['set'].values()
sort_precision, sort_recall, sort_fscore = threshold_curves['sort'].values()
partial_precision, partial_recall, partial_fscore = threshold_curves['partial'].values()
ratio_precision, ratio_recall, ratio_fscore = threshold_curves['ratio'].values()

simplified_set_precision, simplified_set_recall, simplified_set_fscore = threshold_curves['simplified_set'].values()
simplified_sort_precision, simplified_sort_recall, simplified_sort_fscore = threshold_curves['simplified_sort'].values()
simplified_partial_precision, simplified_partial_recall, simplified_partial_fscore = threshold_curves['simplified_partial'].values()
simplified_ratio_precision, simplified_ratio_recall, simplified_ratio_fscore = threshold_curves['simplified_ratio'].values()

In [None]:
import matplotlib.pyplot as plt

# x-axis for all threshold plots: the thresholds 1..99.
X = range(1,100)

# One line per scorer: (legend label, precision per threshold).
precision_curves = [
    ('set', set_precision),
    ('sort', sort_precision),
    ('partial', partial_precision),
    ('ratio', ratio_precision),
    ('simplified_set', simplified_set_precision),
    ('simplified_sort', simplified_sort_precision),
    ('simplified_partial', simplified_partial_precision),
    ('simplified_ratio', simplified_ratio_precision),
]
for curve_label, curve_values in precision_curves:
    plt.plot(X, curve_values, label=curve_label)

plt.xlabel("Threshold")
plt.ylabel("Precision")
plt.title("Precision scores for thresholds")

plt.legend()
plt.show()

In [None]:
# One line per scorer: (legend label, recall per threshold).
recall_curves = [
    ('set', set_recall),
    ('sort', sort_recall),
    ('partial', partial_recall),
    ('ratio', ratio_recall),
    ('simplified_set', simplified_set_recall),
    ('simplified_sort', simplified_sort_recall),
    ('simplified_partial', simplified_partial_recall),
    ('simplified_ratio', simplified_ratio_recall),
]
for curve_label, curve_values in recall_curves:
    plt.plot(X, curve_values, label=curve_label)

plt.xlabel("Threshold")
plt.ylabel("Recall")
plt.title("Recall scores for thresholds")

plt.legend()
plt.show()

In [None]:
# One line per scorer: (legend label, F1 per threshold).
fscore_curves = [
    ('set', set_fscore),
    ('sort', sort_fscore),
    ('partial', partial_fscore),
    ('ratio', ratio_fscore),
    ('simplified_set', simplified_set_fscore),
    ('simplified_sort', simplified_sort_fscore),
    ('simplified_partial', simplified_partial_fscore),
    ('simplified_ratio', simplified_ratio_fscore),
]
for curve_label, curve_values in fscore_curves:
    plt.plot(X, curve_values, label=curve_label)

plt.xlabel("Threshold")
plt.ylabel("F1-score")
plt.title("F1-scores for thresholds")

plt.legend()
plt.show()

In [None]:
# For every scorer print "<best threshold> <best F1>". List position i holds the
# score for threshold i + 1, hence the +1; on ties the lowest threshold wins.
for fscores in (
    set_fscore,
    sort_fscore,
    partial_fscore,
    ratio_fscore,
    simplified_set_fscore,
    simplified_sort_fscore,
    simplified_partial_fscore,
    simplified_ratio_fscore,
):
    best_fscore = max(fscores)
    print(fscores.index(best_fscore) + 1, best_fscore)

In [None]:
# Dump the complete F1 curve of the 'partial' scorer (one value per threshold).
print(partial_fscore)

In [None]:
# Prints a boolean: whether the best threshold of 'simplified_ratio' is greater
# than its best F1 score.
# NOTE(review): this compares a threshold (1..99) with an F1 value (at most 1),
# so it is almost always True; the '>' is presumably a typo for ',' - confirm
# the intent or delete this cell.
print(simplified_ratio_fscore.index(max(simplified_ratio_fscore))+1>max(simplified_ratio_fscore))