# 3. Correlation between geographic distance and semantic distance

**Authors**

| Author      | Affiliation            |
|-------------|------------------------|
| Rémy Decoupes    | INRAE / TETIS      |
| Mathieu Roche  | CIRAD / TETIS |
| Maguelonne Teisseire | INRAE / TETIS            |

![TETIS](https://www.umr-tetis.fr/images/logo-header-tetis.png)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

**Geo Datasets**

In [None]:
!pip install countryinfo
!pip install shapely
!pip install geopandas
!pip install matplotlib
!pip install scikit-learn
!pip install geopy

In [None]:
from countryinfo import CountryInfo
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import geopandas as gpd

# Build one row per country known to `countryinfo`: name, capital, region,
# subregion and an outline polygon. Any field that cannot be retrieved is
# stored as NaN so that all columns keep the same length.
country = CountryInfo()


def _safe_lookup(getter):
    """Return ``getter()``, or NaN when the lookup fails.

    Lookups are best-effort: a failure for one field of one country must not
    abort the whole table. ``Exception`` (rather than a bare ``except``) keeps
    KeyboardInterrupt / SystemExit working while the loop runs.
    """
    try:
        return getter()
    except Exception:
        # np.nan is the canonical spelling (also used by capital_coord in this
        # notebook); the upper-case alias np.NAN is a legacy name that newer
        # NumPy releases no longer provide -- TODO confirm for your version.
        return np.nan


def _largest_outline(geo_json):
    """Return the outline of the first GeoJSON feature as a shapely Polygon.

    Only the first ring of the chosen polygon is used, so any further rings
    (holes) are ignored. For a multi-part geometry the part whose first ring
    has the most vertices is kept; vertex count is used as a proxy for
    "biggest" and is not the same as surface area.
    """
    geometry = geo_json["features"][0]["geometry"]
    if geometry["type"] == "Polygon":
        return Polygon(geometry["coordinates"][0])
    # Anything else is treated as a MultiPolygon: keep the biggest part.
    largest_part = max(geometry["coordinates"], key=lambda rings: len(rings[0]))
    return Polygon(largest_part[0])


countries = []
capitals = []
regions = []
subregions = []
coordinates = []

for c in country.all():
    country_info = CountryInfo(c)
    countries.append(c)
    regions.append(_safe_lookup(country_info.region))
    subregions.append(_safe_lookup(country_info.subregion))
    # geo_json() is evaluated once per country and only inside the guarded call.
    coordinates.append(
        _safe_lookup(lambda: _largest_outline(country_info.geo_json()))
    )
    capitals.append(_safe_lookup(country_info.capital))

# Create DataFrame
data = {
    'Country': countries,
    'Capital': capitals,
    'Region': regions,
    'Subregion': subregions,
    'Coordinates': coordinates,
}

df_countries = pd.DataFrame(data)
# Use the country outlines as the active geometry column.
df_countries = gpd.GeoDataFrame(df_countries, geometry='Coordinates')

**Add capitals' coordinates**

Using OpenStreetMap data through the Nominatim geocoder.

In [None]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point

# Shared Nominatim client; capital_coord() below reads this global.
geolocator = Nominatim(user_agent="geoBias-llm")
# Smoke test: geocode one known city, asking for English-language results.
location = geolocator.geocode("Taipei", language='en')

# NOTE(review): geocode() presumably returns None when nothing matches, in
# which case this line would raise AttributeError -- confirm against geopy docs.
print(f"lat: {location.latitude}, lon: {location.longitude}")

def capital_coord(city):
    """Geocode a place name to a shapely ``Point(longitude, latitude)``.

    Args:
        city: Place name passed to the module-level ``geolocator``. Missing
            values (``None`` / NaN) and blank strings are never sent to the
            geocoder.

    Returns:
        A ``Point`` with x = longitude and y = latitude, or ``np.nan`` when
        the input is missing or the geocoder finds no match.

    Exceptions raised by the geocoder call itself are not caught here and
    propagate to the caller, as before.
    """
    # A country without a known capital carries NaN in the "Capital" column.
    # Passing it on would make the geocoder search for the literal text "nan"
    # and could return an unrelated place instead of a missing value.
    if city is None or (isinstance(city, float) and np.isnan(city)):
        return np.nan
    if isinstance(city, str) and not city.strip():
        return np.nan

    loc = geolocator.geocode(city, language='en')
    # No match: explicit check instead of catching the AttributeError.
    if loc is None:
        return np.nan
    return Point(loc.longitude, loc.latitude)

# Geocode every capital; this issues one geocoder request per row.
# NOTE(review): the public Nominatim service has a usage policy on request
# rate -- consider throttling (e.g. geopy's RateLimiter) if requests get refused.
df_countries["capital_coordinates"] = df_countries["Capital"].apply(capital_coord)

# Make the capital points the active geometry column (replacing 'Coordinates'),
# so that plotting draws the capitals rather than the country outlines.
df_countries = gpd.GeoDataFrame(df_countries, geometry="capital_coordinates")

In [None]:
# Notebook cell output: display the table built above.
df_countries

In [None]:
# Grey world base map with the capitals drawn on top in red.
try:
    # Dataset bundled with older geopandas (deprecated in 0.14).
    world_path = gpd.datasets.get_path('naturalearth_lowres')
except AttributeError:
    # geopandas >= 1.0 no longer ships the dataset; fall back to the same
    # Natural Earth 1:110m countries layer downloaded from its public CDN.
    world_path = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"

world = gpd.read_file(world_path)
ax = world.plot(color='lightgrey')

df_countries.plot(ax=ax, color="red")

## 3.1 SLMs

### 3.1.1 Example

Let's compute the correlation between Taipei and other cities

In [None]:
# Example cities: city1 is the reference that every other city is compared to.
# All names use their English spelling, matching the language='en' geocoding
# and the other entries ("Singapour" was the French spelling).
city1 = "Taipei"
city2 = "Seoul"
city3 = "Hanoi"
city4 = "Tokyo"
city5 = "Singapore"
city6 = "London"

Retrieve a **word embedding** for each city name

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "roberta-base"

# Both globals are read by word_embedding() below.
# AutoTokenizer picks the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare encoder without a task head: its output exposes last_hidden_state.
model = RobertaModel.from_pretrained(model_name)

In [None]:
def word_embedding(input_text):
    """Return a single embedding vector for ``input_text``.

    The text is encoded with the module-level ``tokenizer`` and run through
    the module-level ``model`` without gradient tracking. The final hidden
    states are then averaged over the token axis, so a name split into several
    sub-tokens still yields one vector. The special tokens added by the
    tokenizer take part in that average.
    """
    token_ids = tokenizer.encode(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(token_ids)
    # Average over dim 1 (the tokens), then take the only item of the batch.
    pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled[0]

# Embed the six example cities in order; embN belongs to cityN.
emb1, emb2, emb3, emb4, emb5, emb6 = map(
    word_embedding,
    (city1, city2, city3, city4, city5, city6),
)

# Show the size and the values of one embedding as a sanity check.
print(f"Embedding length: {emb1.shape} \n\t{emb1}")

Compute **semantic similarity** between the cities' embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the reference city (city1) and each other city.
# cosine_similarity works on 2-D inputs, hence the one-row lists.
for other_city, other_emb in zip(
    (city2, city3, city4, city5, city6),
    (emb2, emb3, emb4, emb5, emb6),
):
    print(f"Similarity between {city1} and {other_city}: {cosine_similarity([emb1], [other_emb])}")

Compute **geodistance** between cities

In [None]:
from geopy.distance import geodesic

def _lat_lon(point):
    """Return ``(latitude, longitude)`` for a point stored as x = lon, y = lat."""
    return (point.y, point.x)


# Geocode the six example cities in order; geo_coord_N belongs to cityN.
geo_coord_1, geo_coord_2, geo_coord_3, geo_coord_4, geo_coord_5, geo_coord_6 = [
    capital_coord(name)
    for name in (city1, city2, city3, city4, city5, city6)
]

# Geodesic distance from the reference city (city1) to each other city.
for other_city, other_coord in zip(
    (city2, city3, city4, city5, city6),
    (geo_coord_2, geo_coord_3, geo_coord_4, geo_coord_5, geo_coord_6),
):
    print(f"Distance between {city1} and {other_city}: {geodesic(_lat_lon(geo_coord_1), _lat_lon(other_coord)).kilometers} km")
