# 3. Correlation between geographic distance and semantic distance

**Authors**

| Author      | Affiliation            |
|-------------|------------------------|
| Rémy Decoupes    | INRAE / TETIS      |
| Mathieu Roche  | CIRAD / TETIS |
| Maguelonne Teisseire | INRAE / TETIS            |

![TETIS](https://www.umr-tetis.fr/images/logo-header-tetis.png)

In [None]:
!pip install transformers==4.37.2

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch

**Initialize API keys**

- HuggingFace 
- OpenAI

In [None]:
import getpass
 
# Ask for both secrets interactively (Hugging Face first, then OpenAI) so that
# no key is hard-coded in the notebook.
HF_API_TOKEN, OPENAI_API_KEY = (
    getpass.getpass(prompt=service_prompt)
    for service_prompt in ("Your huggingFace API Key", "Your OpenAI API Key")
)

**Geo Datasets**

In [None]:
!pip install countryinfo
!pip install shapely
!pip install geopandas
!pip install matplotlib
!pip install scikit-learn
!pip install geopy
!pip install plotly-express
!pip install --upgrade nbformat
!pip install unidecode

In [None]:
from countryinfo import CountryInfo
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import geopandas as gpd

def _value_or_nan(getter):
    """Return ``getter()``, or NaN when the lookup fails for this country."""
    try:
        return getter()
    except Exception:
        # Best effort: a field missing for one country must not abort the loop.
        return np.nan


def _main_outline(country_info):
    """Build a shapely Polygon from the first GeoJSON feature of a country.

    A "Polygon" geometry uses its outer ring. Anything else is treated as a
    MultiPolygon and reduced to the part whose outer ring has the most
    vertices (used as a proxy for the biggest part).
    """
    # Fetch the GeoJSON once instead of once per access.
    geometry = country_info.geo_json()["features"][0]["geometry"]
    if geometry["type"] == "Polygon":
        return Polygon(geometry["coordinates"][0])
    largest_part = max(geometry["coordinates"], key=lambda rings: len(rings[0]))
    return Polygon(largest_part[0])


country = CountryInfo()

# One entry per country in each list; NaN marks a value that could not be read.
countries = []
capitals = []
regions = []
subregions = []
coordinates = []

for c in country.all():
    country_info = CountryInfo(c)
    countries.append(c)
    regions.append(_value_or_nan(country_info.region))
    subregions.append(_value_or_nan(country_info.subregion))
    coordinates.append(_value_or_nan(lambda: _main_outline(country_info)))
    capitals.append(_value_or_nan(country_info.capital))

# Create DataFrame
data = {
    'Country': countries,
    'Capital': capitals,
    'Region': regions,
    'Subregion': subregions,
    'Coordinates': coordinates
}

df_countries = pd.DataFrame(data)
# The country outlines are the active geometry at this stage.
df_countries = gpd.GeoDataFrame(df_countries, geometry='Coordinates')

**Add capitals' coordinates**

With OpenStreetMap data through Nominatim geocoders

In [None]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point

# Geocoder backed by the Nominatim service; it is reused by capital_coord() below.
geolocator = Nominatim(user_agent="geoBias-llm")
# Smoke test: geocode a single city and print its coordinates.
location = geolocator.geocode("Taipei", language='en')

print(f"lat: {location.latitude}, lon: {location.longitude}")

def capital_coord(city):
    """Geocode a city name into a shapely ``Point(longitude, latitude)``.

    Uses the module-level ``geolocator``. This is a best-effort lookup: NaN is
    returned when the name is missing or blank, when the geocoder finds no
    match, or when the geocoding request itself fails.
    """
    # Missing capitals arrive as NaN (a float): do not send them as a query.
    if not isinstance(city, str) or not city.strip():
        return np.nan

    from geopy.exc import GeopyError

    try:
        loc = geolocator.geocode(city, language='en')
    except GeopyError:
        # A failed request for one city must not abort the whole .apply().
        return np.nan
    if loc is None:
        return np.nan
    return Point(loc.longitude, loc.latitude)

# Geocode every capital (one geocoder call per row); failed lookups are stored as NaN.
df_countries["capital_coordinates"] = df_countries["Capital"].apply(capital_coord)

# Make the capital points, instead of the country outlines, the active geometry column
df_countries = gpd.GeoDataFrame(df_countries, geometry="capital_coordinates")

In [None]:
df_countries

In [None]:
# Background layer: Natural Earth low-resolution country outlines.
try:
    # Sample dataset bundled with GeoPandas < 1.0.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    # GeoPandas >= 1.0 no longer ships sample datasets: read the 1:110m
    # Natural Earth countries layer from its public download location.
    # NOTE(review): requires network access — confirm the URL is reachable.
    world = gpd.read_file(
        "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
    )
ax = world.plot(color='lightgrey')

# Overlay the geocoded capitals on the world map.
df_countries.plot(ax=ax, color="red")

## 3.1 SLMs

### 3.1.1 Example

Let's compute the correlation between Taipei and other cities

In [None]:
# Reference city (city1) and the cities it is compared with, all in English
# spelling so that the language models and the geocoder see consistent names.
city1 = "Taipei"
city2 = "Seoul"
city3 = "Hanoi"
city4 = "Tokyo"
city5 = "Singapore"
city6 = "London"

Retrieve **word embedding** from city names

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Checkpoint used as the small language model (SLM) in this section
model_name = "roberta-base"

# Tokenizer and model come from the same checkpoint; word_embedding() reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

In [None]:
def word_embedding(input_text):
    """Embed ``input_text`` as the mean of the model's last hidden states.

    Uses the module-level ``tokenizer`` and ``model``. This is a best-effort
    helper: NaN is returned when tokenisation or the forward pass fails (for
    example when ``input_text`` is not a string).
    """
    try:
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            last_hidden_states = model(input_ids).last_hidden_state
        # Average over dim 1 so that a name split into several sub-tokens still
        # yields one vector; the special tokens added by the tokenizer are part
        # of that average. [0] selects the single input of the batch.
        # NOTE(review): assumes dim 1 is the token axis — confirm.
        return last_hidden_states.mean(dim=1)[0]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still interrupts a long run.
        return np.nan

# Embed the six example cities with the model loaded above.
emb1, emb2, emb3, emb4, emb5, emb6 = (
    word_embedding(city_name)
    for city_name in (city1, city2, city3, city4, city5, city6)
)

print(f"Embedding length: {emb1.shape} \n\t{emb1}")

Compute **semantic similarity** between the cities' embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the reference city (city1) and each other example city.
for other_city, other_emb in (
    (city2, emb2),
    (city3, emb3),
    (city4, emb4),
    (city5, emb5),
    (city6, emb6),
):
    print(f"Similarity between {city1} and {other_city}: {cosine_similarity([emb1], [other_emb])}")

Compute **geodistance** between cities

In [None]:
from geopy.distance import geodesic

# Geocode the six example cities (one geocoder call each).
geo_coord_1, geo_coord_2, geo_coord_3, geo_coord_4, geo_coord_5, geo_coord_6 = (
    capital_coord(city_name)
    for city_name in (city1, city2, city3, city4, city5, city6)
)

# The points were built as Point(longitude, latitude), whereas geodesic() is
# given (latitude, longitude) tuples: hence xy[1] before xy[0].
origin_lat_lon = (geo_coord_1.xy[1][0], geo_coord_1.xy[0][0])

for other_city, other_point in (
    (city2, geo_coord_2),
    (city3, geo_coord_3),
    (city4, geo_coord_4),
    (city5, geo_coord_5),
    (city6, geo_coord_6),
):
    other_lat_lon = (other_point.xy[1][0], other_point.xy[0][0])
    print(f"Distance between {city1} and {other_city}: {geodesic(origin_lat_lon, other_lat_lon).kilometers} km")


### 3.1.2 Worldwide

Build 2 matrices between pairs of Capitals:
- Semantic distance (1 - cosine similarity)
- Geo distance

In [None]:
# drop row for which we could not find geo coordinates
df_countries = df_countries[df_countries["capital_coordinates"].notna()]
df_countries["capital_coordinates"]


def tensor_to_array(embedding):
    """Convert an embedding to a NumPy array, or NaN when that is impossible.

    Torch tensors are detached and moved to the CPU first, so tensors that
    track gradients or live on an accelerator are converted instead of being
    silently turned into NaN. Any other object is converted through its own
    ``numpy()`` method when it has one; everything else (such as the NaN
    placeholder of a failed embedding) yields NaN.
    """
    if torch.is_tensor(embedding):
        return embedding.detach().cpu().numpy()
    try:
        return embedding.numpy()
    except Exception:
        return np.nan

# Embed every capital name with the current word_embedding() (NaN where it failed)
df_countries["capital_embedding_tensor"] = df_countries["Capital"].apply(word_embedding)
# Convert the tensors to NumPy arrays, then drop the rows without a usable embedding
df_countries["capital_embedding"] = df_countries["capital_embedding_tensor"].apply(tensor_to_array)
df_countries = df_countries.dropna(subset=["capital_embedding"])

# Stack the per-capital vectors (one row per capital) and turn the pairwise
# cosine similarity into a distance: 1 - similarity
embedding_array = np.stack(df_countries["capital_embedding"].values)
semantic_distance_matrix = 1 - cosine_similarity(embedding_array, embedding_array)

In [None]:
def compute_geo_distance(df):
    """Return the symmetric matrix of geodesic distances (km) between capitals.

    ``df`` must provide a "capital_coordinates" column of point geometries.
    Entry [i, j] is the distance between rows i and j; the diagonal stays 0.
    """
    points = df["capital_coordinates"].tolist()
    size = len(points)
    matrix = np.zeros((size, size))

    def lat_lon(point):
        # geodesic() is given (latitude, longitude); the points expose x then y.
        return (point.xy[1][0], point.xy[0][0])

    # Walk the strict upper triangle once and mirror each value.
    for row, col in zip(*np.triu_indices(size, k=1)):
        km = geodesic(lat_lon(points[row]), lat_lon(points[col])).kilometers
        matrix[row, col] = km
        matrix[col, row] = km
    return matrix

# Pairwise geodesic distances (km) between all capitals kept in df_countries
geo_distance_matrix = compute_geo_distance(df_countries)


In [None]:
from unidecode import unidecode
import plotly.express as px
from scipy.stats import linregress

def label_for_plotting(df):
    """Build the matrix of hover labels for every pair of capitals in ``df``.

    Entries [i, j] and [j, i] both hold "<capital i> - <capital j>" with
    i < j, using names transliterated by ``unidecode``. The diagonal holds
    empty strings. The result is an object-dtype array with the same layout
    as the distance matrices, so labels of any length fit without truncation.
    """
    # Transliterate each name once rather than once per pair.
    names = [unidecode(name) for name in df["Capital"].tolist()]
    num_capital = len(names)

    # Every cell starts as "" so the diagonal is well defined.
    label_matrix = np.full((num_capital, num_capital), "", dtype=object)

    for i in range(num_capital):
        for j in range(i + 1, num_capital):
            label = f"{names[i]} - {names[j]}"
            label_matrix[i, j] = label
            label_matrix[j, i] = label
    return label_matrix

# Hover labels ("capital - capital") laid out like the distance matrices
labels_hover = label_for_plotting(df_countries)

def plot_scatter(geo_distance_matrix, semantic_distance_matrix, labels_hover, title):
    """Plot semantic distance against geographic distance for pairs of places.

    Args:
        geo_distance_matrix: pairwise geographic distances.
        semantic_distance_matrix: pairwise semantic distances, same shape.
        labels_hover: hover label of each pair, same shape.
        title: title of the figure.

    For square matrices only the strict upper triangle is used: each
    unordered pair is counted once and the self-pairs (0 on both axes) do not
    skew the regression. Inputs of any other shape are flattened as they are.
    The figure is shown with an OLS trendline and its R2; when no regression
    can be fitted the trendline is omitted and R2 is reported as "n/a".
    """
    geo = np.asarray(geo_distance_matrix)
    semantic = np.asarray(semantic_distance_matrix)
    labels = np.asarray(labels_hover)
    if geo.ndim == 2 and geo.shape[0] == geo.shape[1]:
        pairs = np.triu_indices(geo.shape[0], k=1)
        geo, semantic, labels = geo[pairs], semantic[pairs], labels[pairs]
    else:
        geo, semantic, labels = geo.flatten(), semantic.flatten(), labels.flatten()

    df = pd.DataFrame({
        "Geo Distance": geo,
        "Semantic Distance": semantic,
        "labels": labels.astype(str),
    })

    # A regression needs at least two points with distinct x values.
    can_fit = len(df) >= 2 and df["Geo Distance"].nunique() > 1
    if can_fit:
        _, _, r_value, _, _ = linregress(df["Geo Distance"], df["Semantic Distance"])
        r_squared_text = f'R2 = {r_value**2:.2f}'
    else:
        r_squared_text = 'R2 = n/a'

    # Plot with Plotly Express
    # NOTE(review): the "ols" trendline relies on statsmodels, which this
    # notebook never installs explicitly — confirm it is available.
    fig = px.scatter(df, x="Geo Distance", y="Semantic Distance", title=title,
                     trendline="ols" if can_fit else None,
                     trendline_color_override="red",
                     labels={"Geo Distance": "Geo Distance", "Semantic Distance": "Semantic Distance"},
                     hover_name="labels"
                     )

    # Add R-squared value to the layout (top-left corner, in paper coordinates)
    fig.update_layout(annotations=[
        dict(
            x=0.05,
            y=0.95,
            xref="paper",
            yref="paper",
            text=r_squared_text,
            showarrow=False,
            font=dict(size=12),
            bgcolor="rgba(255, 255, 255, 0.6)"
        )
    ])
    fig.show()

# Worldwide view: every pair of capitals in a single scatter plot
plot_scatter(geo_distance_matrix, semantic_distance_matrix, labels_hover, "World")

In [None]:
# One scatter plot per world region, in order of first appearance.
# groupby() leaves out countries whose region is missing (NaN): selecting them
# with `== NaN` would give an empty frame and np.stack would then fail.
for region, df in df_countries.groupby("Region", sort=False):
    if len(df) < 2:
        # A single capital provides no pair to compare.
        continue
    print(region)
    embedding_array = np.stack(df["capital_embedding"].values)
    semantic_distance_matrix = 1 - cosine_similarity(embedding_array, embedding_array)
    geo_distance_matrix = compute_geo_distance(df)
    labels_hover = label_for_plotting(df)
    plot_scatter(geo_distance_matrix, semantic_distance_matrix, labels_hover, region)

## 3.2 Local LLMs

### 3.2.1 Example

In [None]:
# Rebind `tokenizer` and `model` to Mistral-7B-Instruct (they previously held the
# RoBERTa objects); the Hugging Face token entered earlier is passed to both loaders.
# NOTE(review): a 7B-parameter checkpoint presumably needs tens of GB of memory at
# default precision — confirm the runtime can hold it.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", token=HF_API_TOKEN)
model = AutoModel.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", token=HF_API_TOKEN)

In [None]:
def word_embedding(input_text):
    """Embed ``input_text`` as the mean of the model's last hidden states.

    Uses the module-level ``tokenizer`` and ``model`` (the local LLM loaded
    above). This is a best-effort helper: NaN is returned when tokenisation or
    the forward pass fails (for example when ``input_text`` is not a string).
    """
    try:
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            last_hidden_states = model(input_ids).last_hidden_state
        # Average over dim 1 so that a name split into several sub-tokens still
        # yields one vector; whatever special tokens the tokenizer adds are
        # part of that average. [0] selects the single input of the batch.
        # NOTE(review): assumes dim 1 is the token axis — confirm.
        return last_hidden_states.mean(dim=1)[0]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still interrupts a long run.
        return np.nan

# Embed the six example cities again, this time with the local LLM.
emb1, emb2, emb3, emb4, emb5, emb6 = (
    word_embedding(city_name)
    for city_name in (city1, city2, city3, city4, city5, city6)
)

print(f"Embedding length: {emb1.shape} \n\t{emb1}")

In [None]:
# Cosine similarity between the reference city (city1) and each other example city.
for other_city, other_emb in (
    (city2, emb2),
    (city3, emb3),
    (city4, emb4),
    (city5, emb5),
    (city6, emb6),
):
    print(f"Similarity between {city1} and {other_city}: {cosine_similarity([emb1], [other_emb])}")

### 3.2.2 Worldwide

In [None]:
# Recompute the capital embeddings with the word_embedding() defined for the local LLM,
# overwriting the columns filled in section 3.1.2 (NaN where the embedding failed)
df_countries["capital_embedding_tensor"] = df_countries["Capital"].apply(word_embedding)
# Convert the tensors to NumPy arrays, then drop the rows without a usable embedding
df_countries["capital_embedding"] = df_countries["capital_embedding_tensor"].apply(tensor_to_array)
df_countries = df_countries.dropna(subset=["capital_embedding"])

In [None]:
# One scatter plot per world region, in order of first appearance.
# groupby() leaves out countries whose region is missing (NaN): selecting them
# with `== NaN` would give an empty frame and np.stack would then fail.
for region, df in df_countries.groupby("Region", sort=False):
    if len(df) < 2:
        # A single capital provides no pair to compare.
        continue
    print(region)
    embedding_array = np.stack(df["capital_embedding"].values)
    semantic_distance_matrix = 1 - cosine_similarity(embedding_array, embedding_array)
    geo_distance_matrix = compute_geo_distance(df)
    labels_hover = label_for_plotting(df)
    plot_scatter(geo_distance_matrix, semantic_distance_matrix, labels_hover, region)

## 3.3  Remote LLMs

### 3.3.1 Example

In [None]:
!pip install langchain
!pip install openai==0.28
!pip install tiktoken

In [None]:
import openai
from langchain.embeddings import OpenAIEmbeddings

# Name of a tokenizer encoding, kept as a plain string (the former trailing
# comma turned it into a 1-tuple). It is not used by the cells shown here.
tok = 'cl100k_base'
# Remote embedding client. This rebinds `model`, which previously held the
# local LLM, so word_embedding() must be redefined before it is used again.
model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [None]:
def word_embedding(input_text):
    """Return the remote model's embedding of ``input_text`` as a NumPy array.

    The text is submitted as a one-item batch to the module-level ``model``
    and the single resulting vector is converted to an array.
    """
    batch = [input_text]
    vectors = model.embed_documents(batch)
    return np.array(vectors[0])


In [None]:
# Embed the six example cities again, this time through the remote API.
emb1, emb2, emb3, emb4, emb5, emb6 = (
    word_embedding(city_name)
    for city_name in (city1, city2, city3, city4, city5, city6)
)

print(f"Embedding length: {emb1.shape} \n\t{emb1}")

In [None]:
# Cosine similarity between the reference city (city1) and each other example city.
for other_city, other_emb in (
    (city2, emb2),
    (city3, emb3),
    (city4, emb4),
    (city5, emb5),
    (city6, emb6),
):
    print(f"Similarity between {city1} and {other_city}: {cosine_similarity([emb1], [other_emb])}")

### 3.3.2 Worldwide

In [None]:
# Re-embed every capital with the remote model first: without this step the
# "capital_embedding" column still holds the vectors computed in section 3.2.2
# and the plots below would repeat the local-LLM results.
df_countries["capital_embedding"] = df_countries["Capital"].apply(word_embedding)

# One scatter plot per world region, in order of first appearance.
# groupby() leaves out countries whose region is missing (NaN): selecting them
# with `== NaN` would give an empty frame and np.stack would then fail.
for region, df in df_countries.groupby("Region", sort=False):
    if len(df) < 2:
        # A single capital provides no pair to compare.
        continue
    print(region)
    embedding_array = np.stack(df["capital_embedding"].values)
    semantic_distance_matrix = 1 - cosine_similarity(embedding_array, embedding_array)
    geo_distance_matrix = compute_geo_distance(df)
    labels_hover = label_for_plotting(df)
    plot_scatter(geo_distance_matrix, semantic_distance_matrix, labels_hover, region)

## 3.4 *Going Further*: 

### 3.4.1 Using other LLMs

### 3.4.2 Build clusters of countries that are semantically close

Use K-Means (n=10 clusters), Hierarchical Clustering, or DBSCAN to cluster countries.

A low correlation between geographical distance and semantic distance between location embeddings suggests that the semantic distance (captured by the embedding space) is not strongly related to the geographical distance between locations. This could mean that the semantic relationships are more influenced by cultural, historical, or sociological factors rather than geographical distance.

Clustering of countries may highlight cultural or historical relationships between countries.