# 4. Anomaly between geographical distance and semantic distance

**Authors**

| Author      | Affiliation            |
|-------------|------------------------|
| Rémy Decoupes    | INRAE / TETIS      |
| Mathieu Roche  | CIRAD / TETIS |
| Maguelonne Teisseire | INRAE / TETIS            |

![TETIS](https://www.umr-tetis.fr/images/logo-header-tetis.png)

In [None]:
from countryinfo import CountryInfo
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import geopandas as gpd

# Build one row per country known to `countryinfo`: name, capital, region,
# subregion and a single outline polygon (used later to draw the maps).
country = CountryInfo()

countries = []
capitals = []
regions = []
subregions = []
coordinates = []


def _value_or_nan(getter):
    """Return ``getter()``, or NaN when the lookup fails for this country."""
    try:
        return getter()
    except Exception:
        # Best effort: some countries lack a field. The exact exception types
        # raised by countryinfo are not visible here, so stay broad but let
        # KeyboardInterrupt / SystemExit propagate.
        # `np.nan` is the canonical spelling that works on NumPy 1.x and 2.x.
        return np.nan


def _main_outline(geo_json):
    """Return the country outline as one Polygon built from its GeoJSON."""
    geometry = geo_json["features"][0]["geometry"]
    if geometry["type"] == "Polygon":
        return Polygon(geometry["coordinates"][0])
    # MultiPolygon: keep the part whose exterior ring has the most vertices.
    biggest_part = max(geometry["coordinates"], key=lambda part: len(part[0]))
    return Polygon(biggest_part[0])


for c in country.all():
    country_info = CountryInfo(c)
    countries.append(c)
    regions.append(_value_or_nan(country_info.region))
    subregions.append(_value_or_nan(country_info.subregion))
    # geo_json() is fetched once per country and reused by the helper.
    coordinates.append(
        _value_or_nan(lambda: _main_outline(country_info.geo_json()))
    )
    capitals.append(_value_or_nan(country_info.capital))

# Create DataFrame
data = {
    'Country': countries,
    'Capital': capitals,
    'Region': regions,
    'Subregion': subregions,
    'Coordinates': coordinates,
}

df_countries = pd.DataFrame(data)
# Promote to a GeoDataFrame so the outlines can be plotted directly.
df_countries = gpd.GeoDataFrame(df_countries, geometry='Coordinates')


## 4.1 SLMs and Local LLMs

This indicator aims to identify which capitals are, on average, the farthest from all the others in the semantic (embedding) space.

Compute the **semantic distance matrix**.

In [None]:
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Checkpoint name; it is also interpolated into the map title further down.
model_name = "roberta-base"

# Load the tokenizer and the encoder for that checkpoint. Both are used as
# module-level globals by `word_embedding`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

def word_embedding(input_text):
    """Embed a piece of text as one vector using the module-level model.

    The text is encoded with ``tokenizer``, run through ``model`` without
    gradient tracking, and the last hidden states are averaged over the token
    axis. Averaging gives a single vector even when a word is split into
    several sub-tokens; the special tokens added by the tokenizer are part of
    the average.

    Args:
        input_text: Text to embed (here, a capital name).

    Returns:
        A 1-D ``torch.Tensor``, or ``np.nan`` when ``input_text`` is not a
        string or the tokenizer/model call fails.
    """
    if not isinstance(input_text, str):
        # Missing capitals are stored as NaN: there is nothing to embed.
        return np.nan
    try:
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        with torch.no_grad():
            last_hidden_states = model(input_ids).last_hidden_state
        # Mean over the token axis, then drop the batch axis of size 1.
        return last_hidden_states.mean(dim=1)[0]
    except Exception:
        # Best effort: one bad input must not abort the whole column, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return np.nan
    
def tensor_to_array(embedding):
    """Convert an embedding tensor to a NumPy array.

    Args:
        embedding: A ``torch.Tensor``, or a missing-value marker (NaN) left by
            a failed embedding.

    Returns:
        The tensor's values as a ``numpy.ndarray``, or ``np.nan`` when
        ``embedding`` is not a tensor.
    """
    if not isinstance(embedding, torch.Tensor):
        return np.nan
    # detach()/cpu() make the conversion valid for tensors that track
    # gradients or live on another device; for plain CPU tensors they are
    # no-ops.
    return embedding.detach().cpu().numpy()


In [None]:
# Embed every capital, keeping both the raw tensor and its NumPy form.
df_countries["capital_embedding_tensor"] = df_countries["Capital"].map(word_embedding)
df_countries["capital_embedding"] = df_countries["capital_embedding_tensor"].map(tensor_to_array)

# Rows whose embedding could not be computed carry NaN and are discarded.
df_countries = df_countries.dropna(subset=["capital_embedding"])

# One row per capital; pairwise cosine distance = 1 - cosine similarity.
embedding_array = np.stack(df_countries["capital_embedding"].to_numpy())
semantic_distance_matrix = 1 - cosine_similarity(embedding_array)

In [None]:
# Label the distance matrix with capital names on both axes, then average
# each column: one mean distance per capital (its own zero self-distance is
# part of that mean).
df_results_mean_semantique = (
    pd.DataFrame(
        semantic_distance_matrix,
        index=df_countries["Capital"].values,
        columns=df_countries["Capital"].values,
    )
    .mean()
    .rename("average_semantic_distance")
    .to_frame()
)

# Attach the country attributes and geometry by matching on the capital name.
df_country_mean_semantique = gpd.GeoDataFrame(
    df_results_mean_semantique.join(df_countries.set_index("Capital"), how="inner"),
    geometry='Coordinates',
)
df_country_mean_semantique


In [None]:
import matplotlib.pyplot as plt

# Create the figure first: `ax` must exist before anything is drawn on it.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# Grey world basemap. `read_file` only loads the data; colour and target axes
# are plotting options and belong to `.plot`.
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases;
# confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.plot(ax=ax, color='lightgrey')

# Overlay the countries, coloured by their capital's mean semantic distance.
df_country_mean_semantique.plot(column='average_semantic_distance', ax=ax, legend=True)
ax.set_title(f"mean_semantique by Country for {model_name}", fontsize=20)
plt.show()



## 4.2 Remote LLMs

In [None]:
import getpass
 
# Ask for the API key interactively instead of hard-coding it in the notebook.
OPENAI_API_KEY = getpass.getpass(prompt="Your OpenAI API Key")

In [None]:
import openai
from langchain.embeddings import OpenAIEmbeddings

# Name of the remote embedding model; also shown in the map title.
model_name = "text-embedding-ada-002"

# Encoding name kept as a plain string (a stray trailing comma previously
# turned it into a 1-tuple).
tok = 'cl100k_base'

# Pass the model name explicitly so the client and the title label cannot
# drift apart. This rebinds `model`, which `word_embedding` reads as a global.
model = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)


In [None]:
def word_embedding(input_text):
    """Embed a piece of text with the module-level remote embedding client.

    Args:
        input_text: Text to embed (here, a capital name).

    Returns:
        The embedding as a 1-D ``numpy.ndarray``, or ``np.nan`` when
        ``input_text`` is not a string or the request fails.
    """
    if not isinstance(input_text, str):
        # Missing capitals are stored as NaN: skip the remote call entirely.
        return np.nan
    try:
        # embed_documents takes a list of texts; only the single result is kept.
        return np.array(model.embed_documents([input_text])[0])
    except Exception:
        # The original evaluated `np.nan` without returning it, so failures
        # came back as None; return the missing-value marker explicitly.
        return np.nan

In [None]:
# Embed every capital with the remote model, overwriting the earlier column.
df_countries["capital_embedding"] = df_countries["Capital"].map(word_embedding)

# Capitals without an embedding are discarded before building the matrix.
df_countries = df_countries.dropna(subset=["capital_embedding"])

# One row per capital; pairwise cosine distance = 1 - cosine similarity.
embedding_array = np.stack(df_countries["capital_embedding"].to_numpy())
semantic_distance_matrix = 1 - cosine_similarity(embedding_array)

# Square table of pairwise distances, labelled by capital on both axes.
capital_labels = df_countries["Capital"].values
df_results_mean_semantique = pd.DataFrame(
    semantic_distance_matrix, index=capital_labels, columns=capital_labels
)

# Column-wise mean gives one average distance per capital (its own zero
# self-distance is part of that mean), stored as a one-column frame.
df_results_mean_semantique = (
    df_results_mean_semantique.mean().rename("average_semantic_distance").to_frame()
)

# Bring back the country attributes and geometry, matched on the capital name.
countries_by_capital = df_countries.set_index("Capital")
df_country_mean_semantique = gpd.GeoDataFrame(
    df_results_mean_semantique.join(countries_by_capital, how="inner"),
    geometry='Coordinates',
)
df_country_mean_semantique

In [None]:
# Display the countries table after the remote-model embeddings were added.
df_countries

In [None]:
import matplotlib.pyplot as plt

# Create the figure first: `ax` must exist before anything is drawn on it
# (otherwise a fresh run fails, or a stale axes from an earlier cell is reused).
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# Grey world basemap. `read_file` only loads the data; colour and target axes
# are plotting options and belong to `.plot`.
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases;
# confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.plot(ax=ax, color='lightgrey')

# Overlay the countries, coloured by their capital's mean semantic distance.
df_country_mean_semantique.plot(column='average_semantic_distance', ax=ax, legend=True)
ax.set_title(f"mean_semantique by Country for {model_name}", fontsize=20)
plt.show()
