# 2. Spatial information coverage in training datasets

**Authors**

| Author      | Affiliation            |
|-------------|------------------------|
| Rémy Decoupes    | INRAE / TETIS      |
| Mathieu Roche  | CIRAD / TETIS |
| Maguelonne Teisseire | INRAE / TETIS            |

![TETIS](https://www.umr-tetis.fr/images/logo-header-tetis.png)

In [None]:
from transformers import AutoTokenizer

```python
from transformers import BertModel, BertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import XLMRobertaModel


# Registry of the models compared in this notebook, keyed by a short alias.
# Every entry has:
#   'name': the model identifier (the same string passed to from_pretrained
#           for the entries that load a model here)
#   'type': "SLM", "LLM_local" or "LLM_remote_api"
# "SLM" entries additionally carry a loaded 'tokenizer', a loaded 'model' and
# the 'mask' token string of that tokenizer family.
# NOTE: the four SLM tokenizers and models are loaded eagerly while this dict
# literal is evaluated.
list_of_models = {
    'bert': {
        'name': 'bert-base-uncased',
        'tokenizer': BertTokenizer.from_pretrained('bert-base-uncased'),
        'model': BertModel.from_pretrained('bert-base-uncased'),
        'mask': "[MASK]",
        'type': "SLM"
    },
    'bert-base-multilingual-uncased':{
        'name': 'bert-base-multilingual-uncased',
        'tokenizer': AutoTokenizer.from_pretrained('bert-base-multilingual-uncased'),
        'model': BertModel.from_pretrained('bert-base-multilingual-uncased'),
        'mask': "[MASK]",
        'type': "SLM"
    },
    'roberta': {
        'name': 'roberta-base',
        'tokenizer': AutoTokenizer.from_pretrained('roberta-base'),
        'model': RobertaModel.from_pretrained('roberta-base'),
        'mask': "<mask>",
        'type': "SLM"
    },
    'xlm-roberta-base': {
        'name': 'xlm-roberta-base',
        'tokenizer': AutoTokenizer.from_pretrained('xlm-roberta-base'),
        # Use the class matching the checkpoint's model type: loading an
        # xlm-roberta checkpoint through RobertaModel makes transformers warn
        # about instantiating a model of a different type.
        'model': XLMRobertaModel.from_pretrained('xlm-roberta-base'),
        'mask': "<mask>",
        'type': "SLM"
    },
    'mistral': {
        'name': 'mistralai/Mistral-7B-Instruct-v0.1',
        'type': "LLM_local"
    },
    'llama2': {
        'name': 'meta-llama/Llama-2-7b-chat-hf',
        'type': "LLM_local"
    },
    'chatgpt':{
        'name': 'gpt-3.5-turbo-0301',
        'type': "LLM_remote_api"
    },
}
```

**Initialize API keys**

- HuggingFace 
- OpenAI

In [None]:
import getpass
 
# Ask for both credentials interactively so they are never stored in the
# notebook itself. HF_API_TOKEN is passed to AutoTokenizer.from_pretrained
# when the Mistral tokenizer is loaded later on.
HF_API_TOKEN = getpass.getpass(prompt="Your huggingFace API Key")
OPENAI_API_KEY = getpass.getpass(prompt="Your OpenAI API Key")

**Geo Datasets**

In [None]:
# Third-party packages for the geo-dataset cell: countryinfo, shapely and
# geopandas are imported right below; matplotlib is presumably needed for the
# .plot() maps drawn later (TODO confirm).
!pip install countryinfo
!pip install shapely
!pip install geopandas
!pip install matplotlib

In [None]:
from countryinfo import CountryInfo
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import geopandas as gpd

def _value_or_nan(getter):
    """Return ``getter()``, or ``np.nan`` when the lookup fails.

    A failed lookup for one country must not abort the whole loop, so any
    ordinary exception is turned into a missing value.
    """
    try:
        return getter()
    except Exception:  # best-effort lookup: any failure means "no data"
        # np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.
        return np.nan


def _main_polygon(geo_json):
    """Build a single shapely ``Polygon`` from a country's GeoJSON.

    Only the first feature is read. For a ``Polygon`` geometry its first ring
    is used. Any other geometry is treated as a ``MultiPolygon`` and the part
    whose first ring has the most vertices is kept.
    """
    geometry = geo_json["features"][0]["geometry"]
    if geometry["type"] == "Polygon":
        return Polygon(geometry["coordinates"][0])
    # MultiPolygon: take the biggest part (largest number of ring vertices).
    biggest_part = max(geometry["coordinates"], key=lambda part: len(part[0]))
    return Polygon(biggest_part[0])


country = CountryInfo()

countries = []
capitals = []
regions = []
subregions = []
coordinates = []

# One row per country known to countryinfo; every field falls back to NaN.
for c in country.all():
    country_info = CountryInfo(c)
    countries.append(c)
    regions.append(_value_or_nan(country_info.region))
    subregions.append(_value_or_nan(country_info.subregion))
    # geo_json() is fetched once per country and reused for the whole geometry.
    coordinates.append(_value_or_nan(lambda: _main_polygon(country_info.geo_json())))
    capitals.append(_value_or_nan(country_info.capital))

# Create DataFrame
data = {
    'Country': countries,
    'Capital': capitals,
    'Region': regions,
    'Subregion': subregions,
    'Coordinates': coordinates
}

df_countries = pd.DataFrame(data)
# The polygons become the active geometry so the table can be drawn as a map.
df_countries = gpd.GeoDataFrame(df_countries, geometry='Coordinates')

# Display DataFrame
df_countries

## 2.1 SLMs

### 2.1.1 Example

Let's see if "Taipei" is part of the roberta-base vocabulary.

In [None]:
# Load the tokenizer named by model_name, report its vocabulary size and
# display the vocabulary itself.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

vocab = tokenizer.get_vocab()
print(f"Size of {model_name} vocabulary: {len(vocab)}")
vocab

In [None]:
city = "Taipei"
vocab = tokenizer.get_vocab()

# 'Ġ' is the marker that byte-level BPE vocabularies (such as RoBERTa's) put
# in front of a token that follows a space. Only the city is lowercased:
# lowercasing the marker as well turns 'Ġ' into 'ġ', which never matches.
lowercase_hit = city.lower() in vocab or "Ġ" + city.lower() in vocab
cased_hit = city in vocab or "Ġ" + city in vocab

print(f"Is {city} (without uppercase) in vocab ?: {lowercase_hit}")
print(f"Is {city} (with uppercase) in vocab ?: {cased_hit}")

In [None]:
city = "London"
vocab = tokenizer.get_vocab()

# 'Ġ' is the marker that byte-level BPE vocabularies (such as RoBERTa's) put
# in front of a token that follows a space. Only the city is lowercased:
# lowercasing the marker as well turns 'Ġ' into 'ġ', which never matches.
lowercase_hit = city.lower() in vocab or "Ġ" + city.lower() in vocab
cased_hit = city in vocab or "Ġ" + city in vocab

print(f"Is {city} (without uppercase) in vocab ?: {lowercase_hit}")
print(f"Is {city} (with uppercase) in vocab ?: {cased_hit}")

### 2.1.2 Worldwide

In [None]:
# Tokenizer whose vocabulary is searched by in_vocab() in this cell.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def in_vocab(city, vocab=None, word_prefix="Ġ"):
    """Tell whether ``city`` is a single entry of the tokenizer vocabulary.

    The name is looked up as written and lowercased, each time both bare and
    preceded by ``word_prefix``.

    Args:
        city: Name to look up. Non-string values (e.g. NaN for a missing
            capital) are reported as absent.
        vocab: Container of vocabulary tokens supporting ``in``. Defaults to
            ``tokenizer.get_vocab()`` of the module-level ``tokenizer``.
        word_prefix: Marker found in front of vocabulary tokens that follow a
            space ('Ġ' in byte-level BPE vocabularies such as RoBERTa's).

    Returns:
        True if any of the four spellings is in the vocabulary, else False.
    """
    if not isinstance(city, str):
        return False
    if vocab is None:
        # Fetch the vocabulary once per call instead of once per spelling.
        vocab = tokenizer.get_vocab()
    # Lowercase the city only: lowercasing the marker too turns 'Ġ' into 'ġ',
    # which is never a vocabulary prefix.
    lowered = city.lower()
    spellings = (city, lowered, word_prefix + city, word_prefix + lowered)
    return any(spelling in vocab for spelling in spellings)

# Flag every country whose capital is found by in_vocab(), then display the
# updated table.
df_countries["in_vocab"] = df_countries["Capital"].apply(in_vocab)
df_countries

In [None]:
# Percentage of capitals flagged as in-vocabulary, per value of "Region".
accuracy_by_continent = df_countries.groupby("Region")["in_vocab"].mean().mul(100)
accuracy_by_continent

In [None]:
# Map of the countries coloured by the "in_vocab" flag (red-yellow-green scale).
df_countries.plot(column="in_vocab", cmap="RdYlGn")

## 2.2 Local LLMs

### 2.2.1 Example


In [None]:
# Load the tokenizer named by model_name (authenticated with HF_API_TOKEN),
# report its vocabulary size and display the vocabulary itself.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_API_TOKEN)

vocab = tokenizer.get_vocab()
print(f"Size of {model_name} vocabulary: {len(vocab)}")
vocab

In [None]:
city = "Taipei"
vocab = tokenizer.get_vocab()

# Mistral's tokenizer is SentencePiece-based: tokens that start a word are
# prefixed with '▁' (U+2581), not with the 'Ġ' of byte-level BPE vocabularies.
# Only the city is lowercased so the marker stays intact.
lowercase_hit = city.lower() in vocab or "▁" + city.lower() in vocab
cased_hit = city in vocab or "▁" + city in vocab

print(f"Is {city} (without uppercase) in vocab ?: {lowercase_hit}")
print(f"Is {city} (with uppercase) in vocab ?: {cased_hit}")

In [None]:
city = "London"
vocab = tokenizer.get_vocab()

# Mistral's tokenizer is SentencePiece-based: tokens that start a word are
# prefixed with '▁' (U+2581), not with the 'Ġ' of byte-level BPE vocabularies.
# Only the city is lowercased so the marker stays intact.
lowercase_hit = city.lower() in vocab or "▁" + city.lower() in vocab
cased_hit = city in vocab or "▁" + city in vocab

print(f"Is {city} (without uppercase) in vocab ?: {lowercase_hit}")
print(f"Is {city} (with uppercase) in vocab ?: {cased_hit}")

### 2.2.2 Worldwide

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Tokenizer whose vocabulary is searched by in_vocab() in this cell; the
# Hugging Face token entered earlier is passed to from_pretrained
# (presumably because the repository requires authentication - TODO confirm).
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_API_TOKEN)

def in_vocab(city, vocab=None, word_prefix="▁"):
    """Tell whether ``city`` is a single entry of the tokenizer vocabulary.

    The name is looked up as written and lowercased, each time both bare and
    preceded by ``word_prefix``.

    Args:
        city: Name to look up. Non-string values (e.g. NaN for a missing
            capital) are reported as absent.
        vocab: Container of vocabulary tokens supporting ``in``. Defaults to
            ``tokenizer.get_vocab()`` of the module-level ``tokenizer``.
        word_prefix: Marker found in front of vocabulary tokens that start a
            word. The default '▁' (U+2581) is the SentencePiece marker used
            by Mistral's tokenizer; the 'Ġ' of byte-level BPE vocabularies
            does not occur there.

    Returns:
        True if any of the four spellings is in the vocabulary, else False.
    """
    if not isinstance(city, str):
        return False
    if vocab is None:
        # Fetch the vocabulary once per call instead of once per spelling.
        vocab = tokenizer.get_vocab()
    # Lowercase the city only, so the marker is never altered.
    lowered = city.lower()
    spellings = (city, lowered, word_prefix + city, word_prefix + lowered)
    return any(spelling in vocab for spelling in spellings)

# Flag every country whose capital is found by in_vocab(), then display the
# updated table. This overwrites any earlier "in_vocab" column.
df_countries["in_vocab"] = df_countries["Capital"].apply(in_vocab)
df_countries


In [None]:
# Percentage of capitals flagged as in-vocabulary, per value of "Region".
accuracy_by_continent = df_countries.groupby("Region")["in_vocab"].mean().mul(100)
accuracy_by_continent

In [None]:
# Map of the countries coloured by the "in_vocab" flag (red-yellow-green scale).
df_countries.plot(column="in_vocab", cmap="RdYlGn")

## 2.3 Remote LLMs

### 2.3.1 Example

In [None]:
# tiktoken provides the tokenizer used in the next cells; openai is installed
# as well but is not imported in the cells shown here.
!pip install tiktoken
!pip install openai

In [None]:
import tiktoken

# Encoding that tiktoken associates with the "gpt-3.5-turbo" model name; it
# replaces the previous `tokenizer` for the rest of this section.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
city = "Taipei"

# Display the encoding of the city name; the next cells count and decode its
# tokens.
tokenizer.encode(city)

In [None]:
# True when the city name is encoded as more than one token, i.e. it is split
# into sub-tokens.
len(tokenizer.encode(city)) > 1

In [None]:
# Decode each token on its own to show which piece of text it stands for.
for token in tokenizer.encode(city):
    print(f"token {token}: {tokenizer.decode([token])}")

In [None]:
city = "London"

# Display the encoding of this second city name, for comparison with the
# previous example.
tokenizer.encode(city)

### 2.3.2 Worldwide

In [None]:
# Encoding that tiktoken associates with the "gpt-3.5-turbo" model name; used
# by in_vocab() below.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

def in_vocab(city, encoding=None):
    """Tell whether ``city`` is encoded as exactly one token.

    Args:
        city: Name to encode. Non-string values (e.g. NaN for a missing
            capital) are reported as absent.
        encoding: Object exposing ``encode(text)`` and returning a sized
            sequence of tokens. Defaults to the module-level ``tokenizer``.

    Returns:
        True if the encoding of ``city`` has length 1 (no sub-tokens),
        otherwise False.
    """
    if not isinstance(city, str):
        return False
    if encoding is None:
        encoding = tokenizer
    try:
        return len(encoding.encode(city)) == 1  # a single token: no sub-tokens
    except ValueError:
        # NOTE(review): tiktoken is expected to raise ValueError for text that
        # contains special tokens; such names are reported as absent, as the
        # original best-effort code did.
        return False

# Flag every country whose capital is found by in_vocab().
df_countries["in_vocab"] = df_countries["Capital"].apply(in_vocab)

# Percentage of flagged capitals, per value of "Region".
accuracy_by_continent = df_countries.groupby("Region")["in_vocab"].mean().mul(100)
accuracy_by_continent

In [None]:
# Map of the countries coloured by the "in_vocab" flag (red-yellow-green scale).
df_countries.plot(column="in_vocab", cmap="RdYlGn")

## 2.4 *Going Further*: 

### 2.4.1 Using other LLMs

1. Evaluate the tokenizers of other LLMs such as meta/Llama-3, Microsoft/Phi-3 or Alibaba/Qwen1.5
2. Use other remote API like Cohere or Groq

### 2.4.2 How can we explain the very good geographic knowledge of LLMs when their vocabulary contains so few locations?

**Hypothesis**: LLMs encountered many locations during their training, but these are drowned out by the sheer quantity of other words, so few of them get a vocabulary entry of their own. Nevertheless, the subtokens that make up a location name yield a good geographical representation once merged.

To validate this hypothesis, we could evaluate the proportion of subtokens from LLM and SLM tokenizers.