# 2. Spatial information coverage in training datasets

**Authors**

| Author      | Affiliation            |
|-------------|------------------------|
| Rémy Decoupes    | INRAE / TETIS      |
| Mathieu Roche  | CIRAD / TETIS |
| Maguelonne Teisseire | INRAE / TETIS            |

![TETIS](https://www.umr-tetis.fr/images/logo-header-tetis.png)

In [None]:
# Installation
!pip install -U bitsandbytes
!pip install transformers==4.37.2
!pip install -U git+https://github.com/huggingface/peft.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install openai==0.28

In [1]:
from transformers import BertModel, BertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

list_of_models = {
    'bert': {
        'name': 'bert-base-uncased',
        'tokenizer': BertTokenizer.from_pretrained('bert-base-uncased'),
        'model': BertModel.from_pretrained('bert-base-uncased'),
        'mask': "[MASK]",
        'type': "SLM"
    },
    'bert-base-multilingual-uncased':{
        'name': 'bert-base-multilingual-uncased',
        'tokenizer': AutoTokenizer.from_pretrained('bert-base-multilingual-uncased'),
        'model': BertModel.from_pretrained('bert-base-multilingual-uncased'),
        'mask': "[MASK]",
        'type': "SLM"
    },
    'roberta': {
        'name': 'roberta-base',
        'tokenizer': AutoTokenizer.from_pretrained('roberta-base'),
        'model': RobertaModel.from_pretrained('roberta-base'),
        'mask': "<mask>",
        'type': "SLM"
    },
    'xlm-roberta-base': {
        'name': 'xlm-roberta-base',
        'tokenizer': AutoTokenizer.from_pretrained('xlm-roberta-base'),
        'model': RobertaModel.from_pretrained('xlm-roberta-base'),
        'mask': "<mask>",
        'type': "SLM"
    },
    'mistral': {
        'name': 'mistralai/Mistral-7B-Instruct-v0.1',
        'type': "LLM_local"
    },
    'llama2': {
        'name': 'meta-llama/Llama-2-7b-chat-hf',
        'type': "LLM_local"
    },
    'chatgpt':{
        'name': 'gpt-3.5-turbo-0301',
        'type': "LLM_remote_api"
    },
}

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


**Initiate API Key**

- HuggingFace 
- OpenAI

In [18]:
import getpass
 
HF_API_TOKEN = getpass.getpass(prompt="Your huggingFace API Key")
OPENAI_API_KEY = getpass.getpass(prompt="Your OpenAI API Key")

## 2.1 SLMs

### 2.1.1 Example

Let's see if "Tapei" is part of Roberta-base vocabulary

In [35]:
model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

print(f"Size of {model_name} vocabulary: {len(tokenizer.get_vocab())}")
tokenizer.get_vocab()

Size of roberta-base vocabulary: 50265


{'proxy': 47315,
 'Ġ960': 39811,
 'Products': 47559,
 'ardo': 6782,
 'ĠKar': 4077,
 'Its': 30872,
 'Ġevils': 40367,
 'ĠTechnologies': 5974,
 'ĠkWh': 42159,
 'Ġabdominal': 28670,
 'Ġproduces': 9108,
 'L': 574,
 'priv': 25943,
 '692': 39311,
 'Ġproc': 17987,
 'Ġribbon': 21041,
 'ĠSerbian': 24229,
 'ĠAGA': 32114,
 'strip': 34216,
 '--------------------': 47655,
 'Sword': 48728,
 '````': 49972,
 'ĠAlexandria': 15748,
 'ĠSeg': 17324,
 'Trans': 19163,
 'Ġriots': 21224,
 'Ġwhole': 1086,
 'Ġadditives': 36254,
 'Ġspectacle': 20286,
 'Ġballot': 5250,
 'ARP': 30711,
 'ateg': 27586,
 'Ġdisapprove': 38509,
 'iddles': 40741,
 'Ġholders': 9758,
 'money': 17479,
 'sector': 18658,
 'Ġnitrogen': 23040,
 'Ġpetitioner': 31390,
 'Instruct': 48493,
 'Bre': 31607,
 'ĠSikh': 24842,
 'imbabwe': 39329,
 'Ġdeepest': 19762,
 'Ġbeings': 14766,
 'oshop': 46491,
 'Ġpoliceman': 20976,
 'Hon': 35846,
 'ĠHave': 6319,
 '!': 328,
 'Ġliked': 6640,
 'ĠFest': 13326,
 'Personal': 43854,
 'Ġparaph': 40127,
 'ĠIS': 3703,
 'Ġsi

In [36]:
city = "Taipei"
print(f"Is {city} (without uppercase) in vocab ?: {str.lower(city) in tokenizer.get_vocab() or str.lower('Ġ' + city) in tokenizer.get_vocab()}")
print(f"Is {city} (with uppercase) in vocab ?: {city in tokenizer.get_vocab() or str('Ġ' + city) in tokenizer.get_vocab()}")

Is Taipei (without uppercase) in vocab ?: False
Is Taipei (with uppercase) in vocab ?: False


In [16]:
city = "London"
print(f"Is {city} (without uppercase) in vocab ?: {str.lower(city) in tokenizer.get_vocab() or str.lower('Ġ' + city) in tokenizer.get_vocab()}")
print(f"Is {city} (with uppercase) in vocab ?: {city in tokenizer.get_vocab() or str('Ġ' + city) in tokenizer.get_vocab()}")

Is London (without uppercase) in vocab ?: False
Is London (with uppercase) in vocab ?: True


## 2.2 Local LLMs

### 2.2.1 Example


In [33]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_API_TOKEN)

print(f"Size of {model_name} vocabulary: {len(tokenizer.get_vocab())}")
tokenizer.get_vocab()

Size of mistralai/Mistral-7B-Instruct-v0.1 vocabulary: 32000


{'iled': 4360,
 '▁Letter': 22279,
 '▁▁▁▁▁▁▁▁▁▁▁▁▁': 569,
 '--)': 26107,
 'xFF': 4570,
 '仁': 31257,
 '}]': 10157,
 'än': 5300,
 'яви': 20241,
 '="${': 28144,
 '<0x8B>': 142,
 '▁Ham': 5058,
 '▁Cra': 17129,
 '▁compilation': 26383,
 '<0xAD>': 176,
 'FAIL': 12546,
 '▁chang': 2265,
 '平': 29549,
 '▁conviction': 24594,
 ';;': 19406,
 'циона': 28412,
 '▁born': 5381,
 'izado': 25018,
 'utils': 7284,
 '▁Community': 11027,
 'endregion': 26034,
 'wall': 11653,
 '▁Delhi': 21548,
 '▁Leo': 19795,
 'gamma': 3933,
 'Profile': 8721,
 '▁App': 3122,
 '▁président': 25446,
 'ព': 31196,
 'fif': 20773,
 'constraint': 27122,
 'Current': 6086,
 '▁phenomenon': 20757,
 '散': 31426,
 '/*': 1477,
 '▁Direct': 6055,
 'patient': 27792,
 '),\\': 19908,
 'Present': 19618,
 '▁apart': 7413,
 'AK': 13715,
 '▁cv': 19342,
 '菜': 30038,
 '▁arrange': 23503,
 '간': 30112,
 '▁partners': 11796,
 'ろ': 31149,
 '▁pra': 13066,
 'Factor': 20169,
 '▁trailing': 27166,
 'լ': 30834,
 'prints': 25580,
 'enabled': 9474,
 'createElement': 17023,

In [34]:
city = "Taipei"
print(f"Is {city} (without uppercase) in vocab ?: {str.lower(city) in tokenizer.get_vocab() or str.lower('Ġ' + city) in tokenizer.get_vocab()}")
print(f"Is {city} (with uppercase) in vocab ?: {city in tokenizer.get_vocab() or str('Ġ' + city) in tokenizer.get_vocab()}")

Is Taipei (without uppercase) in vocab ?: False
Is Taipei (with uppercase) in vocab ?: False


In [23]:
city = "London"
print(f"Is {city} (without uppercase) in vocab ?: {str.lower(city) in tokenizer.get_vocab() or str.lower('Ġ' + city) in tokenizer.get_vocab()}")
print(f"Is {city} (with uppercase) in vocab ?: {city in tokenizer.get_vocab() or str('Ġ' + city) in tokenizer.get_vocab()}")

Is London (without uppercase) in vocab ?: False
Is London (with uppercase) in vocab ?: True


## 2.3 Remote LLMs

### 2.3.1 Example

In [29]:
!pip install tiktoken
!pip install openai

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.6.0
Collecting openai
  Downloading openai-1.16.2-py3-none-any.whl.metadata (21 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.3.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.6.4-py3-none-any.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [31]:
import tiktoken

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [37]:
city = "Taipei"

tokenizer.encode(city)

[0, 41064, 24309, 2]

In [38]:
for token in tokenizer.encode(city):
    print(f"token {token}: {tokenizer.decode(token)}")

token 0: <s>
token 41064: Tai
token 24309: pei
token 2: </s>


In [39]:
city = "London"

tokenizer.encode(city)

[0, 23122, 2]