# How do we make software understand language?

In [1]:
import os

In [2]:
# Download the tutorial corpus once; later cells read "dutch.csv" from the
# working directory.
if not os.path.isfile("dutch.csv"):
  # Pure-Python download: works where the `wget` binary is not installed.
  import urllib.request

  # Fetch to a temporary name and rename afterwards, so an interrupted
  # download never leaves a truncated "dutch.csv" that the guard would accept.
  urllib.request.urlretrieve(
      "https://raw.githubusercontent.com/tijsg/lm-tutorial/main/dutch.csv",
      "dutch.csv.part",
  )
  os.replace("dutch.csv.part", "dutch.csv")

# Background: how a language model reads text

## Analogy with language tests
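A masked language model is trained much like a fill-in-the-blank language test: one word is hidden from a sentence and the model has to fill in the missing word.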

## Encoding
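Every string is already encoded character by character: each character is stored as an integer. `ord` returns the character's Unicode code point, which for the ASCII letters below is the same number as its UTF-8 byte value.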

In [3]:
# ord() gives the Unicode code point of a character; for these ASCII letters
# it equals the single byte used by UTF-8.
for char in "abcdefghijklmnopqrstuvwxyz":
  print(f"UTF-8 code for char {char} is: {ord(char)}")

UTF-8 code for char a is: 97
UTF-8 code for char b is: 98
UTF-8 code for char c is: 99
UTF-8 code for char d is: 100
UTF-8 code for char e is: 101
UTF-8 code for char f is: 102
UTF-8 code for char g is: 103
UTF-8 code for char h is: 104
UTF-8 code for char i is: 105
UTF-8 code for char j is: 106
UTF-8 code for char k is: 107
UTF-8 code for char l is: 108
UTF-8 code for char m is: 109
UTF-8 code for char n is: 110
UTF-8 code for char o is: 111
UTF-8 code for char p is: 112
UTF-8 code for char q is: 113
UTF-8 code for char r is: 114
UTF-8 code for char s is: 115
UTF-8 code for char t is: 116
UTF-8 code for char u is: 117
UTF-8 code for char v is: 118
UTF-8 code for char w is: 119
UTF-8 code for char x is: 120
UTF-8 code for char y is: 121
UTF-8 code for char z is: 122


## Install dependencies

In [1]:
!pip3 install tokenizers==0.12.1 transformers==4.21.3 pandas==1.4.4 datasets==2.4.0 tensorflow==2.12
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://download.pytorch.org/whl/cu118, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
# Print the CUDA compiler version available on this machine, to compare with
# the cu118 index used for the PyTorch wheels.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


## Load in data

In [6]:
import pandas as pd

In [7]:
# The corpus file uses ';' as its field separator.
df_dutch = pd.read_csv("dutch.csv", sep=";")

In [8]:
# Display the loaded frame; pandas shows only the first and last rows of a
# long frame.
df_dutch

Unnamed: 0,labels
0,Spanje is met ingang van vandaag voorzitter va...
1,Vijf werknemers van het omstreden Amerikaanse ...
2,Het Oud en Nieuwfeest op het Museumplein in Am...
3,President Obama heeft de eerste rapporten gekr...
4,In de hele wereld is het nieuwe jaar feestelij...
...,...
9973,Een Chinese trein heeft het snelheidsrecord ge...
9974,Een universiteit in de Amerikaanse staat Texas...
9975,In Brussel demonstreren tienduizenden mensen v...
9976,De NS wil het papieren spoorboekje afschaffen....


## Split by space

In [9]:
# Text of the first row, first column. Purely positional indexing avoids the
# integer-key lookup on a label-indexed Series (deprecated in newer pandas)
# and keeps `sentence` a plain string from the start.
sentence = df_dutch.iloc[0, 0]

In [10]:
# Naive tokenisation: cut the text at every single space and show the first
# ten pieces. Punctuation stays glued to the neighbouring word.
sentence.split(" ")[:10]

['Spanje',
 'is',
 'met',
 'ingang',
 'van',
 'vandaag',
 'voorzitter',
 'van',
 'de',
 'EU.']

## Tokenizers 
### Byte pair encoding

In [11]:
from tokenizers import ByteLevelBPETokenizer

# Learn a byte-level BPE vocabulary directly from the raw corpus file.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="dutch.csv",
    vocab_size=50265,   # same value as RobertaConfig.vocab_size further down
    min_frequency=3,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [12]:
import os

# Create the output directory if needed; exist_ok=True makes the cell safe to
# re-run without a separate existence check.
os.makedirs("tokenizer", exist_ok=True)
# Write the trained vocabulary and merge rules; the returned file paths are
# shown as the cell output.
tokenizer.save_model("tokenizer")

['tokenizer/vocab.json', 'tokenizer/merges.txt']

In [6]:
from transformers import RobertaTokenizerFast

# Reload the saved vocabulary/merges as a transformers tokenizer.
# `model_max_length` is the documented name for the length limit (replaces the
# legacy `max_len` keyword).
# NOTE: this rebinds `tokenizer` from the tokenizers-library trainer object to
# a RobertaTokenizerFast; all later cells use the latter.
tokenizer = RobertaTokenizerFast.from_pretrained("tokenizer", model_max_length=512)

In [14]:
# Encode the first 100 characters of the example and show text and ids
# side by side.
snippet = sentence[:100]
tokens = tokenizer.encode(snippet)
print(snippet)
print(tokens)

Spanje is met ingang van vandaag voorzitter van de EU. De Zweedse premier Fredrik Reinfeldt heeft he
[0, 17121, 321, 351, 8590, 284, 880, 2079, 284, 268, 2025, 18, 328, 4668, 1070, 10555, 1137, 17786, 402, 425, 2]


In [15]:
# Map a hand-written list of token ids back to text.
token_ids = [0, 17124, 3065, 284, 293, 4452, 847, 44286, 42429, 26012, 672, 16733, 329, 605, 225, 2]
print(tokenizer.decode(token_ids))

<s>Vijf werknemers van het omstreden Amerikaanse beveilingingsbedrijf Blackwater gaan vrijuit voor hun </s>


## Configure RoBERTa model

In [16]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Architecture of the RoBERTa model that will be trained from scratch.
config = RobertaConfig(
    vocab_size=50265,              # same value as used to train the tokenizer
    max_position_embeddings=514,
    num_hidden_layers=6,
    num_attention_heads=12,
    type_vocab_size=1,
)

# Build the model from the configuration only, i.e. with freshly initialised
# (not pretrained) weights.
model = RobertaForMaskedLM(config=config)
print('Num parameters: ', model.num_parameters())

Num parameters:  82170201


In [17]:
from transformers import DataCollatorForLanguageModeling

# Collator that builds masked-language-modelling batches: tokens are randomly
# masked with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [18]:
# !pip install --upgrade accelerate

In [19]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, list them.
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    c = torch.cuda.device_count()
    print(f"CUDA is available!\nNumber of gpu's detected: {c}")
    print('Training on GPU ...\n\n')
    for i in range(c):
        print(f'Device name {i}== {torch.cuda.get_device_name(i)}')
else:
    print('CUDA is not available.  Training on CPU ...')

CUDA is available!
Number of gpu's detected: 1
Training on GPU ...


Device name 0== Tesla T4


In [20]:
def encode(batch):
    """Tokenize the text stored under ``batch['labels']``.

    Uses the notebook-level ``tokenizer``. Every sequence is truncated and
    padded to exactly 512 tokens, and the result is returned as PyTorch
    tensors.

    Args:
        batch: Mapping with a ``'labels'`` entry holding the text(s) to
            tokenize.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        batch['labels'],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

In [21]:
from datasets import load_dataset
# Load the same ';'-separated CSV as a Hugging Face dataset. With a single
# data file everything ends up in one 'train' split.
dataset = load_dataset("csv", data_files="dutch.csv", delimiter=";")



  0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Inspect the loaded DatasetDict: split names, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['labels'],
        num_rows: 9978
    })
})

In [23]:
# Shuffle, then carve the single 'train' split into 70% train / 30% test.
# train_test_split shuffles again by default, so it needs its own seed for the
# split to be identical on every run.
# NOTE: `dataset` is rebound in place; re-running this cell without reloading
# the data would split the already-reduced train set again.
dataset = dataset.shuffle(seed=42)
dataset = dataset["train"]
dataset = dataset.train_test_split(train_size=0.7, seed=42)



In [24]:
# Inspect the DatasetDict again: it now holds a 'train' and a 'test' split.
dataset

DatasetDict({
    train: Dataset({
        features: ['labels'],
        num_rows: 6984
    })
    test: Dataset({
        features: ['labels'],
        num_rows: 2994
    })
})

In [25]:
# Look at one raw training example: a dict with the article text under 'labels'.
dataset["train"][100]

{'labels': 'In de stad Osj in KirgizieÌˆ is geschoten tijdens een bijeenkomst van aanhangers van de verdreven president Bakijev. De schoten werden gelost toen Bakijev een speech hield. Hij brak zijn toespraak af en verliet de bijeenkomst. Wie er achter de schietpartij zit, is niet bekend. Bakijev raakte niet gewond. Volgens een Russisch persbureau zouden medewerkers van de veiligheidsdienst van het gevluchte staatshoofd in een reactie in de lucht hebben geschoten om Bakijev te beschermen. Bakijev vluchtte vorige week naar het zuiden van KirgizieÌˆ, na hevige onlusten in de hoofdstad Bisjkek. In het zuiden heeft hij nog veel aanhang. Hij wil praten met oppositieleider Otoenbajeva over een machtsoverdracht, maar eist daarbij veiligheidsgaranties voor zichzelf en zijn familie. Vorige week eiste Roza Otoenbajeva de macht in KirgizieÌˆ op. Ze zegt dat ze een interim-regering heeft gevormd. Zij vindt dat Bakijev schuldig is aan de doden die vorige week vielen bij de onlusten in KirgizieÌˆ. T

In [26]:
# Look at one raw example from the held-out test split.
dataset["test"][100]

{'labels': 'Tsjaad is niet van plan de Sudanese president Bashir op te pakken, die daar nu op bezoek is. Volgens de minister van Binnenlandse Zaken is Tsjaad niet verplicht om de van oorlogsmisdaden verdachte president te arresteren. Het Internationaal Strafhof in Den Haag heeft een arrestatiebevel uitgevaardigd tegen hem, onder meer voor genocide in Darfur. Volgens het hof is Tsjaad wel degelijk verplicht om Bashir aan te houden, omdat het lid is van het hof. Bashir is in Tsjaad voor een top van Sahara-landen. Hij werd bij aankomst op het vliegveld vriendelijk begroet door president DeÌ\x81by.'}

In [27]:
# Attach on-the-fly tokenization to each split. with_transform returns a new
# dataset object, so dataset["train"] / dataset["test"] keep returning raw
# text (set_transform would have modified them in place).
ds_train = dataset["train"].with_transform(encode)
ds_test = dataset["test"].with_transform(encode)

In [28]:
# Index into the training set with the encode transform attached: the example
# now comes back as tensors (e.g. 'input_ids') instead of raw text.
ds_train[20]

{'input_ids': tensor([    0,   792,  1740,   284,  2210,  1527,   321,   309,  3173,   444,
         10589,  1539,   289,  5311,    18,  6694, 10976,  2210,  1527,   468,
          2920,   351,  4052,  1539,    16,   464,   319,   886,   321,  3194,
          4066,    18,  1556,  1491,   479,   268,   981,  1083,   596,  4538,
         21966,   284,   524,   367,   605, 29871,   289,  5311,   369,   634,
          4009,     6,    16,   581,  1944,   408, 17535, 29543,   284,  2210,
          1527,    18,  1556,   792,   632,   356,  4052,   524,   367,   289,
          5311,  3167,   307,  6768,  4500,    18,   419,   321,   468,  2056,
           356,   289,   320, 15230,   574,   353,   559,   289,  7700,  4224,
         17459,    16,   651,   456,   507,   570,   607,   468,   566,  2920,
           351,   353,   634,   985,     6,    16,  1609, 29543,    18, 39508,
         19795,   419,  1740,  1749,   268,  1539,   566, 12769,    16,  1966,
           368,   340,   566,  5854,   

In [29]:
from transformers import TrainingArguments, Trainer

# Create the checkpoint directory, including the parent "models" directory,
# in one call; exist_ok=True keeps the cell re-runnable.
os.makedirs("models/roberta", exist_ok=True)

# NOTE(review): the training log stored in this notebook reports
# "Num Epochs = 1" and 218 optimization steps, which does not match
# num_train_epochs=100. With only 218 steps, eval_steps/save_steps=1024 would
# never trigger. Confirm the intended values.
training_args = TrainingArguments(
    output_dir='./models/roberta',
    overwrite_output_dir=True,
    evaluation_strategy = 'steps',
    num_train_epochs=100,
    learning_rate=1e-5,
    lr_scheduler_type="constant",
    weight_decay=0.01,
    # 8 samples per device x 4 accumulation steps = 32 samples per optimizer step
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # checkpoint and evaluate every 1024 optimizer steps, keeping 3 checkpoints
    save_steps=1024,
    eval_steps=1024,
    save_total_limit=3,
    ignore_data_skip=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True
)
# Create the trainer for our model. No tokenizer is passed in; masking is
# handled by the data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    #prediction_loss_only=True,
)

Using cuda_amp half precision backend


In [30]:
# Run the training loop configured above.
trainer.train()
# Write the final model (config + weights) to its own directory. The Trainer
# was built without a tokenizer, so the tokenizer files stay in ./tokenizer.
trainer.save_model("models/roberta/roberta_dutch")

***** Running training *****
  Num examples = 6984
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 218


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to models/roberta/roberta_dutch
Configuration saved in models/roberta/roberta_dutch/config.json
Model weights saved in models/roberta/roberta_dutch/pytorch_model.bin


In [4]:
from transformers import RobertaForMaskedLM
# Load the trained model back from the directory written by trainer.save_model,
# so the inference cells below can run without retraining.
model = RobertaForMaskedLM.from_pretrained("models/roberta/roberta_dutch")

In [7]:
from transformers import pipeline

# Wrap the model and tokenizer in a ready-to-use fill-mask inference pipeline.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [10]:
# Ask the model for its top candidates for the hidden word. "<mask>" is the
# mask token registered when the tokenizer was trained.
fill_mask("ik ga naar <mask> vandaag")

[{'score': 0.04218944162130356,
  'token': 18,
  'token_str': '.',
  'sequence': 'ik ga naar. vandaag'},
 {'score': 0.035313669592142105,
  'token': 268,
  'token_str': ' de',
  'sequence': 'ik ga naar de vandaag'},
 {'score': 0.010522267781198025,
  'token': 16,
  'token_str': ',',
  'sequence': 'ik ga naar, vandaag'},
 {'score': 0.00925430003553629,
  'token': 284,
  'token_str': ' van',
  'sequence': 'ik ga naar van vandaag'},
 {'score': 0.006261906586587429,
  'token': 293,
  'token_str': ' het',
  'sequence': 'ik ga naar het vandaag'}]