In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoModelForTokenClassification, AutoTokenizer
import numpy as np
import time
import random
import functools

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
def read_conllu(file_path):
    df = pd.DataFrame(columns=["id",'tokens', 'ner_tags'])
    Pos_Tags = ''
    sentence = ''

    with open(file_path, 'r', encoding='utf-8') as file:
        k = 0
        for line in file:
            line = line.strip()
            if line == '':
                if (sentence != ''):
                    sentence = sentence.lstrip()
                    Pos_Tags = Pos_Tags.lstrip()
                    new_row = {"id": k,'tokens': sentence, 'ner_tags': Pos_Tags}
                    df = df.append(new_row, ignore_index=True)
                    k +=1
                    Pos_Tags = ''
                    sentence = ''
            elif line.startswith('#'):
                continue
            else:
                tokens = line.split('\t')
                word = tokens[1]
                POS_tag = tokens[3]
                if (POS_tag != "SYM" and POS_tag != "PART" and POS_tag != "_"):
                    sentence = sentence + " " + word
                    Pos_Tags = Pos_Tags + " " + POS_tag
    return df

In [4]:
train_df = read_conllu("data/UD_Italian-VIT-master/it_vit-ud-train.conllu")
train_df

Unnamed: 0,id,tokens,ner_tags
0,0,Le infrastrutture come fattore di competitivit...,DET NOUN ADP NOUN ADP NOUN ADP PROPN PROPN PUNCT
1,1,In gli ultimi anni la dinamica di i polo di at...,ADP DET ADJ NOUN DET NOUN ADP DET NOUN ADP NOU...
2,2,Il raggiungimento e il mantenimento di posizio...,DET NOUN CCONJ DET NOUN ADP NOUN ADJ AUX ADV A...
3,3,Quest' ultimo è funzione di variabili struttur...,DET ADJ AUX NOUN ADP NOUN ADJ ADP PRON DET NOU...
4,4,"Il contesto milanese , se da un lato è stato t...",DET NOUN ADJ PUNCT SCONJ ADP DET NOUN AUX AUX ...
...,...,...,...
8272,8272,Premio Elsa Morante .,NOUN PROPN PROPN PUNCT
8273,8273,È nato il premio Elsa Morante che verrà assegn...,AUX VERB DET NOUN PROPN PROPN PRON AUX VERB DE...
8274,8274,Questo Premio che non avrà sede fissa né statu...,DET PROPN PRON ADV VERB NOUN ADJ ADV NOUN ADV ...
8275,8275,sono promotori di l' iniziativa Patrizia Caval...,AUX NOUN ADP DET NOUN PROPN PROPN PUNCT PROPN ...


In [5]:
test_df = read_conllu("data/UD_Italian-VIT-master/it_vit-ud-test.conllu")
test_df

Unnamed: 0,id,tokens,ner_tags
0,0,Non sono consentite assegnazioni provvisorie i...,ADV AUX VERB NOUN ADJ ADP DET NOUN ADP DET NOU...
1,1,È consentita inoltre la partecipazione provvis...,AUX VERB ADV DET NOUN ADJ ADV ADP DET ADJ NOUN...
2,2,I predetti motivi devono costituire oggetto di...,DET ADJ NOUN AUX VERB NOUN ADP NOUN PUNCT PRON...
3,3,In caso di ricongiungimento a il familiare des...,ADP NOUN ADP NOUN ADP DET NOUN VERB ADP ADJ NO...
4,4,A i fini di la possibilità di presentazione di...,ADP DET NOUN ADP DET NOUN ADP NOUN ADP DET NOU...
...,...,...,...
1062,1062,Scrooge era il suo unico esecutore testamentar...,PROPN AUX DET DET ADJ NOUN ADJ PUNCT ADJ NOUN ...
1063,1063,"Anzi il nostro Scrooge , che per verità il tri...",CCONJ DET DET PROPN PUNCT PRON ADP NOUN DET AD...
1064,1064,Il ricordo di i funerali mi fa tornare a il pu...,DET NOUN ADP DET NOUN PRON VERB VERB ADP DET N...
1065,1065,Non c' è dunque dubbio che Marley era morto .,ADV PRON AUX ADV NOUN SCONJ PROPN AUX VERB PUNCT


In [6]:
dev_df = read_conllu("data/UD_Italian-VIT-master/it_vit-ud-dev.conllu")
dev_df

Unnamed: 0,id,tokens,ner_tags
0,0,"Ha l' acqua calda , più o meno si veste .",VERB DET NOUN ADJ PUNCT ADV CCONJ ADV PRON VER...
1,1,malgrado le guerre e i disastri naturali e pol...,ADP DET NOUN CCONJ DET NOUN ADJ CCONJ ADJ PUNC...
2,2,È come un' energia che sta crescendo complessi...,AUX ADP NUM NOUN PRON AUX VERB ADV PUNCT PUNCT
3,3,"L' onorevole Charles Rose , deputato democrati...",DET NOUN PROPN PROPN PUNCT NOUN ADJ ADP DET PR...
4,4,"Da qualche tempo , la sua espressione preferit...",ADP DET ADV PUNCT DET DET NOUN ADJ AUX VERB PUNCT
...,...,...,...
738,738,Le gravi esigenze di salute di l' aspirante a ...,DET ADJ NOUN ADP NOUN ADP DET NOUN ADP DET NOU...
739,739,"Possono chiedere l' assegnazione provvisoria ,...",AUX VERB DET NOUN ADJ PUNCT ADV ADP DET NOUN A...
740,740,La relativa domanda va formulata contestualmen...,DET ADJ NOUN AUX VERB ADV ADP PRON ADP NOUN PU...
741,741,Possono partecipare a il movimento di le asseg...,AUX VERB ADP DET NOUN ADP DET NOUN ADJ ADV DET...


In [7]:
all_pos_tags = [tags.split() for tags in train_df['ner_tags']]
flat_pos_tags = [tag for sublist in all_pos_tags for tag in sublist]
unique_pos_tags = set(flat_pos_tags)
unique_pos_tags = list(unique_pos_tags)
print(unique_pos_tags)

['CCONJ', 'NUM', 'INTJ', 'X', 'PUNCT', 'PROPN', 'SCONJ', 'ADJ', 'DET', 'AUX', 'ADP', 'PRON', 'NOUN', 'VERB', 'ADV']


In [8]:
pos_mapping_dict = {'X':0, 'NUM':1, 'AUX':2, 'PRON':3, 'ADP':4, 'ADJ':5,
                    'VERB':6, 'CCONJ':7, 'PUNCT':8, 'SCONJ':9, 'ADV':10, 'INTJ':11, 'PROPN':12, 'DET':13, 'NOUN':14}
def map_pos_to_category(pos_tags):
    return [pos_mapping_dict[tag] for tag in pos_tags]

In [9]:
train_df['tokens'] = train_df['tokens'].str.split()
test_df['tokens'] = test_df['tokens'].str.split()
dev_df['tokens'] = dev_df['tokens'].str.split()
train_df['ner_tags'] = train_df['ner_tags'].str.split()
test_df['ner_tags'] = test_df['ner_tags'].str.split()
dev_df['ner_tags'] = dev_df['ner_tags'].str.split()
train_df['ner_tags'] = train_df['ner_tags'].apply(map_pos_to_category)
test_df['ner_tags'] = test_df['ner_tags'].apply(map_pos_to_category)
dev_df['ner_tags'] = dev_df['ner_tags'].apply(map_pos_to_category)
train_df.head(5)

Unnamed: 0,id,tokens,ner_tags
0,0,"[Le, infrastrutture, come, fattore, di, compet...","[13, 14, 4, 14, 4, 14, 4, 12, 12, 8]"
1,1,"[In, gli, ultimi, anni, la, dinamica, di, i, p...","[4, 13, 5, 14, 13, 14, 4, 13, 14, 4, 14, 2, 2,..."
2,2,"[Il, raggiungimento, e, il, mantenimento, di, ...","[13, 14, 7, 13, 14, 4, 14, 5, 2, 10, 10, 13, 1..."
3,3,"[Quest', ultimo, è, funzione, di, variabili, s...","[13, 5, 2, 14, 4, 14, 5, 4, 3, 13, 14, 8, 13, ..."
4,4,"[Il, contesto, milanese, ,, se, da, un, lato, ...","[13, 14, 5, 8, 9, 4, 13, 14, 2, 2, 14, 5, 4, 1..."


In [10]:
from datasets import Dataset, DatasetDict

# Assuming train_df, test_df, and dev_df are your dataframes
# Replace the column names if they are different in your dataframes

train_dataset = Dataset.from_pandas(train_df[['id', 'tokens', 'ner_tags']])
validation_dataset = Dataset.from_pandas(dev_df[['id', 'tokens', 'ner_tags']])
test_dataset = Dataset.from_pandas(test_df[['id', 'tokens', 'ner_tags']])

# Create DatasetDict
POS_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

# Print dataset_dict information
print(POS_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 8277
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 743
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1067
    })
})


In [11]:
model_name = "osiria/distilbert-base-italian-cased"

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
tokenized_input = tokenizer(POS_dict["train"][0]["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(POS_dict["train"][0]["tokens"])
print(tokens)

['Le', 'infrastrutture', 'come', 'fattore', 'di', 'competitività', 'di', 'Angela', 'Airoldi', '.']
['[CLS]', 'Le', 'in', '##fra', '##stru', '##ttu', '##re', 'come', 'fatto', '##re', 'di', 'com', '##pet', '##iti', '##vità', 'di', 'Angela', 'Air', '##old', '##i', '.', '[SEP]']


In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [15]:
tokenized_POS = POS_dict.map(tokenize_and_align_labels, batched=True)
tokenized_train_dataset = POS_dict['train'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/8277 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/1067 [00:00<?, ? examples/s]

Map:   0%|          | 0/8277 [00:00<?, ? examples/s]

In [16]:
tokenized_train_dataset[1]

{'id': 1,
 'tokens': ['In',
  'gli',
  'ultimi',
  'anni',
  'la',
  'dinamica',
  'di',
  'i',
  'polo',
  'di',
  'attrazione',
  'è',
  'stata',
  'sempre',
  'più',
  'caratterizzata',
  'da',
  "l'",
  'emergere',
  'di',
  'una',
  'crescente',
  'concorrenza',
  'che',
  'si',
  'è',
  'progressivamente',
  'spostata',
  'da',
  'le',
  'singole',
  'imprese',
  'a',
  'i',
  'sistemi',
  'economici',
  'e',
  'territoriali',
  ',',
  'determinando',
  "l'",
  'esigenza',
  'di',
  'una',
  'riconsiderazione',
  'di',
  'i',
  'rapporti',
  'esistenti',
  'tra',
  'soggetti',
  'produttivi',
  'e',
  'ambiente',
  'in',
  'cui',
  'questi',
  'operano',
  '.'],
 'ner_tags': [4,
  13,
  5,
  14,
  13,
  14,
  4,
  13,
  14,
  4,
  14,
  2,
  2,
  10,
  10,
  6,
  4,
  13,
  14,
  4,
  13,
  5,
  14,
  3,
  3,
  2,
  10,
  6,
  4,
  13,
  5,
  14,
  4,
  13,
  14,
  5,
  7,
  5,
  8,
  6,
  13,
  14,
  4,
  13,
  14,
  4,
  13,
  14,
  6,
  4,
  14,
  5,
  7,
  14,
  4,
  3,
  3,


In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)




In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(unique_pos_tags))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at osiria/distilbert-base-italian-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import optuna
import datetime

def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 48, 64])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    training_args = TrainingArguments(
        output_dir = f'./results_{timestamp}',
        num_train_epochs=5,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size*2,
        weight_decay=weight_decay,
        logging_dir='./logs',
        warmup_steps=500, 
        eval_steps=60 - (per_device_train_batch_size/8)*5,
        save_steps=60 - (per_device_train_batch_size/8)*5,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        save_total_limit=3
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_POS["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.005)]
    )

    trainer.train()
    
    # You can customize the metric for optimization based on your task
    # For example, if it's a classification task, you might use accuracy
    return trainer.evaluate()["eval_loss"]

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

best_params = study.best_params
print("Best hyperparameters:", best_params)

# Now, you can use the best hyperparameters to train your model with the full dataset


[I 2024-03-11 19:03:15,424] A new study created in memory with name: no-name-f873d595-c3af-4d86-ba3c-bc00c28fc1f0


Step,Training Loss,Validation Loss
40,No log,2.70816
80,No log,2.484555
120,No log,2.030563
160,No log,1.286876
200,No log,0.684267
240,No log,0.362242
280,No log,0.242601
320,No log,0.192182
360,No log,0.159505
400,No log,0.1473


[I 2024-03-11 19:42:13,759] Trial 0 finished with value: 0.09722351282835007 and parameters: {'learning_rate': 1.0249026227705883e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.01973660016800799}. Best is trial 0 with value: 0.09722351282835007.


Step,Training Loss,Validation Loss
20,No log,0.095292
40,No log,0.095791
60,No log,0.094479
80,No log,0.095141


[I 2024-03-11 19:53:40,374] Trial 1 finished with value: 0.09447883069515228 and parameters: {'learning_rate': 2.729321627524579e-05, 'per_device_train_batch_size': 64, 'weight_decay': 0.141197335916129}. Best is trial 1 with value: 0.09447883069515228.


Step,Training Loss,Validation Loss
30,No log,0.094904
60,No log,0.09493
90,No log,0.093868
120,No log,0.09346


[I 2024-03-11 20:05:07,844] Trial 2 finished with value: 0.0934603288769722 and parameters: {'learning_rate': 1.059289101998322e-05, 'per_device_train_batch_size': 48, 'weight_decay': 0.1465102967520619}. Best is trial 2 with value: 0.0934603288769722.


Step,Training Loss,Validation Loss
30,No log,0.093533
60,No log,0.093593
90,No log,0.092752
120,No log,0.091836


[I 2024-03-11 20:16:35,753] Trial 3 finished with value: 0.09183637797832489 and parameters: {'learning_rate': 1.7751920422433952e-05, 'per_device_train_batch_size': 48, 'weight_decay': 0.0013723243288475072}. Best is trial 3 with value: 0.09183637797832489.


Step,Training Loss,Validation Loss
30,No log,0.093002
60,No log,0.091731
90,No log,0.091839
120,No log,0.090699


[I 2024-03-11 20:28:05,467] Trial 4 finished with value: 0.09069878607988358 and parameters: {'learning_rate': 3.984465558393281e-05, 'per_device_train_batch_size': 48, 'weight_decay': 0.07262490008133415}. Best is trial 4 with value: 0.09069878607988358.


Best hyperparameters: {'learning_rate': 3.984465558393281e-05, 'per_device_train_batch_size': 48, 'weight_decay': 0.07262490008133415}


In [None]:
best_params = {'learning_rate': 3.984465558393281e-05, 'per_device_train_batch_size': 48, 'weight_decay': 0.07262490008133415}
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],   
    per_device_eval_batch_size=best_params['per_device_train_batch_size']*2,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs',
    warmup_steps=500, 
    eval_steps=60 - (best_params['per_device_train_batch_size']/8)*5,
    save_steps=60 - (best_params['per_device_train_batch_size']/8)*5,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    save_total_limit = 3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_POS["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.005)]
)

trainer.train()

In [20]:
trainer.evaluate()

{'eval_loss': 0.09409265220165253}

In [30]:
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
def inverse_dictionary(original_dict):
    inverted_dict = {}
    for key, value in original_dict.items():
        inverted_dict[value] = key
    return inverted_dict
pos_mapping_dict = inverse_dictionary(pos_mapping_dict)

def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, truncation=True, return_tensors="pt")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0][0].softmax(1)
    # executing argmax function to get the candidate tags
    tokens_n_tags = [(tokenizer.decode(inputs['input_ids'][0][i].item()), pos_mapping_dict[tag_id.item()]) 
                  for i, tag_id in enumerate (probs.argmax(axis=1))]

    return pd.DataFrame(tokens_n_tags, columns=['token', 'tag'])

In [33]:
# Example #1
text1 = """
La dolce vita è un film classico italiano che ha lasciato un'impronta indelebile nella storia del cinema ."""

get_prediction(text1).iloc[1:-1]

Unnamed: 0,token,tag
1,La,DET
2,dolce,ADJ
3,vita,NOUN
4,è,AUX
5,un,DET
6,film,NOUN
7,classico,ADJ
8,italiano,ADJ
9,che,PRON
10,ha,AUX


In [66]:
# Example #2
text2 = """
La primavera è la mia stagione preferita perché tutto sboccia e diventa colorato ."""

get_prediction(text2).iloc[1:-1]

Unnamed: 0,token,tag
1,La,DET
2,primavera,NOUN
3,è,AUX
4,la,DET
5,mia,DET
6,stagione,NOUN
7,pre,VERB
8,##ferita,VERB
9,perché,SCONJ
10,tutto,PRON


In [67]:
# Example #3
text3 = """
Oggi ho imparato una nuova ricetta e ho cucinato una deliziosa cena per la mia famiglia ."""

get_prediction(text3).iloc[1:-1]

Unnamed: 0,token,tag
1,Oggi,ADV
2,ho,AUX
3,im,VERB
4,##para,VERB
5,##to,VERB
6,una,DET
7,nuova,ADJ
8,rice,NOUN
9,##tta,NOUN
10,e,CCONJ


In [68]:
# Example #4
text4 = """
Ho trascorso le vacanze estive in una piccola città costiera piena di tradizione ."""

get_prediction(text4).iloc[1:-1]

Unnamed: 0,token,tag
1,Ho,AUX
2,tras,VERB
3,##corso,VERB
4,le,DET
5,va,NOUN
6,##can,NOUN
7,##ze,NOUN
8,esti,ADJ
9,##ve,ADJ
10,in,ADP


In [69]:
# Example #5
text5 = """
La cattedrale di Milano è un capolavoro architettonico che lascia senza fiato ."""

get_prediction(text5).iloc[1:-1]

Unnamed: 0,token,tag
1,La,DET
2,cattedrale,NOUN
3,di,ADP
4,Milano,PROPN
5,è,AUX
6,un,DET
7,capo,NOUN
8,##lavo,NOUN
9,##ro,NOUN
10,architetto,ADJ


In [70]:
# Example #6
text6 = """
Le montagne delle Dolomiti offrono paesaggi mozzafiato e sono ideali per gli amanti del trekking ."""

get_prediction(text6).iloc[1:-1]

Unnamed: 0,token,tag
1,Le,DET
2,montagne,NOUN
3,delle,DET
4,Dol,PROPN
5,##omi,PROPN
6,##ti,PROPN
7,off,VERB
8,##rono,VERB
9,pa,NOUN
10,##esa,NOUN
