# Imports

In [None]:
!pip install datasets
!pip install seqeval

In [2]:
import json
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from seqeval.scheme import IOB2
import numpy as np

# Loading the dataset

In [3]:
# File paths
train_file = 'NER-TRAINING.jsonlines'
validation_file = 'NER-VALIDATION.jsonlines'
test_file = 'NER-TESTING.jsonlines'
predicted_file_test = 'ner-testing-predictions.jsonlines'
predicted_file_valid = 'ner-validation-predictions.jsonlines'

In [4]:
# Function to read JSON Lines file
def read_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

# Function to write JSON Lines file
def write_jsonl(file_path, data):
    with open(file_path, 'w') as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")

In [56]:
# Load datasets
train_data_orig = read_jsonl(train_file)
valid_data_orig = read_jsonl(validation_file)
test_data_orig = read_jsonl(test_file)

print("Size of train data :",len(train_data_orig))
print("Size of validation data :",len(valid_data_orig))

Size of train data : 4876
Size of validation data : 1044


# Preparing the data

## Cleaning datasets

In [31]:
# Clean data by removing entries with only 'O' tags
def clean_data(dataset):
    return [entry for entry in dataset if any(tag != 'O' for tag in entry.get('ner_tags', []))]

# Clean data by removing entries with 50% or more 'O' tags
def clean_data2(dataset):
    cleaned_dataset = []
    for entry in dataset:
        ner_tags = entry.get('ner_tags', [])
        if ner_tags:
            o_count = ner_tags.count('O')
            o_ratio = o_count / len(ner_tags)
            if o_ratio < 0.6:
                cleaned_dataset.append(entry)
    return cleaned_dataset

In [57]:
# Clean training and validation data
train_data = clean_data(train_data_orig)
valid_data = clean_data(valid_data_orig)

print("Size of train data after cleaning :",len(train_data))
print("Size of validation data after cleaning :",len(valid_data))

Size of train data after cleaning : 1352
Size of validation data after cleaning : 282


In [33]:
# Vizualisation
entry = train_data[0] # returns a dict object
print(entry)
print(entry['ner_tags'])

{'unique_id': 2600, 'tokens': ['Then', 'it', 'writes', 'the', 'file', 'to', 'disk', 'with', 'CreateFileA', 'and', 'WriteFile', 'functions', '.'], 'ner_tags': ['O', 'B-Entity', 'B-Action', 'B-Entity', 'I-Entity', 'B-Modifier', 'B-Entity', 'B-Modifier', 'B-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'O']}
['O', 'B-Entity', 'B-Action', 'B-Entity', 'I-Entity', 'B-Modifier', 'B-Entity', 'B-Modifier', 'B-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'O']


## Encoding labels

In [9]:
label_to_id = {
    "O": 0,
    "B-Entity": 1,
    "I-Entity": 2,
    "B-Action": 3,
    "I-Action": 4,
    "B-Modifier": 5,
    "I-Modifier": 6}

In [10]:
def encode_labels(data):
    encoded_data = []
    for entry in data:
        tokens = entry["tokens"]  # Second item is the list of tokens
        ner_tags = entry["ner_tags"]  # Third item is the list of NER tags
        encoded_tags = [label_to_id[tag] for tag in ner_tags]  # Convert tags to IDs
        encoded_data.append({"tokens": tokens, "ner_tags": encoded_tags})
    return encoded_data

In [58]:
train_data = encode_labels(train_data)
valid_data = encode_labels(valid_data)

In [35]:
# Vizualisation
print(entry)
print(entry['ner_tags'])

{'unique_id': 2600, 'tokens': ['Then', 'it', 'writes', 'the', 'file', 'to', 'disk', 'with', 'CreateFileA', 'and', 'WriteFile', 'functions', '.'], 'ner_tags': ['O', 'B-Entity', 'B-Action', 'B-Entity', 'I-Entity', 'B-Modifier', 'B-Entity', 'B-Modifier', 'B-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'O']}
['O', 'B-Entity', 'B-Action', 'B-Entity', 'I-Entity', 'B-Modifier', 'B-Entity', 'B-Modifier', 'B-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'O']


## Tokenizing labels

In [13]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [14]:
# Vizualisation
input = tokenizer(entry["tokens"], is_split_into_words=True)
print(input.tokens())

['[CLS]', 'In', '2008', ',', 'Tom', 'Don', '##ahu', '##e', ',', 'a', 'senior', 'Central', 'Intelligence', 'Agency', '(', 'CIA', ')', 'official', 'told', 'a', 'meeting', 'of', 'utility', 'company', 'representatives', 'that', 'c', '##y', '##ber', '##att', '##ack', '##s', 'had', 'taken', 'out', 'power', 'equipment', 'in', 'multiple', 'cities', 'outside', 'the', 'United', 'States', '.', '[SEP]']


In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [59]:
# Convert lists to Dataset objects
train_data = Dataset.from_dict({"tokens": [x["tokens"] for x in train_data], "ner_tags": [x["ner_tags"] for x in train_data]})
valid_data = Dataset.from_dict({"tokens": [x["tokens"] for x in valid_data], "ner_tags": [x["ner_tags"] for x in valid_data]})
test_data = Dataset.from_dict({"tokens": [x["tokens"] for x in test_data_orig]})

In [60]:
tokenized_train = train_data.map(
    tokenize_and_align_labels,
    batched=True)

tokenized_valid = valid_data.map(
    tokenize_and_align_labels,
    batched=True)

Map:   0%|          | 0/1352 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

In [18]:
# Vizualisation
print(tokenized_train)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1352
})


In [61]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [62]:
id2label = {v: k for k, v in label_to_id.items()}  # Inverting the label_to_id mapping
label2id = label_to_id  # Label-to-ID mapping already exists

print("Label to ID:", label2id)
print("ID to Label:", id2label)

Label to ID: {'O': 0, 'B-Entity': 1, 'I-Entity': 2, 'B-Action': 3, 'I-Action': 4, 'B-Modifier': 5, 'I-Modifier': 6}
ID to Label: {0: 'O', 1: 'B-Entity', 2: 'I-Entity', 3: 'B-Action', 4: 'I-Action', 5: 'B-Modifier', 6: 'I-Modifier'}


# Training the model

In [49]:
# Initialize the model
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    id2label=id2label,
    label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    # learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,
    # num_train_epochs=3,
    # weight_decay=0.01,
    report_to=[])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.3479


TrainOutput(global_step=507, training_loss=0.34504585611749683, metrics={'train_runtime': 79.7479, 'train_samples_per_second': 50.86, 'train_steps_per_second': 6.358, 'total_flos': 145552955643216.0, 'train_loss': 0.34504585611749683, 'epoch': 3.0})

In [64]:
trainer.save_model("./saved_model")

# Making predictions

In [67]:
# Using the model's checkpoint previously made
model_checkpoint = "./saved_model"

token_classifier = pipeline("token-classification", model=model_checkpoint)

def predict_for_data(dataset):
    predictions = []

    for entry in dataset:
        # Construct the phrase from tokens
        phrase = " ".join(entry['tokens'])

        # Use the classifier to predict tags for the phrase
        classification_results = token_classifier(phrase)

        # Initialize predicted tags as 'O' for every token
        predicted_tags = ["O"] * len(entry['tokens'])

        # Character-level tracking
        current_char_pos = 0

        for idx, token in enumerate(entry['tokens']):
            # Find the start and end of the token in the phrase
            token_start = current_char_pos
            token_end = token_start + len(token)

            # Check if any prediction matches this token's span
            for prediction in classification_results:
                if prediction['start'] >= token_start and prediction['end'] <= token_end:
                    # Assign the predicted entity to the token
                    predicted_tags[idx] = prediction['entity']
                    break

            # Update the character position (the space after the token)
            current_char_pos = token_end + 1

        # Append the result
        predictions.append({
            'unique_id': entry['unique_id'],
            'tokens': entry['tokens'],
            'ner_tags': predicted_tags})

    return predictions

# PREDICTIONS ON TESTING DATA
predictions = predict_for_data(test_data_orig)
write_jsonl(predicted_file_test, predictions) # write predictions on jsonl file
print(f"Predictions saved to {predicted_file_test}")

# PREDICTIONS ON VALIDATION DATA
predictions = predict_for_data(valid_data_orig)
write_jsonl(predicted_file_valid, predictions)
print(f"Predictions saved to {predicted_file_valid}")

Device set to use cuda:0


Predictions saved to ner-testing-predictions.jsonlines
Predictions saved to ner-validation-predictions.jsonlines


# Verifying the score

In [66]:
def pretty_print_dict(d, indent):
    res = ""
    for k, v in d.items():
        res += "\t"*indent + str(k) + "\n"
        if isinstance(v, dict):
            res += pretty_print_dict(v, indent+1)
        else:
            res += "\t"*(indent+1) + str(v) + "\n"
    print(res)
    return res

def compute_seqeval_jsonl(references_jsonl, predictions_jsonl, ref_col='ner_tags', pred_col='pred_ner_tags'):
    '''
    Computes the seqeval scores between two datasets loaded from jsonl (list of dicts with same keys).
    Sorts the datasets by 'unique_id' and verifies that the tokens match.
    '''
    # extract the tags and reverse the dict
    ref_dict = {k:[e[k] for e in references_jsonl] for k in references_jsonl[0].keys()}
    pred_dict = {k:[e[k] for e in predictions_jsonl] for k in predictions_jsonl[0].keys()}

    # sort by unique_id
    ref_idx = np.argsort(ref_dict['unique_id'])
    pred_idx = np.argsort(pred_dict['unique_id'])
    ref_ner_tags = np.array(ref_dict[ref_col], dtype=object)[ref_idx]
    pred_ner_tags = np.array(pred_dict[pred_col], dtype=object)[pred_idx]
    ref_tokens = np.array(ref_dict['tokens'], dtype=object)[ref_idx]
    pred_tokens = np.array(pred_dict['tokens'], dtype=object)[pred_idx]

    # check that tokens match
    #assert((ref_tokens==pred_tokens).all())


    # get report
    report = classification_report(y_true=ref_ner_tags, y_pred=pred_ner_tags,
                                   scheme=IOB2, output_dict=True,
                                  )

    # extract values we care about
    report.pop("macro avg")
    report.pop("weighted avg")
    overall_score = report.pop("micro avg")

    seqeval_results = {
        type_name: {
            "precision": score["precision"],
            "recall": score["recall"],
            "f1": score["f1-score"],
            "suport": score["support"],
        }
        for type_name, score in report.items()
    }
    seqeval_results["overall_precision"] = overall_score["precision"]
    seqeval_results["overall_recall"] = overall_score["recall"]
    seqeval_results["overall_f1"] = overall_score["f1-score"]
    seqeval_results["overall_accuracy"] = accuracy_score(y_true=ref_ner_tags, y_pred=pred_ner_tags)

    return(seqeval_results)


if __name__ == '__main__':

    # Pour les étudiants : indiquer le chemin vers le fichier NER-VALIDATION
    with open("NER-VALIDATION.jsonlines", 'r') as f:
        references_jsonl = [json.loads(l) for l in list(f)]

    # Pour les étudiants : indiquer ici le chemin vers votre fichier de prédiction sur le jeu de validation
    with open(predicted_file_valid, 'r') as f:
        pred_jsonl = [json.loads(l) for l in list(f)]


    res = compute_seqeval_jsonl(references_jsonl, pred_jsonl, ref_col = 'ner_tags', pred_col='ner_tags')
    pretty_print_dict(res, 0)

	precision
		0.2672772689425479
	recall
		0.7716346153846154
	f1
		0.3970315398886828
	suport
		416

	precision
		0.2049612403100775
	recall
		0.7161430119176598
	f1
		0.31870781099324974
	suport
		923

	precision
		0.30612244897959184
	recall
		0.75
	f1
		0.43478260869565216
	suport
		280

Action
	precision
		0.2672772689425479
	recall
		0.7716346153846154
	f1
		0.3970315398886828
	suport
		416
Entity
	precision
		0.2049612403100775
	recall
		0.7161430119176598
	f1
		0.31870781099324974
	suport
		923
Modifier
	precision
		0.30612244897959184
	recall
		0.75
	f1
		0.43478260869565216
	suport
		280
overall_precision
	0.23317683881064163
overall_recall
	0.7362569487337863
overall_f1
	0.3541821423265488
overall_accuracy
	0.6082697301125187

