# Imports

In [1]:
import json
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os

In [13]:
# File paths
train_file = 'NER-TRAINING.jsonlines'
validation_file = 'NER-VALIDATION.jsonlines'
test_file = 'NER-TESTING.jsonlines'
predicted_file = 'ner-testing-predictions.jsonlines'
predicted_file_2 = 'ner-validation-predictions.jsonlines'

# Prepare the datasets

In [3]:
# Helper function to read JSON Lines file
def read_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

# Helper function to write JSON Lines file
def write_jsonl(file_path, data):
    with open(file_path, 'w') as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")

# Clean data by removing entries with only 'O' tags
def clean_data(dataset):
    return [entry for entry in dataset if any(tag != 'O' for tag in entry.get('ner_tags', []))]

# Extract features for each token
def extract_features(tokens):
    features = []
    for i, token in enumerate(tokens):
        token_features = {
            'token': token,
            'is_upper': token.isupper(),
            'is_title': token.istitle(),
            'is_digit': token.isdigit(),
            'prefix-1': token[:1],
            'prefix-2': token[:2],
            'suffix-1': token[-1:],
            'suffix-2': token[-2:],
            'length': len(token),
            'position': i,
        }
        features.append(token_features)
    return features

# Prepare training data
def prepare_training_data(dataset):
    X, y = [], []
    for entry in dataset:
        tokens = entry['tokens']
        ner_tags = entry['ner_tags']
        features = extract_features(tokens)
        X.extend(features)
        y.extend(ner_tags)
    return X, y

In [6]:
# Load datasets
train_data = read_jsonl(train_file)
validation_data = read_jsonl(validation_file)
test_data = read_jsonl(test_file)

# Clean training and validation data
train_data = clean_data(train_data)
validation_data = clean_data(validation_data)

# Prepare features and labels
X_train, y_train = prepare_training_data(train_data)
X_valid, y_valid = prepare_training_data(validation_data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

    B-Action       0.58      0.38      0.46       416
    B-Entity       0.55      0.56      0.56       923
  B-Modifier       0.52      0.36      0.43       280
    I-Action       0.43      0.24      0.31       110
    I-Entity       0.66      0.65      0.66      2907
  I-Modifier       0.00      0.00      0.00        22
           O       0.60      0.66      0.63      3046

    accuracy                           0.61      7704
   macro avg       0.48      0.41      0.43      7704
weighted avg       0.61      0.61      0.61      7704



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Train the model

In [None]:
# Build the pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression(max_iter=10000))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_valid = pipeline.predict(X_valid)
print(classification_report(y_valid, y_pred_valid))

# Make the predictions

In [17]:
# Predict for data of choice
def predict_for_data(dataset, pipeline):
    predictions = []
    for entry in dataset:
        tokens = entry['tokens']
        features = extract_features(tokens)
        predicted_tags = pipeline.predict(features)
        predictions.append({
            'unique_id': entry['unique_id'],
            'tokens': tokens,
            'ner_tags': predicted_tags.tolist()
        })
    return predictions

# PREDICTIONS ON TEST DATA
# predictions = predict_for_data(test_data, pipeline)
# write_jsonl(predicted_file, predictions) # write predictions to json file
# print(f"Predictions saved to {predicted_file}")

# PREDICTIONS ON VALIDATION DATA
validation_data_2 = read_jsonl(validation_file)
predictions = predict_for_data(validation_data_2, pipeline)
write_jsonl(predicted_file_2, predictions) # write predictions to json file
print(f"Predictions saved to {predicted_file_2}")

Predictions saved to ner-validation-predictions.jsonlines


# Evaluation scores

In [18]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from seqeval.scheme import IOB2
import numpy as np
import json

def pretty_print_dict(d, indent):
    res = ""
    for k, v in d.items():
        res += "\t"*indent + str(k) + "\n"
        if isinstance(v, dict):
            res += pretty_print_dict(v, indent+1)
        else:
            res += "\t"*(indent+1) + str(v) + "\n"
    print(res)
    return res

def compute_seqeval_jsonl(references_jsonl, predictions_jsonl, ref_col='ner_tags', pred_col='pred_ner_tags'):
    '''
    Computes the seqeval scores between two datasets loaded from jsonl (list of dicts with same keys).
    Sorts the datasets by 'unique_id' and verifies that the tokens match.
    '''
    # extract the tags and reverse the dict
    ref_dict = {k:[e[k] for e in references_jsonl] for k in references_jsonl[0].keys()}
    pred_dict = {k:[e[k] for e in predictions_jsonl] for k in predictions_jsonl[0].keys()}

    # sort by unique_id
    ref_idx = np.argsort(ref_dict['unique_id'])
    pred_idx = np.argsort(pred_dict['unique_id'])
    ref_ner_tags = np.array(ref_dict[ref_col], dtype=object)[ref_idx]
    pred_ner_tags = np.array(pred_dict[pred_col], dtype=object)[pred_idx]
    ref_tokens = np.array(ref_dict['tokens'], dtype=object)[ref_idx]
    pred_tokens = np.array(pred_dict['tokens'], dtype=object)[pred_idx]

    # check that tokens match
    #assert((ref_tokens==pred_tokens).all())


    # get report
    report = classification_report(y_true=ref_ner_tags, y_pred=pred_ner_tags,
                                   scheme=IOB2, output_dict=True,
                                  )

    # extract values we care about
    report.pop("macro avg")
    report.pop("weighted avg")
    overall_score = report.pop("micro avg")

    seqeval_results = {
        type_name: {
            "precision": score["precision"],
            "recall": score["recall"],
            "f1": score["f1-score"],
            "suport": score["support"],
        }
        for type_name, score in report.items()
    }
    seqeval_results["overall_precision"] = overall_score["precision"]
    seqeval_results["overall_recall"] = overall_score["recall"]
    seqeval_results["overall_f1"] = overall_score["f1-score"]
    seqeval_results["overall_accuracy"] = accuracy_score(y_true=ref_ner_tags, y_pred=pred_ner_tags)

    return(seqeval_results)


if __name__ == '__main__':

    # Pour les étudiants : indiquer le chemin vers le fichier NER-VALIDATION
    with open("NER-VALIDATION.jsonlines", 'r') as f:
        references_jsonl = [json.loads(l) for l in list(f)]

    # Pour les étudiants : indiquer ici le chemin vers votre fichier de prédiction sur le jeu de validation
    with open(predicted_file_2, 'r') as f:
        pred_jsonl = [json.loads(l) for l in list(f)]


    res = compute_seqeval_jsonl(references_jsonl, pred_jsonl, ref_col = 'ner_tags', pred_col='ner_tags')
    pretty_print_dict(res, 0)

	precision
		0.21887287024901703
	recall
		0.4014423076923077
	f1
		0.28329092451229854
	suport
		416

	precision
		0.0668328400062315
	recall
		0.4647887323943662
	f1
		0.11686189049305366
	suport
		923

	precision
		0.19186046511627908
	recall
		0.3535714285714286
	f1
		0.24874371859296485
	suport
		280

Action
	precision
		0.21887287024901703
	recall
		0.4014423076923077
	f1
		0.28329092451229854
	suport
		416
Entity
	precision
		0.0668328400062315
	recall
		0.4647887323943662
	f1
		0.11686189049305366
	suport
		923
Modifier
	precision
		0.19186046511627908
	recall
		0.3535714285714286
	f1
		0.24874371859296485
	suport
		280
overall_precision
	0.09028319043907508
overall_recall
	0.42927733168622606
overall_f1
	0.14918965332188472
overall_accuracy
	0.54665984735227

