In [1]:
'''
WNUT 17: Emerging and Rare entity recognition

This shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions. Named entities form the basis of many modern approaches to other tasks (like event clustering and summarisation), but recall on them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms. Take for example the tweet “so.. kktny in 30 mins?” - even human experts find entity kktny hard to detect and resolve. This task will evaluate the ability to detect and classify novel, emerging, singleton named entities in noisy text.

The goal of this task is to provide a definition of emerging and of rare entities, and based on that, also datasets for detecting these entities.


'''

'\nWNUT 17: Emerging and Rare entity recognition\n\nThis shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions. Named entities form the basis of many modern approaches to other tasks (like event clustering and summarisation), but recall on them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms. Take for example the tweet “so.. kktny in 30 mins?” - even human experts find entity kktny hard to detect and resolve. This task will evaluate the ability to detect and classify novel, emerging, singleton named entities in noisy text.\n\nThe goal of this task is to provide a definition of emerging and of rare entities, and based on that, also datasets for detecting these entities.\n\n\n'

In [2]:
# https://huggingface.co/datasets/wnut_17   (Dataset Link)

In [3]:
!pip install accelerate>=0.20.1

In [4]:
!pip install transformers[torch]



In [5]:
!pip install datasets



In [6]:
!pip install -q evaluate seqeval

In [7]:
from datasets import load_dataset
# Load the dataset by its hub name "wnut_17" (the dataset link is given in an earlier cell).
wnut = load_dataset("wnut_17")
# Bare expression: show the loaded object (its splits, features and row counts) as the cell output.
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [8]:
print(wnut["train"][0])

{'id': '0', 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]}


In [9]:
# Names of the NER classes, positioned by their integer tag id (index 0 is "O").
# The key is a plain string: the former f-string prefix had nothing to interpolate.
label_list = wnut["train"].features["ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [10]:
from transformers import AutoTokenizer
# Tokenizer for the "distilbert-base-uncased" checkpoint; the model loaded later in this
# notebook uses the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [11]:
# Tokenize the first training example to inspect what the tokenizer does to it.
example = wnut["train"][0]
# example["tokens"] is already a list of words, hence is_split_into_words.
tokenized_input = tokenizer(example["tokens"], is_split_into_words = True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# The printed list is longer than the word list: '[CLS]'/'[SEP]' are added and some
# words are split into several pieces (e.g. '@paulwalk' -> '@', 'paul', '##walk').
print(tokens)

['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']


In [12]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word tags to the tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of integer tag ids per sentence, one id per word).

    Returns:
        The object returned by the module-level `tokenizer`, with an extra "labels"
        entry: one list per sentence holding the word's tag id on the first token
        of each word and -100 on every other position.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=row):
            # Positions without a word (None) and repeated word ids (continuation
            # pieces of the same word) get the -100 sentinel, which the metric
            # functions in this notebook filter out.
            if current_word is None or current_word == last_word:
                row_labels.append(-100)
            else:
                row_labels.append(word_tags[current_word])
            last_word = current_word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [13]:
# Run tokenize_and_align_labels over the dataset in batches; the result is indexed by
# split name further down (e.g. tokenized_wnut["train"]).
tokenized_wnut = wnut.map( tokenize_and_align_labels, batched = True)

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

In [14]:
## padding
# The collator built here is handed to Trainer as `data_collator`.
# NOTE(review): presumably it pads each batch's inputs and labels to a common length —
# confirm in the transformers documentation.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [15]:
#!pip install -q evaluate seqeval

In [16]:
import evaluate
# Load the "seqeval" metric object; the seqeval-based compute_metrics defined in the
# next cell calls seqeval.compute(predictions=..., references=...).
seqeval = evaluate.load("seqeval")

In [17]:
import numpy as np

# Tag names of the sample sentence held in `example`, looked up from its integer tag ids.
# The key is a plain string: the former f-string prefix had nothing to interpolate.
labels = [label_list[i] for i in example["ner_tags"]]

def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled -100.

    NOTE: a later cell in this notebook defines another `compute_metrics`; once that
    cell runs, it replaces this definition in the kernel namespace.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-class scores with the
            class axis last (argmax is taken over axis 2), `labels` holds integer tag
            ids with -100 on positions to skip.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label, then map ids to tag names.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [18]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(pred):
    """Token-level accuracy and weighted precision/recall/F1 over labelled positions.

    This definition replaces the seqeval-based `compute_metrics` from the previous
    cell. Only positions whose label is not -100 are scored: the label-alignment step
    writes -100 on positions that carry no word label, and those must not be counted
    as a class. The masked arrays are 1-D, which is the shape the scikit-learn metric
    functions expect for a multiclass problem.

    Args:
        pred: object exposing `label_ids` (integer tag ids, -100 on skipped positions)
            and `predictions` (per-class scores with the class axis last).

    Returns:
        dict with "accuracy", "precision", "recall" and "f1".
    """
    labels = np.asarray(pred.label_ids)
    preds = np.asarray(pred.predictions).argmax(-1)

    # Boolean-mask indexing drops the -100 positions and flattens to 1-D in one step.
    scored = labels != -100
    labels = labels[scored]
    preds = preds[scored]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [19]:
# Tag names listed in class-id order (the same order as the dataset's tag names shown
# earlier in the notebook); enumerate() assigns ids 0..12.
id2label = dict(enumerate([
    "O",
    "B-corporation", "I-corporation",
    "B-creative-work", "I-creative-work",
    "B-group", "I-group",
    "B-location", "I-location",
    "B-person", "I-person",
    "B-product", "I-product",
]))
# Reverse mapping, derived so the two dictionaries cannot drift apart.
label2id = {tag: idx for idx, tag in id2label.items()}

In [20]:
from transformers import AutoModelForTokenClassification

# Token-classification model on top of the "distilbert-base-uncased" checkpoint.
# num_labels is taken from the label mapping rather than repeated as a literal, so it
# cannot disagree with id2label/label2id if the tag set changes.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels = len(id2label),
    id2label = id2label,
    label2id = label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
!pip install transformers[torch]



In [22]:
!pip install transformers
!pip install accelerate>=0.21.0



In [23]:
from transformers import TrainingArguments

# Training configuration handed to Trainer. The numeric values are starting points
# and can be tuned.
training_args = TrainingArguments(
    # Folder name for training outputs; the inference cell later loads a checkpoint
    # from a folder with this name.
    output_dir = "my_ner_model",
    learning_rate = 2e-5, # tunable
    per_device_train_batch_size = 1, # tunable
    per_device_eval_batch_size=1, # tunable
    num_train_epochs = 2,   # tunable
    weight_decay = 0.01, # tunable
    # Evaluation and checkpoint saving are both set to "epoch".
    # NOTE(review): load_best_model_at_end presumably requires these two strategies to
    # match, and newer transformers releases may spell the first one `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

In [27]:
from transformers import Trainer

# Per-epoch evaluation (and the best-checkpoint selection switched on by
# load_best_model_at_end in training_args) runs on the validation split, so the test
# split is not used to pick the model and stays available for a final evaluation.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_wnut["train"],
    eval_dataset = tokenized_wnut["validation"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,  # metric function called on each evaluation
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
## Inference (optional: the next three cells can be removed)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, AutoModelForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Load dataset
wnut = load_dataset("wnut_17")

# Tag names positioned by integer tag id. compute_metrics further down in this cell
# maps ids to names through `label_list`, so it is defined here to keep the cell
# runnable without relying on an earlier cell.
label_list = wnut["train"].features["ner_tags"].feature.names

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define model
# Tag names in class-id order; both label mappings and `num_labels` are derived from
# this one list, so they cannot disagree with each other. Binding `id2label` here also
# provides the name that `tag_sentence` (defined later) looks up.
id2label = dict(enumerate([
    "O",
    "B-corporation", "I-corporation",
    "B-creative-work", "I-creative-work",
    "B-group", "I-group",
    "B-location", "I-location",
    "B-person", "I-person",
    "B-product", "I-product",
]))
label2id = {tag: idx for idx, tag in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread each word's tag over its tokens.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and "ner_tags"
            (list of integer tag-id lists, one id per word).

    Returns:
        The result of the module-level `tokenizer` with a "labels" entry added. Each
        label list has the word's tag id on the first token of a word and -100 on
        positions without a word id and on repeated word ids.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    all_labels = []
    for batch_idx, tags in enumerate(examples["ner_tags"]):
        word_ids = list(encoded.word_ids(batch_index=batch_idx))
        # Pair every position with the word id of the position before it
        # (None for the very first position).
        preceding = [None] + word_ids[:-1]
        all_labels.append([
            tags[wid] if wid is not None and wid != before else -100
            for wid, before in zip(word_ids, preceding)
        ])

    encoded["labels"] = all_labels
    return encoded

# Run tokenize_and_align_labels over the dataset in batches
tokenized_wnut = wnut.map(function=tokenize_and_align_labels, batched=True)

# Collator built around the tokenizer; it is passed to Trainer as `data_collator`
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Define Trainer arguments
# The numeric values are starting points and can be tuned.
training_args = TrainingArguments(
    # Folder name for training outputs; the inference code later loads a checkpoint
    # from a folder with this name.
    output_dir="my_ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and checkpoint saving are both set to "epoch".
    # NOTE(review): load_best_model_at_end presumably requires these two strategies to
    # match, and newer transformers releases may spell the first one `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define custom evaluation function
def compute_metrics(p):
    """Token-level weighted precision/recall/F1 and accuracy, skipping -100 labels.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-class scores with the
            class axis last (argmax over axis 2), `labels` holds integer tag ids with
            -100 on positions to skip.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", computed over the tag
        names of all kept positions pooled across the batch.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Position carries no word label; leave it out of the scores.
                continue
            true_predictions.append(label_list[pred_id])
            true_labels.append(label_list[gold_id])

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average="weighted"
    )
    acc = accuracy_score(true_labels, true_predictions)

    metrics = {}
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f1"] = f1
    metrics["accuracy"] = acc
    return metrics

# Define Trainer
# Per-epoch evaluation (and the best-checkpoint selection switched on by
# load_best_model_at_end) uses the validation split, so the test split plays no part
# in choosing the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model once on the held-out test split
evaluation_results = trainer.evaluate(eval_dataset=tokenized_wnut["test"])

print("Evaluation Results:")
print(evaluation_results)


In [None]:
!pip install hmmlearn

In [None]:
!pip install sklearn-crfsuite

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from hmmlearn import hmm
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import numpy as np

import numpy as np
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Sentences (lists of words) and their integer NER tag ids from the training split
tokenized_sentences = wnut["train"]["tokens"]
ner_tags = wnut["train"]["ner_tags"]

# Combine tokenized sentences and corresponding NER tags into a list of tuples
train_data = list(zip(tokenized_sentences, ner_tags))

# Flatten the data for HMM training
flat_tokens = [token for tokens, _ in train_data for token in tokens]
flat_tags = [tag for _, tags in train_data for tag in tags]

# Unique NER tags seen in the training data; sorted so that index assignment is the
# same on every run
unique_tags = sorted(set(flat_tags))

# Define the number of components for HMM as the number of unique NER tags
num_components = len(unique_tags)

# Mappings between tags and indices (the reverse one replaces a per-token list scan)
tag_to_index = {tag: i for i, tag in enumerate(unique_tags)}
index_to_tag = {i: tag for tag, i in tag_to_index.items()}
flat_tags_indices = [tag_to_index[tag] for tag in flat_tags]

# Convert tokens to indices; the vocabulary is sorted because iteration order of a set
# of strings is not stable between interpreter runs
vocab = sorted(set(flat_tokens))
token_to_index = {token: i for i, token in enumerate(vocab)}
flat_tokens_indices = [token_to_index[token] for token in flat_tokens]

# One integer-coded observation per row: shape (n_tokens, 1)
flat_tokens_2d = np.array(flat_tokens_indices).reshape(-1, 1)

# Number of tokens in each sentence, so sentences are treated as separate sequences
lengths = [len(tokens) for tokens, _ in train_data]

# Each observation is a single category index, which is the input format of
# CategoricalHMM (in current hmmlearn releases MultinomialHMM expects count vectors
# instead). random_state fixes the initialisation so reruns give the same model.
hmm_model = hmm.CategoricalHMM(n_components=num_components, n_iter=100, random_state=0)
hmm_model.fit(flat_tokens_2d, lengths)

# Decode sentence by sentence: `lengths` keeps the decoder from linking the end of one
# sentence to the start of the next
predicted_tags_indices = hmm_model.predict(flat_tokens_2d, lengths)

# Convert predicted state indices back to tags.
# NOTE(review): the model is fitted without the tags, so which hidden state ends up
# with which index is arbitrary; mapping state i to tag i is only a naming convention
# and the report below should not be read as a real tagging accuracy — confirm whether
# a supervised estimate of the HMM parameters was intended.
predicted_tags = [index_to_tag[int(state)] for state in predicted_tags_indices]

# Evaluate the model
print(classification_report(flat_tags, predicted_tags))



from transformers import pipeline

text = "My name is Sarah, I live in London and New York with Obama"
# The checkpoint path is relative to the working directory, matching the relative
# output_dir="my_ner_model" used for training, instead of an absolute "/content/..."
# path that only exists on one machine layout.
# The step number 6788 corresponds to 3394 training rows x 2 epochs at batch size 1;
# it changes if any of those settings change.
classifier = pipeline("ner", model="my_ner_model/checkpoint-6788")
classifier(text)

In [None]:
import pandas as pd

def tag_sentence(text:str):
    """Tag every token of `text` with the model's most likely NER label.

    Uses the module-level `tokenizer`, `model` and `id2label`. The inputs are moved to
    whatever device the model is on, so the function works on CPU as well as on GPU.

    Args:
        text: raw sentence to tag.

    Returns:
        pandas.DataFrame with one row per token (including any special tokens the
        tokenizer adds) and the columns 'word' and 'tag'.
    """
    import torch  # local import: torch is not imported at notebook level

    # convert our text to a tokenized sequence on the model's device
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)
    # inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)
    # First output is the per-token class scores; first row is our single sentence.
    # Softmax keeps the ordering of the scores, so argmax is taken on them directly.
    tag_ids = outputs[0][0].argmax(dim=-1).tolist()
    token_ids = inputs['input_ids'][0].tolist()
    word_tags = [(tokenizer.decode(token_id), id2label[tag_id])
                 for token_id, tag_id in zip(token_ids, tag_ids)]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [None]:
print(tag_sentence(text))

In [None]:
## End of notebook
# All done!