# Named Entity Recognition for Earthquake Tweets

### Install and Import the Packages

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#!pip install seqeval

In [None]:
#!pip install transformers datasets

In [None]:
#!pip install transformers[torch]

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer, pipeline
from datasets import load_dataset
from datasets import load_metric

### Upload the Dataset

In [None]:
jsonl_file_path = 'admin2.jsonl'

# The export is in JSON Lines format: one JSON object per line.
# The encoding is set explicitly because the tweets contain Turkish characters
# and the platform default encoding is not guaranteed to be UTF-8.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of crashing json.loads.
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    texts = [json.loads(line) for line in file if line.strip()]

In [None]:
df= pd.DataFrame(texts)
df.head()

In [None]:
df['text'][:5].tolist()

### Clean The Data

Since I discovered some preparation requirements only while moving further along in the project, there were some necessary steps I realized only after finishing the annotation of the NER tags:
- The dataset I annotated contained mentions (@) and hashtags (#). However, I realized they would end up being problematic and misleading for machine learning: I observed that people add hashtags naming many cities impacted by the earthquake at the end of their tweets to spread the word (ex: #hatay #malatya #kahramanmaraş), and I would not want the algorithm to confuse this with intentional location information. Similarly, they sometimes tagged celebrities and government officials to spread the word, and I would not want these to be mistaken for PERSON tags. So I decided to remove everything that follows these symbols (not just the # or @ symbols themselves).
- Yet, there is a problem here: we cannot simply delete the entities following these tags, since this would cause misalignment in the labeled dataset. As you see above, the labels were returned in the format [0,32,ADDR], with the numbers indicating the start and end indices of the corresponding string.
- One solution is to replace all mention and hashtag characters with spaces; this way, the indices remain the same. We can deal with these spaces later, after tokenization.

In [None]:
# Punctuation characters that are blanked out (same set as before: , / - " ( ) * :).
_PUNCTUATION_RE = re.compile(r'[,/\-"()*:]')
# A mention or hashtag: '@' or '#' followed by everything up to the next whitespace.
_MENTION_OR_HASHTAG_RE = re.compile(r'[@#]\S*')


def cleaned_text(text):
    """Blank out punctuation, mentions and hashtags without changing the length.

    Every removed character is replaced by a single space, so the character
    offsets stored in the annotation labels stay valid for the returned string.

    A mention/hashtag ends at the next whitespace character of any kind
    (space, newline, tab, ...). Stopping only at a literal space would also
    erase the first word of the following line when a tag ends a line.

    Args:
        text: The raw tweet text.

    Returns:
        A string of the same length as ``text`` in which the listed
        punctuation characters and every ``@...`` / ``#...`` run are spaces.
    """
    blanked = _PUNCTUATION_RE.sub(' ', text)
    # Replace each match by as many spaces as it is long to keep the offsets aligned.
    return _MENTION_OR_HASHTAG_RE.sub(lambda m: ' ' * len(m.group()), blanked)

In [None]:
# Work on a copy so the raw frame keeps the untouched tweet text.
df2 = df.copy()
df2['text'] = df2['text'].apply(cleaned_text)
# Preview the first five cleaned tweets.
df2['text'].iloc[:5].tolist()

In [None]:
df2['text'][0].split(" ")[:30]

In [None]:
df2['label'][0]

In [None]:
print(df2['text'][0][0:32])
print(df2['text'][0][58:68])
print(df2['text'][0][69:74])
print(df2['text'][0][100:112])
print(df2['text'][0][122:133])


## Create Token-Tag Pairs
- The dataset originally has four tags: Person, City, Address and Number. However, having noticed that the Number category is actually misleading for the model, I decided to suppress that category and convert it to Other.

Generating a tag called Other

In [None]:
# The NUM tag serves little purpose and creates confusion, so it is folded into OTHER.
# Each label is a [start, end, tag] list; the tag is rewritten in place.
for spans in df2['label']:
    for span in spans:
        if span[2] == 'NUM':
            span[2] = 'OTHER'

df2['label'][0]

In [None]:
def _token_tag_pairs(text, labels):
    """Turn one tweet and its character-span labels into (token, tag) pairs.

    Args:
        text: The cleaned tweet. It must have the same length as the annotated
            text so that the label offsets still point at the right characters.
        labels: A list of ``[start, end, tag]`` spans for this tweet.

    Returns:
        A list of ``(token, tag)`` tuples covering the whole tweet. Tokens are
        obtained by splitting on single spaces, so empty-string tokens can
        occur; they are filtered out in a later step.
    """
    if not labels:
        # No labelled entity in this tweet: every token is OTHER.
        return [(token, "OTHER") for token in text.split(" ")]

    pairs = []
    cursor = 0  # end offset of the text consumed so far

    # Process the spans left to right even if the export is not ordered.
    for span in sorted(labels, key=lambda s: s[0]):
        start, end, tag = span[0], span[1], span[2]

        # Everything between the previous span and this one is unlabelled.
        if start > cursor:
            pairs.extend((token, "OTHER") for token in text[cursor:start].split(" "))

        entity_open = False
        for token in text[start:end].split(" "):
            if tag == 'OTHER':
                # Former NUM spans: no B-/I- prefix needed.
                pairs.append((token, tag))
            elif token and not entity_open:
                # B- goes to the first non-empty token. If it went to an empty
                # token (a span starting with a blanked character), the later
                # empty-token filter would drop it and leave only I- tags.
                pairs.append((token, "B-" + tag))
                entity_open = True
            else:
                pairs.append((token, "I-" + tag))

        cursor = max(cursor, end)

    # The text after the last labelled span is unlabelled too and must be kept.
    if cursor < len(text):
        pairs.extend((token, "OTHER") for token in text[cursor:].split(" "))

    return pairs


# One list of (token, tag) pairs per tweet, in the same order as df2.
all_token_tag_pairs = [
    _token_tag_pairs(text, labels)
    for text, labels in zip(df2['text'], df2['label'])
]







In [None]:
all_token_tag_pairs[0]

We can confirm that the labels I assigned "NUM" during annotation were correctly replaced by "OTHER".

In [None]:
# Collect every tag that occurs in the token/tag pairs to inspect the label inventory.
tags = [tag for pair in all_token_tag_pairs for _, tag in pair]

set(tags)

Now that we no longer need the character indices, since every token is labeled separately, we can get rid of the empty-string tokens.

In [None]:
# Drop the empty-string tokens produced by splitting on single spaces.
new_pairs = [
    [(token, tag) for token, tag in tweet if token != '']
    for tweet in all_token_tag_pairs
]

new_pairs[:30]

## Encoding the Labels to Integers

In [None]:
# Label inventory; the position of a tag in this list is its integer id.
ner_tags = ["OTHER", 'B-PER', 'I-PER', 'B-CITY', 'I-CITY', 'B-ADDR', 'I-ADDR']
id2label = dict(enumerate(ner_tags))
label2id = {tag: idx for idx, tag in enumerate(ner_tags)}

In [None]:
id2label

In [None]:
label2id

### Separating tokens and *tags*

In [None]:
# Split the (token, tag) pairs into parallel lists:
# inputs[i] holds the tokens of tweet i, targets[i] the matching label ids.
inputs = [[token for token, _ in pairs] for pairs in new_pairs]
targets = [[label2id[tag] for _, tag in pairs] for pairs in new_pairs]

In [None]:
inputs[:5]

In [None]:
targets[:5]

### Save the input-tag pairs as a new dataset

In [None]:
# Write one JSON object per line (tokens plus label ids for a single tweet).
with open('NER_deneme.json', 'w') as f:
    f.writelines(
        json.dumps({'inputs': x, 'targets': y}) + "\n"
        for x, y in zip(inputs, targets)
    )

In [None]:
dataset = load_dataset("json", data_files='NER_deneme.json')

In [None]:
dataset

In [None]:
data = dataset['train'].train_test_split(seed=42)
data

### Import the pretrained model we will fine-tune

In [None]:
checkpoint = "akdeniz27/bert-base-turkish-cased-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


 Let's find out how the model's tokenizer will treat the tokens

In [None]:
#ex
t = tokenizer(data['train'][1]['inputs'], is_split_into_words = True)
t

In [None]:
t.tokens()

### Attention:
- As in other transformer models,  the tokenizer generated subword tokens, by splitting the words into sub-units.
- This will cause an alignment problem with our target tags, since now the input data is longer than the targets.
- Therefore we will need to realign the targets.

To this end, we will use the word_ids() method of the Transformer Tokenizer class. This method returns us the value i for each subword token, with i denoting that the corresponding token belongs to the i'th word of the input sentence. Since the first token of each sentence is [CLS] and last token is [SEP], these two receive 'None' as word_ids value, as they do not originally belong to the sentence.

In [None]:
#target alignment:
# value of i indicates it is the i'th word
# in the input sentence (counting from 0)
t.word_ids()

In order to realign the targets, we will make sure that subword tokens that belong to the same word share the same entity type, but with the appropriate B- or I- prefix. For example, we know that the first word in a person entity received the B-PER tag, while the following words belonging to the same entity received I-PER. This time, we will make sure that after subword tokenization, only the first subword token of the first word of a person entity keeps B-PER, and the remaining subword tokens of that word continue as I-PER. There is no problem if the word's tag is already I-PER (like the second word of an entity); in that case we can directly replicate the I-PER tag for all of its subword units.

Since the change must be done only for the subsequent tokens that are part of the first word of an entity, let's designate a function that will convert from label "B-" to label "I-" for those tokens, and that will replicate label "I-" for the other parts of the entities.

In [None]:
# Label ids follow this order:
# ["OTHER", 'B-PER', 'I-PER', 'B-CITY', 'I-CITY', 'B-ADDR', 'I-ADDR']
# so every B- id is immediately followed by its I- counterpart.
begin2inside = {begin_id: begin_id + 1 for begin_id in (1, 3, 5)}

Note that in NER and POS tagging, it is common practice to assign the label -100 to the CLS and SEP tokens so that they can easily be filtered out when needed.

In [None]:
def align_targets(labels, word_ids):
    """Expand word-level label ids to one label id per subword token.

    Args:
        labels: Label ids of the original words (one per untokenized word).
        word_ids: For each subword token, the index of the word it belongs to,
            or None for tokens that belong to no word (like [CLS]).

    Returns:
        A list as long as ``word_ids``: -100 where the word id is None, the
        word's label for the first subword of a word, and for the following
        subwords of the same word the label with B- ids mapped to I- ids
        through ``begin2inside``.
    """
    aligned = []
    previous = None
    for current in word_ids:
        if current is None:
            # Token that belongs to no word.
            aligned.append(-100)
        else:
            tag_id = labels[current]
            if current == previous:
                # Continuation of the same word: an entity may have only one B- tag.
                tag_id = begin2inside.get(tag_id, tag_id)
            aligned.append(tag_id)
        previous = current
    return aligned


In [None]:
# try our function for the tokens stored in t, corresponding to input index 1
labels = data['train'][1]['targets'] #these are the non-aligned NER tags for that input
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)


In [None]:
" ".join(data['train'][1]['inputs'])

In [None]:
#let's retrieve the corresponding label from our id2label dictionary:
#the code below will return id2label tags for positive id and None for negative id (-100 in our case)
aligned_labels = [ner_tags[t] if t >= 0 else None for t in aligned_targets] #-100se none alcak

for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

All tags seem to have been aligned correctly!

Now we are ready to build a custom tokenization function, which will apply the tokenizer to the entire dataset while aligning the labels accordingly!

In [None]:
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Args:
        batch: A mapping with 'inputs' (lists of words) and 'targets'
            (lists of word-level label ids).

    Returns:
        The tokenizer output (input_ids, attention_mask, etc.) with an extra
        'labels' entry holding one aligned label list per example.
    """
    encoded = tokenizer(
        batch['inputs'], truncation=True, is_split_into_words=True
    )

    # The targets must be stored under the key 'labels'.
    encoded['labels'] = [
        align_targets(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(batch['targets'])
    ]
    return encoded

In [None]:
# Apply tokenize_fn to every split of `data`, one batch at a time.
tokenized_datasets = data.map(
  tokenize_fn,
  batched=True,
  remove_columns=data["train"].column_names, # drop every original column of the train split, so only what tokenize_fn returns is kept
)

In [None]:
tokenized_datasets

### Data Collator

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Load the seqeval metric object that compute_metrics calls via metric.compute(...).
# NOTE(review): `load_metric` has been deprecated in the `datasets` library in favour of
# the separate `evaluate` package (`evaluate.load("seqeval")`) — confirm that the
# installed `datasets` version still provides it.
metric = load_metric("seqeval")

In [None]:
def compute_metrics(logits_and_labels):
    """Compute overall seqeval scores from raw logits and label ids.

    Positions whose true label is -100 are dropped from both the references
    and the predictions, so they do not influence the scores.

    Args:
        logits_and_labels: A pair ``(logits, labels)``.

    Returns:
        A dict with the keys 'precision', 'recall', 'f1' and 'accuracy'.
    """
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    str_labels = []
    str_preds = []
    for pred_row, label_row in zip(preds, labels):
        # Keep only the positions that carry a real label.
        kept = [(p, t) for p, t in zip(pred_row, label_row) if t != -100]
        str_labels.append([id2label[t] for _, t in kept])
        str_preds.append([id2label[p] for p, _ in kept])

    scores = metric.compute(predictions=str_preds, references=str_labels)

    # Re-key the overall scores into a flat result dictionary.
    return {
        'precision': scores['overall_precision'],
        'recall': scores['overall_recall'],
        'f1': scores['overall_f1'],
        'accuracy': scores['overall_accuracy'],
    }


In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    #ignore_mismatched_sizes=True,
    #there is no longer need for this after suppressing NUM label, now the number of labels in our data
    #matches the number of labels in the pre-trained model
)

In [None]:
training_args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=6,
    weight_decay=0.01,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator, ### the only difference from previous section
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
trainer.save_model('my_saved_model')

### Building Pipeline and Testing the Model
Notice that I name this first pipeline pipe_none, referring to the fact that I did not choose any aggregation strategy and I will receive the predictions for subword units.

Let's see how this model will perform on the train and validation sets.

In [None]:
# Token-classification pipeline over the model saved in 'my_saved_model'.
# No aggregation_strategy is passed, so predictions are reported per subword token.
# NOTE(review): device=0 presumably selects the first GPU; on a CPU-only machine
# this would need to be changed — confirm the target environment.
pipe_none = pipeline(
  "token-classification",
  model='my_saved_model',
  device=0,
)

In [None]:
def flatten(list_of_lists):
    """Concatenate the items of every sublist into one flat list, in order."""
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened


We know that our inputs and tags are normally formatted as lists of lists, where each sublist in the main list corresponds to an input sentence and each item in the sublist corresponds to a token.

To evaluate the performance metrics, I will define a flatten function that would convert these nested lists into a single list. And then I will apply the metrics defined above to assess model performances.

### Attention:
Note that there is a degree of naivety with this approach. For each metric, it takes into account the number of False Positives, False Negatives, True Positives and True Negatives in the sample, and returns us accuracy, f1, precision and recall scores.

This approach relies on the assumption that the predictions on the labels assigned to the tokens are independent from each other and thus can reflect the model performance. However, this is rarely the case, since the model's prediction for a previous token is likely to affect its prediction for the next token; this might especially be the case for subword tokens belonging to the same word. Therefore, in reality, predictions can be interdependent, and changing one can lead the model to change several others too.

But for the sake of simplicity, I will neglect this bias in the performance metrics and take them as an acceptable indicator of the performance.

In [None]:
# Run the pipeline over every word of the validation and training sets.
# NOTE(review): flatten() turns the per-tweet token lists into one flat list of
# words, so the pipeline receives each word as a separate input and predicts it
# without the surrounding sentence — confirm this is intended, since the model
# was trained on whole tweets.
val_pred_none = pipe_none(flatten(data['test']['inputs']))
train_pred_none = pipe_none(flatten(data['train']['inputs']))
val_pred_none[:20]

In [None]:
target_tags_val = flatten(tokenized_datasets["test"]['labels'])
target_tags_train = flatten(tokenized_datasets["train"]['labels'])
print("The number of the NER tags in the training set:", len(target_tags_train))
print("The number of the NER tags in the validation set:", len(target_tags_val))

Remember that we had inserted the tags for CLS and SEP tokens in order to be able to train the model, yet these are not present in the predictions the model returns nor do we want them to be as we don't want them to bias our performance metrics. So let's drop tags -100 from our actual targets.

In [None]:
print("The number of -100 tags we inserted in the training set:", target_tags_train.count(-100))
print("The number of -100 tags we inserted in the validation set:", target_tags_val.count(-100))


In [None]:
targ_train_labels = [x for x in target_tags_train if x != -100]
targ_val_labels = [x for x in target_tags_val if x != -100]
print("New number of the NER tags in the training set:", len(targ_train_labels))
print("New number of the NER tags in the validation set:", len(targ_val_labels))


We have confirmed that the lengths of the targets and predictions match for both the train and validation sets. Now let's check out the metric scores.

In [None]:
# Each pipeline result is a list of per-token predictions; keep only the
# predicted tag of every token, converted to its integer id.
pred_labels_train = [
    label2id[token_pred['entity']]
    for word_preds in train_pred_none
    for token_pred in word_preds
]

pred_labels_val = [
    label2id[token_pred['entity']]
    for word_preds in val_pred_none
    for token_pred in word_preds
]

print("The number of predicted labels for the training set:", len(pred_labels_train))
print("The number of predicted labels for the validation set:", len(pred_labels_val))

In [None]:
print("train_accuracy:", accuracy_score(targ_train_labels, pred_labels_train))
print("train_f1:", f1_score(targ_train_labels, pred_labels_train, average='macro'))

print("val_accuracy:", accuracy_score(targ_val_labels, pred_labels_val))
print("val_f1:", f1_score(targ_val_labels, pred_labels_val, average='macro'))


Let's visualize the precision and recall scores for each label in a table.


In [None]:
print("Train Set Precision and Recall")
# Pass the label ids explicitly: with average=None the scores otherwise cover
# only the classes that occur in the data, and the rows would no longer line
# up with the tag names if a class is missing.
label_ids = list(id2label)
pd.DataFrame({
    'tags': [id2label[i] for i in label_ids],
    'precision': precision_score(targ_train_labels, pred_labels_train, labels=label_ids, average=None),
    'recall': recall_score(targ_train_labels, pred_labels_train, labels=label_ids, average=None),
})


In [None]:
print("Validation Set Precision and Recall")
# Pass the label ids explicitly: with average=None the scores otherwise cover
# only the classes that occur in the data, and the rows would no longer line
# up with the tag names if a class is missing.
label_ids = list(id2label)
pd.DataFrame({
    'tags': [id2label[i] for i in label_ids],
    'precision': precision_score(targ_val_labels, pred_labels_val, labels=label_ids, average=None),
    'recall': recall_score(targ_val_labels, pred_labels_val, labels=label_ids, average=None),
})


In [None]:
classes = ner_tags


def plot_cm(ax, cm, title):
    """Draw the confusion matrix `cm` as an annotated heatmap on `ax`.

    Rows and columns are labelled with the module-level `classes` list, so
    `cm` must have one row and one column per entry of `classes`.
    """
    frame = pd.DataFrame(cm, index=classes, columns=classes)
    sns.heatmap(frame, annot=True, fmt='.2f', ax=ax, cmap='YlGnBu')
    ax.set(xlabel="Predicted", ylabel="Target", title=title)

In [None]:
P_train = [id2label[x] for x in pred_labels_train ]
Y_train = [id2label[x] for x in targ_train_labels ]
P_test = [id2label[x] for x in pred_labels_val ]
Y_test = [id2label[x] for x in targ_val_labels ]

In [None]:
# plot_cm labels the axes with `classes`, so the matrix must be built in that
# exact order. Without `labels=`, confusion_matrix sorts the string labels
# alphabetically, which does not match the order of ner_tags.
classes = ner_tags
cm_train = confusion_matrix(Y_train, P_train, labels=ner_tags, normalize='true')
cm_val = confusion_matrix(Y_test, P_test, labels=ner_tags, normalize='true')


fig, axes = plt.subplots(1, 2, figsize=(9,4))
plot_cm(axes[0], cm_train, title='Confusion Matrix for Train Set')
plot_cm(axes[1], cm_val, title='Confusion Matrix for Validation Set')

plt.tight_layout()
plt.show()

In [None]:
# Entity types without the B-/I- prefix; the list position is the aggregated id.
aggr_tags = ['OTHER', 'PER', 'CITY', 'ADDR']
id2label_aggr = dict(enumerate(aggr_tags))
label2id_aggr = {tag: idx for idx, tag in enumerate(aggr_tags)}
label2id_aggr

In [None]:
# Collapse the predicted ids to entity types: id 0 stays 0, any other id is
# mapped through its tag name with the two-character B-/I- prefix removed.
aggr_pred_train = [
    0 if label_id == 0 else aggr_tags.index(id2label[label_id][2:])
    for label_id in pred_labels_train
]

aggr_pred_val = [
    0 if label_id == 0 else aggr_tags.index(id2label[label_id][2:])
    for label_id in pred_labels_val
]


In [None]:
# Collapse the target ids to entity types: id 0 stays 0, any other id is
# mapped through its tag name with the two-character B-/I- prefix removed.
aggr_target_train = [
    0 if label_id == 0 else aggr_tags.index(id2label[label_id][2:])
    for label_id in targ_train_labels
]

aggr_target_val = [
    0 if label_id == 0 else aggr_tags.index(id2label[label_id][2:])
    for label_id in targ_val_labels
]


In [None]:
print("train_accuracy:", accuracy_score(aggr_target_train, aggr_pred_train))
print("train_f1:", f1_score(aggr_target_train, aggr_pred_train, average='macro'))

print("val_accuracy:", accuracy_score(aggr_target_val, aggr_pred_val))
print("val_f1:", f1_score(aggr_target_val, aggr_pred_val, average='macro'))


In [None]:
# The aggregated ids index into aggr_tags, so they must be decoded with
# id2label_aggr. Decoding them with id2label would turn id 1 into 'B-PER',
# id 3 into 'B-CITY', and so on.
P_train = [id2label_aggr[x] for x in aggr_pred_train]
Y_train = [id2label_aggr[x] for x in aggr_target_train]
P_test = [id2label_aggr[x] for x in aggr_pred_val]
Y_test = [id2label_aggr[x] for x in aggr_target_val]

classes = aggr_tags

# Build the matrices in the order of `classes`, which plot_cm uses for the
# axis labels. Without `labels=`, the classes are sorted alphabetically.
cm_train = confusion_matrix(Y_train, P_train, labels=aggr_tags, normalize='true')
cm_val = confusion_matrix(Y_test, P_test, labels=aggr_tags, normalize='true')


fig, axes = plt.subplots(1, 2, figsize=(9,4))
plot_cm(axes[0], cm_train, title='Confusion Matrix for Train Set')
plot_cm(axes[1], cm_val, title='Confusion Matrix for Validation Set')

plt.tight_layout()
plt.show()

In [None]:
# Rebuild each validation tweet as a single string.
join_tweets = [' '.join(tweet) for tweet in data['test']['inputs']]

# Read the aligned gold labels once instead of on every iteration.
test_labels = list(tokenized_datasets['test']['labels'])

predictions = []
false_predictions = []

# Pair every tweet with its own gold labels by position. Looking the tweet up
# with join_tweets.index(tweet) is O(n) per tweet and returns the first
# occurrence when the same text appears more than once.
for tweet, gold_labels in zip(join_tweets, test_labels):
    result = pipe_none(tweet)
    # Pad with -100 at both ends so the prediction has the same layout as the
    # gold labels, which carry -100 for the special tokens.
    tweet_preds = [-100] + [label2id[token['entity']] for token in result] + [-100]
    predictions.append(tweet_preds)

    if tweet_preds != gold_labels:
        false_predictions.append(tweet)


In [None]:
print("The number of false predited tags in the test set is", len(false_predictions))

In [None]:
false_results = pipe_none(false_predictions)

In [None]:
N = 3
for item in false_results[N]:
  print(item['entity'], item['word'])

In [None]:
N = 2
for item in false_results[N]:
  print(item['entity'], item['word'])

In [None]:
N = 90
for item in false_results[N]:
  print(item['entity'], item['word'])

In [None]:
pipe_first = pipeline(
  "token-classification",
  model='my_saved_model',
  aggregation_strategy = "first",
  device=0,
)


In [None]:
pipe_first(false_predictions[3])

In [None]:
pipe_first(false_predictions[90])

In [None]:
!pip install huggingface_hub

In [None]:
import huggingface_hub

In [None]:
!huggingface-cli login
!huggingface-cli repo create turkish-earthquake-tweets-ner

In [None]:
!huggingface-cli login
!huggingface-cli repo create turkish-earthquake-tweets-ner-bert

In [None]:
huggingface_hub.upload_folder(folder_path='my_saved_model',
    repo_id="yhaslan/turkish-earthquake-tweets-ner",
    repo_type="model"
)

In [None]:
huggingface_hub.upload_folder(folder_path='my_saved_model',
    repo_id="yhaslan/turkish-earthquake-tweets-ner-berturk",
    repo_type="model"
)