<a href="https://colab.research.google.com/github/tiwari-arpit/nlp/blob/main/HindiNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries (Setup)

In [None]:
!pip install datasets
#!pip install -U transformers
!pip install -U accelerate
!pip install seqeval
!pip install evaluate

In [None]:
import pandas as pd

# Dataset

In [None]:
# Dataset source: https://huggingface.co/datasets/ai4bharat/naamapadam

from datasets import load_dataset
# Language configuration of the naamapadam dataset to download ('hi').
lang = 'hi'

# Fetch the dataset for that configuration; the configuration name is passed
# by keyword to make its role explicit.
hindi_data = load_dataset('ai4bharat/naamapadam', name=lang)

In [None]:
# Display the loaded dataset object as the cell output.
hindi_data

In [None]:
# Preview the first few training examples as a DataFrame.
# Only a 5-row slice is converted: turning the whole train split into pandas
# just to look at it is slow and memory-hungry.
hindi_data['train'].select(range(5)).to_pandas()

In [None]:
# Feature object describing a single NER tag of the train split; its int2str
# method is used below to turn integer tag ids into tag names.
tags = hindi_data['train'].features['ner_tags'].feature

def create_tag_name(batch):
  """Build a string version of the integer NER tags.

  Args:
    batch: Mapping with an 'ner_tags' entry holding integer tag ids.

  Returns:
    A dict with the single key 'ner_tags_str' whose value lists the name of
    every id in batch['ner_tags'], in the same order.
  """
  names = []
  for tag_id in batch['ner_tags']:
    names.append(tags.int2str(tag_id))
  return {'ner_tags_str': names}

In [None]:
# Run create_tag_name through map (not batched) and store the result back in
# hindi_data, which adds the 'ner_tags_str' field the function returns.
hindi_data = hindi_data.map(create_tag_name)

In [None]:
# Show the first training example as a pandas Series.
# Select that one row before converting, instead of converting the entire
# train split to pandas only to read row 0; the displayed result is the same.
hindi_data['train'].select(range(1)).to_pandas().iloc[0]

# Model

**Load Pre-trained Model for Tokenization**

In [None]:
from transformers import AutoTokenizer

# Earlier setup, kept for reference: tokenizer of the base checkpoint.
# model_checkpoint = 'distilbert-base-cased'
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the tokenizer from the Hugging Face Hub repository that this notebook
# also uploads its saved model folder to.
tokenizer = AutoTokenizer.from_pretrained("arpit-tiwari/distilbert-finetuned-hindi-ner")

**Tokenize all texts and align the labels with them**

In [None]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level labels to one label per token.

  Args:
    labels: Sequence of integer label ids, one per word.
    word_ids: For every token, the index of the word it came from, or None
      for tokens that belong to no word.

  Returns:
    A list with one entry per element of word_ids:
      * -100 where the word id is None;
      * the word's own label for the first token of a word;
      * for further tokens of the same word, the word's label, bumped by one
        when it is odd (presumably odd ids are "B-" tags and the next id is
        the matching "I-" tag - verify against the dataset's label names).
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    if word_id is None:
      # Token without a source word: mark it with the -100 placeholder.
      aligned.append(-100)
    elif word_id != previous_word:
      # First token of a new word keeps the word's label unchanged.
      aligned.append(labels[word_id])
    else:
      # Continuation token of the current word: odd ids move to the next id.
      tag = labels[word_id]
      aligned.append(tag + 1 if tag % 2 == 1 else tag)
    previous_word = word_id

  return aligned

In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize pre-split examples and attach token-level labels.

  Args:
    examples: Batch mapping with 'tokens' (one word list per example) and
      'ner_tags' (one list of integer word labels per example).

  Returns:
    The tokenizer output for the batch with an added 'labels' entry that
    holds, per example, the result of align_labels_with_tokens.
  """
  encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  # word_ids(row) maps each token of example `row` back to its source word.
  encoded['labels'] = [
      align_labels_with_tokens(word_labels, encoded.word_ids(row))
      for row, word_labels in enumerate(examples['ner_tags'])
  ]
  return encoded

In [None]:
# Tokenize the dataset in batches with tokenize_and_align_labels and drop the
# columns of the train split, so the result holds the fields that function
# returns (the tokenizer output plus 'labels').
tokenized_data = hindi_data.map(tokenize_and_align_labels,batched=True,remove_columns=hindi_data['train'].column_names)

In [None]:
# Display the tokenized dataset object as the cell output.
tokenized_data

**Create Data Collator and Metrics**

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator that turns lists of tokenized examples into model-ready batches;
# it is built from the same tokenizer used to encode the data.
# (The variable name keeps its original spelling because later cells use it.)
data_colator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Sanity check: collate the first two tokenized training examples into one
# batch and display it.
batch = data_colator(
    [tokenized_data['train'][row] for row in (0, 1)]
)
batch

In [None]:
import evaluate
# Load the 'seqeval' metric through the evaluate library; compute_metrics
# calls metric.compute on it.
metric = evaluate.load('seqeval')

In [None]:
# Feature describing the 'ner_tags' column of the train split.
ner_feature = hindi_data['train'].features['ner_tags']
# Tag names of that feature; they are indexed by integer label id elsewhere
# in this notebook (id2label, compute_metrics).
label_names = ner_feature.feature.names

In [None]:
# Two-way lookup tables between integer label ids and label names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Print the id -> label-name mapping for inspection.
print(id2label)

In [None]:
import numpy as np

def compute_metrics(eval_preds):
  """Score predictions with the seqeval metric.

  Args:
    eval_preds: A (logits, labels) pair. The predicted class of each position
      is the argmax of logits over the last axis; positions whose label is
      -100 are ignored.

  Returns:
    A dict with the keys 'precision', 'recall', 'f1' and 'accuracy', taken
    from the metric's 'overall_*' results.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Reference tag names, skipping the -100 placeholders.
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[l] for l in label_row if l != -100])

  # Predicted tag names at exactly the positions kept above.
  true_predictions = []
  for pred_row, label_row in zip(predictions, labels):
    true_predictions.append(
        [label_names[p] for p, l in zip(pred_row, label_row) if l != -100])

  scores = metric.compute(predictions=true_predictions, references=true_labels)

  reported = (("precision", 'overall_precision'),
              ("recall", 'overall_recall'),
              ("f1", 'overall_f1'),
              ("accuracy", 'overall_accuracy'))
  return {short_name: scores[metric_key] for short_name, metric_key in reported}

**Load Pre-trained Model for Token Classification.**

In [None]:
from transformers import AutoModelForTokenClassification
# Load the token-classification model so that `model` is defined before the
# following cells use it (previously this load was commented out, so a fresh
# kernel hit a NameError on the first use of `model`).
# The weights come from the same Hub repository the tokenizer is loaded from;
# the original base checkpoint was 'distilbert-base-cased'.
# NOTE(review): assumes that repository contains model weights - confirm.
model = AutoModelForTokenClassification.from_pretrained(
    "arpit-tiwari/distilbert-finetuned-hindi-ner",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Display the number of labels recorded in the model's configuration.
model.config.num_labels

**Set Training Arguments**

In [None]:
# Train on a 100,000-example slice of the tokenized train split
# (rows 700000 through 799999).
# NOTE(review): train_size is assigned here but not used by any code visible
# in this notebook.
train_size = 0.5
train_dataset = tokenized_data["train"].select(range(700000,800000))
train_dataset

In [None]:
from transformers import TrainingArguments

import inspect
from pathlib import Path

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases (and the old name was later
# dropped). Installs in this notebook are unpinned, so pick whichever keyword
# the installed TrainingArguments accepts.
EVAL_STRATEGY_KWARG = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Evaluate and save a checkpoint once per epoch, for three epochs.
args = TrainingArguments(output_dir = "./distilbert-finetuned-hindi_ner",
                         save_strategy="epoch",
                         learning_rate = 2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01,
                         **{EVAL_STRATEGY_KWARG: "epoch"})

# Continue from the locally saved model when it exists (it is written by the
# save cells further down); on a fresh runtime that directory is missing, so
# fall back to the copy on the Hugging Face Hub instead of failing.
LOCAL_MODEL_DIR = "./distilbert-finetuned-hindi__ner"
HUB_MODEL_ID = "arpit-tiwari/distilbert-finetuned-hindi-ner"
model = AutoModelForTokenClassification.from_pretrained(
    LOCAL_MODEL_DIR if Path(LOCAL_MODEL_DIR).is_dir() else HUB_MODEL_ID
)

**Training**

In [None]:
from transformers import Trainer

# Wire the model, training arguments, data, collator and metric function into
# a Trainer, then run training.
# NOTE(review): evaluation uses the 'test' split - confirm this is intended
# rather than a separate validation split, if the dataset provides one.
# NOTE(review): newer transformers releases prefer `processing_class` over the
# `tokenizer` argument - verify against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_colator,
    train_dataset=train_dataset,
    eval_dataset=tokenized_data['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Save the trained model through the Trainer into the local model directory
# that the later cells reload and upload.
trainer.save_model("./distilbert-finetuned-hindi__ner")

In [None]:
# Write the model and the tokenizer into the same local directory so it can
# be loaded from there as one self-contained folder.
model.save_pretrained('./distilbert-finetuned-hindi__ner')
tokenizer.save_pretrained('./distilbert-finetuned-hindi__ner')

In [None]:
# Reload the model and tokenizer from the local directory written above,
# replacing the in-memory `model` and `tokenizer` objects.
model_name = "./distilbert-finetuned-hindi__ner"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from google.colab import userdata
from huggingface_hub import login, notebook_login

# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'.
hf_token = userdata.get('HF_TOKEN')

# Authenticate with the stored token so the notebook can run unattended
# (previously the token was fetched but never used, and the interactive
# prompt was always shown). Fall back to the prompt if the secret is empty.
if hf_token:
  login(token=hf_token)
else:
  notebook_login()

In [None]:
from huggingface_hub import HfApi, HfFolder

# Local folder holding the saved model and tokenizer files.
model_name = "./distilbert-finetuned-hindi__ner"

# Upload the folder's contents to the root of the model repository on the Hub.
api = HfApi()
api.upload_folder(
    repo_id="arpit-tiwari/distilbert-finetuned-hindi-ner",
    repo_type="model",
    folder_path=model_name,
    path_in_repo="",
)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a token-classification pipeline from the locally saved model, using
# the "simple" aggregation strategy, and run it on a sample Hindi sentence.
pipe = pipeline(
    task="token-classification",
    model="./distilbert-finetuned-hindi__ner",
    aggregation_strategy="simple",
)
pipe("इटली की पीएम जॉर्जिया मेलोनी ने पूरी दुनिया के लेफ्टिस्ट लीडर्स को पाखंडी बताया है। उन्होंने कहा कि दुनियाभर में मोदी, ट्रम्प और मेरे जैसे दक्षिणपंथी नेताओं के उभरने से सारे लेफ्टिस्ट नेता परेशान हो गए हैं|")