<a href="https://colab.research.google.com/github/tiwari-arpit/nlp/blob/main/HindiNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries (Setup)

In [None]:
!pip install datasets
#!pip install -U transformers
!pip install -U accelerate
!pip install seqeval
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import pandas as pd

# Dataset

In [None]:
# Dataset source: https://huggingface.co/datasets/ai4bharat/naamapadam

from datasets import load_dataset
# Configuration name of the Naamapadam corpus to download (this notebook uses Hindi).
lang = 'hi'
# Downloads the corpus; the result is a DatasetDict with train/test/validation splits.
hindi_data = load_dataset(path='ai4bharat/naamapadam', name=lang)

In [None]:
hindi_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 985787
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 867
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13460
    })
})

In [None]:
# Preview only the first few rows: converting the whole training split
# (~1M rows of token lists) to pandas just for display is slow and memory-hungry.
hindi_data['train'].select(range(5)).to_pandas()

Unnamed: 0,tokens,ner_tags
0,"[सेक्टर, 55/56, के, एसएचओ, अरविंद, कुमार, ने, ...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[मध्य, रेलवे, एवं, पश्चिम, रेलवे, के, अधिकारिय...","[3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 3, 4, 0, 3, ..."
2,"[जाने, -, माने, वैज्ञानिक, सिवान, के, ., को, भ...","[0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 4, 4, 0, 0, 0, ..."
3,"[यह, सूर्य, ग्रहण, भारत, में, भी, दिखेगा, .]","[0, 0, 0, 5, 0, 0, 0, 0]"
4,"[ज्ञापन, में, कहा, गया, है, कि, सीपीडब्ल्यूडी,...","[0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
985782,"[पाकिस्तानी, टीवी, होस्ट, नादिया, खान, ने, हॉल...","[0, 0, 0, 1, 2, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
985783,"[भारतीय, आयात, -, निर्यात, बैंक, ,, भारतीय, स्...","[0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 4, 4, 0, 3, ..."
985784,"[दिल्ली, की, पटियाला, हाउस, अदालत, ने, IRCTC, ...","[5, 0, 3, 4, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, ..."
985785,"[आधारभूत, सुविधाओं, के, कार्यदल, ने, भारतीय, ह...","[0, 3, 4, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Feature object describing the per-token tags; its int2str() turns a tag id into its name.
tags = hindi_data['train'].features['ner_tags'].feature

def create_tag_name(batch):
  """Build the `ner_tags_str` column: the string name of every tag id in `batch['ner_tags']`.

  Returns a dict with the single key 'ner_tags_str', suitable for use with `Dataset.map`.
  """
  names = []
  for idx in batch['ner_tags']:
    names.append(tags.int2str(idx))
  return {'ner_tags_str': names}

In [None]:
# Add the human-readable `ner_tags_str` column produced by create_tag_name.
hindi_data = hindi_data.map(create_tag_name)

In [None]:
# Select the single row first so that only that row is converted to pandas,
# instead of materialising the entire training split to read one record.
hindi_data['train'].select(range(1)).to_pandas().iloc[0]

Unnamed: 0,0
tokens,"[सेक्टर, 55/56, के, एसएचओ, अरविंद, कुमार, ने, ..."
ner_tags,"[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ner_tags_str,"[O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O..."


# Model

**Load Pre-trained Model for Tokenization**

In [None]:
from transformers import AutoTokenizer

# Alternative: start from the base checkpoint instead of the fine-tuned one
# (presumably what the first training run used -- kept for reference).
# model_checkpoint = 'distilbert-base-cased'
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the tokenizer stored in the fine-tuned model's repository on the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("arpit-tiwari/distilbert-finetuned-hindi-ner")

**Tokenize all texts and align the labels with them**

In [None]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level labels to one label per sub-word token.

  Args:
    labels: label id for each original word.
    word_ids: for each token, the index of the word it came from, or None
      for tokens that belong to no word (e.g. special tokens).

  Returns:
    A list with one label per entry of `word_ids`:
      * -100 for tokens whose word id is None,
      * the word's own label for the first token of a word,
      * for further tokens of the same word, the word's label with odd ids
        bumped by one (with the B-/I- layout used here this turns B-X into I-X).
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    if word_id is None:
      # Token without a source word: mark it with -100.
      aligned.append(-100)
    elif word_id != previous_word:
      # First token of a new word keeps the word's label unchanged.
      aligned.append(labels[word_id])
    else:
      # Continuation token of the current word: odd label -> next (even) label.
      tag = labels[word_id]
      aligned.append(tag + 1 if tag % 2 == 1 else tag)
    previous_word = word_id

  return aligned

In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: batch with a 'tokens' column (list of word lists) and a
      'ner_tags' column (list of per-word label id lists).

  Returns:
    The tokenizer output for the batch with an extra 'labels' entry holding,
    for every sentence, the labels aligned to its sub-word tokens.
  """
  tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True)
  # word_ids(row) maps each token of sentence `row` back to its source word.
  tokenized_inputs['labels'] = [
      align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(row))
      for row, word_labels in enumerate(examples['ner_tags'])
  ]
  return tokenized_inputs

In [None]:
# Tokenize in batches and drop the original columns, so that only the
# tokenizer outputs and the aligned `labels` remain.
tokenized_data = hindi_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=hindi_data['train'].column_names,
)

In [None]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 985787
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 867
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13460
    })
})

**Create Data Collator and Metrics**

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator that assembles individual tokenized examples into padded batches.
data_colator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Sanity check: collate the first two training examples and inspect the resulting tensors.
batch = data_colator([tokenized_data['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,   100,  3731,   120,  4376,   607, 28535,   100,   100,   607,
         28534, 28522, 28531, 28524,   615, 28535,   617, 28515, 28531, 28523,
         28531,   607, 28532,   100,   619, 28531, 28522, 28525, 28535,   619,
         28535, 28508,   100,   607, 28533,   614, 28531, 28524, 28531,  3413,
          1545,   118,   100,   113,   100,   114,   607, 28535,   612, 28530,
         28515,   619, 28531, 28522, 28525, 28531,   613, 28524, 28537, 28513,
           607, 28524,   622, 28532, 28523, 28531,   608, 28523, 28531,   100,
           635,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   619, 28517, 28537, 28523,   621, 28535, 28525, 28526, 28535,
           100,   616, 28527, 28537, 28512, 28532, 28522,   621, 28535, 28525,
         28526, 28535,   607, 28535,   100,   607, 28535,   100,   100,   118,
           100,   100,   117,   608, 

In [None]:
import evaluate
# Load the 'seqeval' metric; compute_metrics reads overall precision/recall/F1/accuracy from it.
metric = evaluate.load('seqeval')

In [None]:
# Tag names in id order, i.e. label_names[i] is the string name of tag id i.
ner_feature = hindi_data['train'].features['ner_tags']
label_names = ner_feature.feature.names

In [None]:
# Two-way lookup between integer tag ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {label: i for i, label in id2label.items()}

In [None]:
print(id2label)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}


In [None]:
import numpy as np

def compute_metrics(eval_preds):
  """Compute overall precision, recall, F1 and accuracy for an evaluation pass.

  Args:
    eval_preds: pair `(logits, labels)`; the predicted class is the argmax of
      `logits` over the last axis, and positions labelled -100 are ignored.

  Returns:
    Dict with the keys 'precision', 'recall', 'f1' and 'accuracy'.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Reference tag names, skipping positions masked with -100.
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[l] for l in label_row if l != -100])

  # Predicted tag names at exactly the unmasked positions.
  true_predictions = []
  for pred_row, label_row in zip(predictions, labels):
    kept = [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
    true_predictions.append(kept)

  scores = metric.compute(predictions=true_predictions, references=true_labels)

  return {
      "precision": scores['overall_precision'],
      "recall": scores['overall_recall'],
      "f1": scores['overall_f1'],
      "accuracy": scores['overall_accuracy'],
  }

**Load Pre-trained Model for Token Classification**

In [None]:
from transformers import AutoModelForTokenClassification

# Load the model here so that `model` exists before the following cells use it
# on a fresh kernel. The checkpoint is the same Hub repository the tokenizer
# was loaded from; pass 'distilbert-base-cased' instead to start from the base model.
model = AutoModelForTokenClassification.from_pretrained(
    "arpit-tiwari/distilbert-finetuned-hindi-ner",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
model.config.num_labels

7

**Set Training Arguments**

In [None]:
# Train on a 100,000-example slice of the training split (rows 700,000-799,999),
# i.e. roughly 10% of its 985,787 rows. The number of epochs is set in TrainingArguments.
train_size = 0.5  # NOTE(review): not read anywhere; the range() below decides the subset size
train_dataset = tokenized_data["train"]

train_dataset = train_dataset.select(range(700000,800000))
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100000
})

In [None]:
import os

from transformers import TrainingArguments

# output_dir receives the checkpoints written during training. Note the single
# underscore: it is a different folder from './distilbert-finetuned-hindi__ner',
# where the final model is saved after training.
# NOTE(review): newer transformers releases name `evaluation_strategy` as
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(output_dir = "./distilbert-finetuned-hindi_ner",
                         evaluation_strategy = "epoch",
                         save_strategy="epoch",
                         learning_rate = 2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

# Continue from the locally saved model when it exists (same behaviour as before).
# On a fresh runtime that folder has not been created yet, so fall back to the
# Hub repository the tokenizer was loaded from instead of failing.
local_checkpoint = "./distilbert-finetuned-hindi__ner"
hub_checkpoint = "arpit-tiwari/distilbert-finetuned-hindi-ner"
model = AutoModelForTokenClassification.from_pretrained(
    local_checkpoint if os.path.isdir(local_checkpoint) else hub_checkpoint
)



**Training**

In [None]:
from transformers import Trainer

# Per-epoch evaluation uses the validation split; the test split stays untouched
# during training so it can give an unbiased final score afterwards
# (e.g. trainer.evaluate(tokenized_data['test'])).
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` -- confirm against the installed version.
trainer = Trainer(model = model,
                  args = args,
                  train_dataset = train_dataset,
                  eval_dataset = tokenized_data['validation'],
                  data_collator = data_colator,
                  compute_metrics = compute_metrics,
                  tokenizer = tokenizer)
trainer.train()

  trainer = Trainer(model = model,


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.24,0.194999,0.616803,0.624158,0.620459,0.930714
2,0.1998,0.193624,0.646255,0.612753,0.629058,0.93448
3,0.1834,0.208736,0.598992,0.616382,0.607563,0.929796


TrainOutput(global_step=37500, training_loss=0.21455697875976562, metrics={'train_runtime': 3964.1, 'train_samples_per_second': 75.679, 'train_steps_per_second': 9.46, 'total_flos': 9694828092729648.0, 'train_loss': 0.21455697875976562, 'epoch': 3.0})

In [None]:
# Save the trained model to the export folder (double underscore; distinct from the training output_dir).
trainer.save_model("./distilbert-finetuned-hindi__ner")

In [None]:
# Write the model weights/config and the tokenizer files into the same export folder,
# so the folder can be loaded or uploaded as a self-contained checkpoint.
model.save_pretrained('./distilbert-finetuned-hindi__ner')
tokenizer.save_pretrained('./distilbert-finetuned-hindi__ner')

('./distilbert-finetuned-hindi__ner/tokenizer_config.json',
 './distilbert-finetuned-hindi__ner/special_tokens_map.json',
 './distilbert-finetuned-hindi__ner/vocab.txt',
 './distilbert-finetuned-hindi__ner/added_tokens.json',
 './distilbert-finetuned-hindi__ner/tokenizer.json')

In [None]:
# Reload model and tokenizer from the export folder to check that the saved files are loadable.
model_name = "./distilbert-finetuned-hindi__ner"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

from huggingface_hub import login, notebook_login

# Authenticate with the token read from the Colab secret instead of fetching it
# and then ignoring it; this also keeps the cell non-interactive under "Run all".
# The interactive widget is kept only as a fallback for an empty secret.
if hf_token:
  login(token=hf_token)
else:
  notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi, HfFolder

# Local export folder whose contents are pushed to the Hub.
model_name = "./distilbert-finetuned-hindi__ner"

# Upload the folder to the root of the model repository.
api = HfApi()
api.upload_folder(
    repo_id="arpit-tiwari/distilbert-finetuned-hindi-ner",
    repo_type="model",
    folder_path=model_name,
    path_in_repo="",
)

In [None]:
# Quick qualitative check: run the saved model on a Hindi sentence through the
# high-level pipeline helper; "simple" aggregation merges sub-word pieces into entity groups.
from transformers import pipeline

pipe = pipeline(
    task="token-classification",
    model="./distilbert-finetuned-hindi__ner",
    aggregation_strategy="simple",
)
pipe("इटली की पीएम जॉर्जिया मेलोनी ने पूरी दुनिया के लेफ्टिस्ट लीडर्स को पाखंडी बताया है। उन्होंने कहा कि दुनियाभर में मोदी, ट्रम्प और मेरे जैसे दक्षिणपंथी नेताओं के उभरने से सारे लेफ्टिस्ट नेता परेशान हो गए हैं|")

Device set to use cuda:0


[{'entity_group': 'LOC',
  'score': np.float32(0.3829023),
  'word': 'इटली',
  'start': 0,
  'end': 4},
 {'entity_group': 'PER',
  'score': np.float32(0.9290694),
  'word': 'जॉर्जिया मेलोनी',
  'start': 13,
  'end': 28},
 {'entity_group': 'PER',
  'score': np.float32(0.97446954),
  'word': 'मोदी',
  'start': 113,
  'end': 117}]