In [1]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=e3df4599c54e28363f562f5fd79aec648b875643744cd0791ffef34984d29adc
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


## Running the IndicNER Model
Let's try annotating some Indian language sentences and get the named entities

In [2]:
# Import all the necessary classes and initialize the tokenizer and model.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")

model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [3]:
def get_predictions( sentence, tokenizer, model ):
  # Let us first tokenize the sentence - split words into subwords
  tok_sentence = tokenizer(sentence, return_tensors='pt') # argument return_tensors='pt' indicates return type is pytorch tensors

  with torch.no_grad():
    # we will send the tokenized sentence to the model to get predictions
    logits = model(**tok_sentence).logits.argmax(-1) #once logits are found of tokenized sentence, argmax finds the maximum value along last dimension for each token in the sentence.

    # We will map the maximum predicted class id with the class label
    predicted_tokens_classes = [model.config.id2label[t.item()] for t in logits[0]]
#     print(predicted_tokens_classes)
    predicted_labels = []

    previous_token_id = 0
    # we need to assign the named entity label to the head word and not the following sub-words
    word_ids = tok_sentence.word_ids()
    for word_index in range(len(word_ids)):
        if word_ids[word_index] == None:
            previous_token_id = word_ids[word_index]
        elif word_ids[word_index] == previous_token_id:
            previous_token_id = word_ids[word_index]
        else:
            predicted_labels.append( predicted_tokens_classes[ word_index ] )
            previous_token_id = word_ids[word_index]

    return predicted_labels

In [4]:
# let us try with some example sentences here
sentence = 'लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं'

predicted_labels = get_predictions(sentence=sentence,
                                   tokenizer=tokenizer,
                                   model=model
                                   )

for index in range(len(sentence.split(' '))):
  print( sentence.split(' ')[index] + '\t' + predicted_labels[index] )

लगातार	O
हमलावर	O
हो	O
रहे	O
शिवपाल	B-PER
और	O
राजभर	B-PER
को	O
सपा	B-ORG
की	O
दो	O
टूक,	O
चिट्ठी	O
जारी	O
कर	O
कहा-	O
जहां	O
जाना	O
चाहें	O
जा	O
सकते	O
हैं	O


## Naampadam Dataset for hindi Language
The Naampadam Dataset is a large dataset for Named Entity Recognition in 11 Indian languages. Naampadam means "named entity" in Sanskrit.

Here I have used Hindi Dataset

In [5]:
# Let's download the Naampadam (Indic NER) dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang='hi'

raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

Downloading builder script:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading and preparing dataset naamapadam_pr/hi to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/hi/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20...


Downloading data:   0%|          | 0.00/82.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset naamapadam_pr downloaded and prepared to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/hi/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# let's now print how the Dataset looks like
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 985787
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 867
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13460
    })
})

In [7]:
raw_datasets.column_names

{'train': ['tokens', 'ner_tags'],
 'test': ['tokens', 'ner_tags'],
 'validation': ['tokens', 'ner_tags']}

In [8]:
# let's print an instance of dataset
idx=985786 # last statement in training set.
rec=raw_datasets['train'][idx]
for w, t in zip(rec['tokens'],rec['ner_tags']): # zip() function iterates over the given two lists simultaneously
  print('{}\t{}'.format(w,t))


उनके	0
27	0
साल	0
के	0
बेटे	0
जीवा	1
को	0
भी	0
दिल	0
की	0
बीमारी	0
थी	0
।	0


In [9]:
column_names = raw_datasets["train"].column_names
print(column_names) # names of the columns in dataset

features = raw_datasets["train"].features
print(features)

['tokens', 'ner_tags']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [10]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [11]:
# this shows the ClassLabel objects mapping to numbers

label_list = features[label_column_name].feature.names # gets the lables list

label_to_id = {label_list[i]: features[label_column_name].feature.str2int( label_list[i] ) for i in range(len(label_list))} # mapping of lables to id.

print(label_to_id)

num_labels = len(label_list)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


## Pre-Processing before training
### Tokenize the dataset and align tokens with their corresponding NER tags

In [15]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples, tokenizer): # added tokenizer as an argument
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        # print('=====')
        # print('{} {}'.format(i,label)) #ak
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
# Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

taking 20000 instances of training data

In [17]:
rtrain_dataset = raw_datasets['train'].select(range(20000))

In [18]:
!pip install -U accelerate # for hardware optimization
!pip install transformers[torch] # ensuring all the dependencies for running PyTorch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Fine Tunning indicBERT for NER task using Hindi Nampadam dataset
We have already seen how to get predictions from fine-tuned NER model. We will now use the pre-trained IndicBERT model and fine-tune it for NER task.

Let us download a pre-trained model and fine-tune it for the task of NER. We will have to use the AutoModelForTokenClassification class to fine-tune the model

### Load Pre-trained Model

In [19]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels, finetuning_task='ner')
indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indic_model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )

2024-03-13 09:23:26.118590: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 09:23:26.118689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 09:23:26.239924: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# aligning test dataset in token:ner_tag pair
indic_train_dataset = rtrain_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
    fn_kwargs={"tokenizer": indic_tokenizer}
) 

     

Running tokenizer on train dataset #0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

Running tokenizer on train dataset #1:   0%|          | 0/5 [00:00<?, ?ba/s]

 

Running tokenizer on train dataset #2:   0%|          | 0/5 [00:00<?, ?ba/s]

 

Running tokenizer on train dataset #3:   0%|          | 0/5 [00:00<?, ?ba/s]

In [21]:
# aligning validation dataset in token:ner_tag pair
eval_dataset = raw_datasets["validation"]
indic_eval_dataset = eval_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
    fn_kwargs={"tokenizer": indic_tokenizer}
)

     

Running tokenizer on Validation dataset #0:   0%|          | 0/4 [00:00<?, ?ba/s]

 

Running tokenizer on Validation dataset #1:   0%|          | 0/4 [00:00<?, ?ba/s]

 

Running tokenizer on Validation dataset #2:   0%|          | 0/4 [00:00<?, ?ba/s]

 

Running tokenizer on Validation dataset #3:   0%|          | 0/4 [00:00<?, ?ba/s]

In [22]:
# setting training arguments
batch_size=8
args=TrainingArguments(
    output_dir='output_dir',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    evaluation_strategy = "epoch",
    learning_rate=2e-6)

### Training

In [23]:
indic_data_collator = DataCollatorForTokenClassification(indic_tokenizer) # will use in training

In [24]:
# Initialize our Trainer

indic_trainer = Trainer(
    model = indic_model,
    train_dataset=indic_train_dataset,
    eval_dataset=indic_eval_dataset,
    tokenizer=indic_tokenizer,
    data_collator=indic_data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[early_stopping_callback],
    args=args,
)

In [25]:
indic_trainer.args # just to see training arguments

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_

In [26]:
# training the model
indic_train_result = indic_trainer.train()
metrics = indic_train_result.metrics

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.5049,0.495392,0.505526,0.277685,0.358466,10213,0.247794,0.068874,0.107788,9786,0.48606,0.326646,0.390719,10568,0.45114,0.227762,0.302702,0.858342
2,0.4265,0.434264,0.526862,0.417703,0.465975,10213,0.316867,0.248416,0.278497,9786,0.583883,0.443603,0.504167,10568,0.478402,0.37246,0.418836,0.874081
3,0.4018,0.420833,0.514901,0.456771,0.484097,10213,0.355334,0.255263,0.297098,9786,0.576708,0.478425,0.522989,10568,0.491572,0.399745,0.440928,0.877084


In [27]:
from transformers import AutoModelForSequenceClassification

# Assuming indic_trainer is your Trainer object and you've trained the model
# indic_train_result = indic_trainer.train()
trained_model = indic_trainer.model

# Save the trained model
trained_model.save_pretrained("/kaggle/working/indic_trainer")


In [28]:
!zip -r BertTuned.zip /kaggle/working/indic_trainer

  adding: kaggle/working/indic_trainer/ (stored 0%)
  adding: kaggle/working/indic_trainer/model.safetensors (deflated 7%)
  adding: kaggle/working/indic_trainer/config.json (deflated 56%)


In [29]:
metrics = indic_trainer.evaluate()
indic_trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.4841
  eval_LOC_number         =      10213
  eval_LOC_precision      =     0.5149
  eval_LOC_recall         =     0.4568
  eval_ORG_f1             =     0.2971
  eval_ORG_number         =       9786
  eval_ORG_precision      =     0.3553
  eval_ORG_recall         =     0.2553
  eval_PER_f1             =      0.523
  eval_PER_number         =      10568
  eval_PER_precision      =     0.5767
  eval_PER_recall         =     0.4784
  eval_loss               =     0.4208
  eval_overall_accuracy   =     0.8771
  eval_overall_f1         =     0.4409
  eval_overall_precision  =     0.4916
  eval_overall_recall     =     0.3997
  eval_runtime            = 0:04:32.39
  eval_samples_per_second =     49.414
  eval_steps_per_second   =      6.179


### Evaluate the Trained Model¶
Let us now evaluate the trained model on the test sets of all languages

We need to first tokenize the test sets

In [27]:
indic_tokenized_test_set = raw_datasets['test'].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=6,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
    fn_kwargs={"tokenizer": indic_tokenizer}
)

       

Running tokenizer on test dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on test dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on test dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #5:   0%|          | 0/1 [00:00<?, ?ba/s]

Run prediction on test set of each of the language separately and extract overall Precison, Recall and F-Score separately

In [28]:
indic_Evaluation_metric = {}
predictions, labels, metrics = indic_trainer.predict(indic_tokenized_test_set)
for key in metrics:
    if 'overall_precision' in key:
      indic_Evaluation_metric['Precision'] = metrics[key]
    elif 'overall_recall' in key:
      indic_Evaluation_metric['Recall'] = metrics[key]
    elif 'overall_f1' in key:
      indic_Evaluation_metric['F1'] = metrics[key]
indic_Evaluation_metric

{'Precision': 0.6545648691938067,
 'Recall': 0.6355624675997926,
 'F1': 0.6449237243556023}

### Pre_processing for Q4

In [12]:
with open('/kaggle/input/for-q4-indic-bert/q1 part1.txt') as f:
    part1 = f.read()
with open('/kaggle/input/for-q4-indic-bert/q1 part 2.txt') as f:
    part2 = f.read()
with open('/kaggle/input/for-q4-indic-bert/q1 part 3.txt') as f:
    part3 = f.read()
sentence = [part1,part2,part3]

In [13]:
sentence

['इस बार कांग्रेस ने जो घोषणा पत्र जारी किया है उसमें आपके 6000 के सामने कांग्रेस के 72000 भारी पड़ेंगा? मोदी सरकार के पहले कार्यकाल में भी तीन तलाक को लेकर बिल लाया गया था, हालांकि तब यह राज्यसभा में पास नहीं हो पाया था. चुल्हे की संरचना सामान्यतया ब्युटेन पर चलने वाले चुल्हे के समान ही होती है परंतु इनके (बायोगैस चुल्हे) बर्नर में वायु छिद्र का आकार बड़ा होता है। उसने तुरंत बैंक में जाकर पता किया तो उसके खाते से किसी ने एटीएम के माध्यम से तीस हजार रुपये निकाल लिए थे। इससे पहले भी कई अफेयर हुए हैं जिनमें कुछ तो सफल रिलेशनशिप में तब्दील हो गए वहीं कुछ अफेयर सिर्फ अफसाना बनकर रह गए।  संविधान का ८० प्रतिशत कार्य पुरा हो चुका है । उनके पिता का कोई पता नहीं चल पाया. साथ ही शादी का सामान यानी गहनों और कपड़ों का भी कुछ पता नहीं चल पाया है. नीति आयोग के CEO अमिताभ कांत ने कहा कि देश में पिछले तीन साल में प्रति ग्राहक मोबाइल डाटा कंज्मप्शन में 142 फीसदी की ग्रोथ दर्ज की गई है।',
 'इनेलो हर वर्ग के लोगों की सुरक्षा करने में सक्षम : सतबीर कादियान मीडिया रिपोर्टों के मुताबिक एक चर्च के बाहर सुरक्ष

In [15]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

config = AutoConfig.from_pretrained('/kaggle/input/indic-bert-fine-tuned/kaggle/working/indic_trainer', num_labels=num_labels, finetuning_task='ner')
indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indic_model = AutoModelForTokenClassification.from_pretrained('/kaggle/input/indic-bert-fine-tuned/kaggle/working/indic_trainer')

2024-03-13 14:10:23.657666: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 14:10:23.657820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 14:10:23.880912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [16]:
predicted_labels = []
for part in sentence:
    labels = get_predictions(sentence=part,
                                   tokenizer=indic_tokenizer,
                                   model=indic_model
                                   )
    for tag in labels:
        predicted_labels.append(tag)

In [18]:
label_mapping = {'0':'O', '1':'B-PER', '2':'I-PER', '3':'B-ORG', '4':'I-ORG', '5':'B-LOC', '6':'I-LOC'}

In [19]:
pred_labels = []
for tag in predicted_labels:
    pred_labels.append(label_mapping[tag[-1]])

In [22]:
import pickle

# File path where you want to save the object
file_path = '/kaggle/working/indic_tags.pkl'

# Open the file in binary write mode
with open(file_path, 'wb') as file:
    # Serialize the object and write it to the file
    pickle.dump(pred_labels, file)

print("Object saved to:", file_path)

Object saved to: /kaggle/working/indic_tags.pkl
