In [None]:
# Designed by
# Dr. Pruthwik Mishra, Assistant Professor, SVNIT, Surat
# Jadav Nitaben Sampatsinh, PhD Scholar, SVNIT, Surat
# Outline of this notebook
"""
1. Tokenizers
2. Introduction of different types of models
   2.1 Sequence Classification
   2.2 Token Classification
   2.3 Sequence Generation
   2.4 LLMs
   2.5 Encoder-only, Decoder-only, Encoder-Decoder Models
3. Different NLP Usecases
4. Prompting Strategies in LLMs
   4.1 Zero-Shot
   4.2 Few-Shot
   4.3 Chain-of-Thought
"""

'\n1. Tokenizers\n2. Introduction of different types of models\n   2.1 Sequence Classification\n   2.2 Token Classification\n   2.3 Sequence Generation\n   2.4 LLMs\n   2.5 Encoder-only, Decoder-only, Encoder-Decoder Models\n3. Different NLP Usecases\n4. Prompting Strategies in LLMs\n   4.1 Zero-Shot\n   4.2 Few-Shot\n   4.3 Chain-of-Thought\n'

# 1. Tokenizers


In [None]:
# Tokenization with the Hugging Face transformers library:
# load the tokenizer that ships with the "bert-base-cased" checkpoint and
# split a sentence into its tokens.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenize() returns the list of token strings (no IDs, no special tokens added here)
tokens=tokenizer.tokenize("Hello,How are you")
print(tokens)

#Task: tokenize the sentence: "I've been waiting for a HuggingFace course my whole life."
#Can you tell the difference between Python's str.split() and this tokenizer?

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Hello', ',', 'How', 'are', 'you']


In [None]:
#converting tokens to their vocabulary IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

#converting IDs back to text (decode() returns a single string, not a token list)
decoding=tokenizer.decode(ids)
print(decoding)

[8667, 117, 1731, 1132, 1128]
Hello, How are you


In [None]:
#Handling multiple sentences
# Calling the tokenizer on a list encodes every sentence in one go; the result
# holds 'input_ids', 'token_type_ids' and 'attention_mask', one list per sentence.
# Each sequence is wrapped in special tokens (ids 101 and 102 in the recorded output).
statements=["How are you?","I'm fine, thank you!"]
encoded_input = tokenizer(statements)
print(encoded_input)


{'input_ids': [[101, 1731, 1132, 1128, 136, 102], [101, 146, 112, 182, 2503, 117, 6243, 1128, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
#Encode the sentences as PyTorch tensors (return_tensors="pt")
#padding / truncation bring sentences of different lengths to one common length
#max_length caps the sequence length, and therefore the width of the tensors


encoded_input = tokenizer(statements,
    padding=True,
    truncation=True,
    max_length=10,
    return_tensors="pt",
)
print(encoded_input)

#exercise 2: Observe how the attention mask changes for tensors of different lengths and state your conclusion
#try tokenizing text in another language and see the result

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
#Retrain pretrained tokenizers on a customized corpus and compare the results
from transformers import AutoTokenizer


corpus = [
    "भाषा की बाधाओं को तोड़ें और आसानी से व्यापक दर्शकों तक पहुँचें.",
    "अपनी वेबसाइट का 22 से अधिक भारतीय भाषाओं में आसानी से अनुवाद करें|",
    "डिजिटल इंडिया निगम के तहत स्वतंत्र प्रभाग"
]
# Sentence used to compare every tokenizer below
sample_text = "भारत में भाषा की बाधा को तोड़ना"


def compare_pretrained_and_retrained(checkpoint, training_corpus, vocab_size, text):
    """Tokenize `text` with a pretrained tokenizer and with a copy retrained on a corpus.

    Prints, in order: the pretrained tokens, their count, the pretrained
    vocabulary size, the retrained tokens, their count and their IDs.

    Args:
        checkpoint: Hub id of the pretrained tokenizer to start from.
        training_corpus: iterable of sentences used to learn the new vocabulary.
        vocab_size: target vocabulary size passed to train_new_from_iterator.
        text: sentence to tokenize with both tokenizers.

    Returns:
        (pretrained_tokenizer, retrained_tokenizer, pretrained_tokens, retrained_tokens)
    """
    pretrained = AutoTokenizer.from_pretrained(checkpoint)
    pretrained_tokens = pretrained.tokenize(text)
    print(pretrained_tokens)
    print(len(pretrained_tokens))
    print(pretrained.vocab_size)

    # Same tokenizer type and settings, but the vocabulary is learned from training_corpus
    retrained = pretrained.train_new_from_iterator(training_corpus, vocab_size)
    retrained_tokens = retrained.tokenize(text)
    print(retrained_tokens)
    print(len(retrained_tokens))
    print(retrained.convert_tokens_to_ids(retrained_tokens))
    return pretrained, retrained, pretrained_tokens, retrained_tokens


# Multilingual DistilBERT tokenizer: a general multilingual checkpoint that also covers Hindi
hindi_tokenizer, tokenizer, tokens, tokens2 = compare_pretrained_and_retrained(
    "distilbert-base-multilingual-cased", corpus, 55000, sample_text
)
# MuRIL tokenizer: custom made for Indian languages, handles them better
# (the recorded output shows 8 tokens here versus 11 for the multilingual checkpoint)
hindi_tokenizer, tokenizer, tokens, tokens2 = compare_pretrained_and_retrained(
    "google/muril-base-cased", corpus, 100, sample_text
)
#exercise 3: Try different vocab sizes, a larger corpus, and test with different words

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

['भारत', 'में', 'भाषा', 'की', 'ब', '##ा', '##धा', 'को', 'तो', '##ड़', '##ना']
11
119547
['भारत', 'में', 'भाषा', 'की', 'बा', '##ध', '##ा', 'को', 'तोड़', '##न', '##ा']
11
[140, 112, 142, 100, 111, 68, 49, 102, 150, 52, 49]


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

['भारत', 'में', 'भाषा', 'की', 'बाधा', 'को', 'तोड़', '##ना']
8
197285
['भा', '##र', '##त', 'म', '##ें', 'भाष', '##ा', 'क', '##ी', 'ब', '##ा', '##ध', '##ा', 'को', 'त', '##ो', '##ड', '##़', '##न', '##ा']
20
[81, 45, 63, 28, 80, 89, 54, 15, 64, 26, 54, 71, 54, 99, 21, 49, 51, 69, 58, 54]


In [None]:
from transformers import BertTokenizer, BertForNextSentencePrediction

In [None]:
# One (first sentence, second sentence, integer) example for next-sentence prediction.
# NOTE(review): the meaning of the trailing 1 is not shown here, and `sentences`
# is not read by any later cell in this part of the notebook — confirm it is still needed.
sentences = [("The quick brown fox jumps over the lazy dog.", "The dog was not amused.", 1)]

In [None]:
# Load the uncased BERT tokenizer together with the next-sentence-prediction model.
# NOTE: this rebinds the notebook-wide names `tokenizer` and `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Encode the 'sentence1' / 'sentence2' entries of `examples` as one sentence pair.

    Uses the notebook-level `tokenizer` with special tokens enabled and returns
    whatever that tokenizer call returns.
    """
    first = examples['sentence1']
    second = examples['sentence2']
    encoded_pair = tokenizer(first, second, add_special_tokens=True)
    return encoded_pair

In [None]:
# A single sentence pair, keyed the way tokenize_function expects ('sentence1' / 'sentence2')
examples = {'sentence1': 'The quick brown fox jumps over the lazy dog.', 'sentence2': 'The dog was not amused.'}

In [None]:
# Encode the pair; in the recorded output token_type_ids is 0 for the first
# sentence and 1 for the second.
tokenize_function(examples)

{'input_ids': [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012, 102, 1996, 3899, 2001, 2025, 11770, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
from torch.nn.functional import softmax

In [None]:
# sentence_a is the context; sentence_b is a plausible continuation of it,
# sentence_c is deliberately off-topic.
sentence_a = "The quick brown fox jumps over the lazy dog."
sentence_b = "The dog was sleeping soundly."
sentence_c = "I like to eat pizza." # A random, unrelated sentence

In [None]:
import torch

In [None]:
# tokenize
# The tokenizer is called directly (as in the earlier cells) instead of through
# the deprecated encode_plus(); the returned encoding has the same fields.

# Encoding for related sentences (sentence_b follows sentence_a)
encoded_related = tokenizer(
    sentence_a,
    text_pair=sentence_b,
    return_tensors='pt'
)

# Encoding for unrelated sentences (sentence_c does not follow sentence_a)
encoded_unrelated = tokenizer(
    sentence_a,
    text_pair=sentence_c,
    return_tensors='pt'
)

In [None]:
# Score both sentence pairs with the NSP model. This is inference only, so a
# single no-grad scope covers both forward passes.
with torch.no_grad():
    outputs_related = model(**encoded_related)      # sentence_a + sentence_b
    outputs_unrelated = model(**encoded_unrelated)  # sentence_a + sentence_c

# Raw (unnormalised) class scores for each pair
logits_related = outputs_related.logits
logits_unrelated = outputs_unrelated.logits

In [None]:
# Normalise the logits into class probabilities, then take the most likely class
# for each pair (class 0 = IsNext, class 1 = NotNext, as printed below).
probs_related = softmax(logits_related, dim=1)
probs_unrelated = softmax(logits_unrelated, dim=1)
predicted_class_related = probs_related.argmax(dim=1).item()
predicted_class_unrelated = probs_unrelated.argmax(dim=1).item()

# Report for the related pair
print(f"Sentence A: \"{sentence_a}\"")
print(f"Sentence B: \"{sentence_b}\"")
print(f"Probabilities (Related): {probs_related}")
print(f"Prediction (0=IsNext, 1=NotNext): {predicted_class_related}")

print("-" * 30)

# Report for the unrelated pair
print(f"Sentence A: \"{sentence_a}\"")
print(f"Sentence C: \"{sentence_c}\"")
print(f"Probabilities (Unrelated): {probs_unrelated}")
print(f"Prediction (0=IsNext, 1=NotNext): {predicted_class_unrelated}")

Sentence A: "The quick brown fox jumps over the lazy dog."
Sentence B: "The dog was sleeping soundly."
Probabilities (Related): tensor([[0.9931, 0.0069]])
Prediction (0=IsNext, 1=NotNext): 0
------------------------------
Sentence A: "The quick brown fox jumps over the lazy dog."
Sentence C: "I like to eat pizza."
Probabilities (Unrelated): tensor([[0.0034, 0.9966]])
Prediction (0=IsNext, 1=NotNext): 1


In [None]:
#save a tokenizer to the local "hindi-tokenizer" directory
# NOTE(review): `hindi_tokenizer` is the pretrained google/muril-base-cased tokenizer
# from the retraining cell; the tokenizers produced by train_new_from_iterator were
# bound to `tokenizer` instead — confirm this is the one meant to be saved.
hindi_tokenizer.save_pretrained("hindi-tokenizer")

('hindi-tokenizer/tokenizer_config.json',
 'hindi-tokenizer/special_tokens_map.json',
 'hindi-tokenizer/vocab.txt',
 'hindi-tokenizer/added_tokens.json',
 'hindi-tokenizer/tokenizer.json')

In [None]:
#load the tokenizer back from the local "hindi-tokenizer" directory and try it out
# (rebinds the notebook-wide `tokenizer` name)
tokenizer = AutoTokenizer.from_pretrained("hindi-tokenizer")
print(tokenizer.tokenize("भाषा की बाधा"))


['भाषा', 'की', 'बाधा']


In [None]:
#Tokenizers library for fast tokenization
#training a tokenizer on the wikitext dataset (needs an up-to-date `datasets` package)
!pip install -U datasets

Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Using cached datasets-4.0.0-py3-none-any.whl (494 kB)
Using cached fsspec-2025.3.0-py3-none-any.whl (193 kB)
Installing collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_mac

In [None]:
from datasets import load_dataset
# Download the raw WikiText-2 configuration; the recorded output shows
# train, validation and test splits being generated.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Training lines from the 'text' column, keeping only those that contain
# something other than whitespace.
train_texts = [
    line
    for line in dataset['train']['text']
    if line.strip() != ''
]

# Wrap text into iterator (required by trainer)
def batch_iterator(batch_size=1000):
    """Yield the notebook-level `train_texts` list in consecutive slices.

    Args:
        batch_size: number of lines per slice; the last slice may be shorter.
            Defaults to 1000, the size this cell originally hard-coded.

    Yields:
        Lists of at most `batch_size` lines, in their original order.
    """
    for start in range(0, len(train_texts), batch_size):
        yield train_texts[start:start + batch_size]


# Build an untrained BPE tokenizer; "[UNK]" stands in for anything it cannot encode.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Register the special tokens with the trainer so they are part of the vocabulary.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# The pre-tokenizer must be set before training starts.
tokenizer.pre_tokenizer = Whitespace()
# Learn the vocabulary from the batches of training lines.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

In [None]:
#Save the trained tokenizer to a single JSON file in the working directory
tokenizer.save("wikitext-bpe-tokenizer.json")

In [None]:
#reload the same tokenizer from the JSON file written above and test it
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("wikitext-bpe-tokenizer.json")
# encode() returns an object exposing both the token strings and their IDs
output = tokenizer.encode("I've been waiting for a HuggingFace course my whole life.")
print(output.tokens)      # List of subword tokens
print(output.ids)         # Corresponding token IDs


['I', "'", 've', 'been', 'waiting', 'for', 'a', 'Hu', 'gging', 'Face', 'course', 'my', 'whole', 'life', '.']
[45, 11, 1058, 1255, 10019, 1056, 69, 3520, 28105, 9293, 3937, 1474, 4099, 1788, 18]


# 2. Introduction of different types of models

  2.1 Sequence Classification

In [None]:
#classify multiple statements as positive or negative with confidence (encoder only )

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Pad/truncate so both sentences fit in one tensor batch
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# Inference only: skip autograd bookkeeping, as the NSP cell earlier in the notebook does
with torch.no_grad():
    output = model(**tokens)
output

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9138]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Class names in logit order: index 0 is printed as "negative", index 1 as "positive"
labels = ["negative", "positive"]

# Probabilities per sentence, and the index of the most likely class for each
probs = F.softmax(output.logits, dim=-1)
predicted_class = probs.argmax(dim=1).tolist()

for idx, sentence_probs in enumerate(probs):
    pred = predicted_class[idx]
    confidence = sentence_probs[pred].item()
    print(f"Predicted: {labels[pred]} with confidence {confidence:.4f}")

Predicted: positive with confidence 0.9598
Predicted: positive with confidence 0.9995


In [None]:
#Exercise 3: classify movie reviews as good and bad

2.2 Token Classification

In [None]:
import numpy as np
#POS tagging example
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load pre-trained POS tagging model on Universal POS Tags
model_name = "vblagoje/bert-english-uncased-finetuned-pos"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Alternative kept for reference: let the pipeline group sub-tokens itself
# nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Without an aggregation strategy the pipeline returns one prediction per sub-token
# (e.g. 'po' and '##s' appear separately in the recorded output).
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer)

text = "This is a sample sentence for POS tagging."
# Each result is a dict with 'entity', 'score', 'index', 'word', 'start' and 'end'
results = nlp(text)
print(results)
def merge_subword_predictions(token_predictions):
    """Merge WordPiece continuation pieces ('##...') back into whole-word predictions.

    A word receives the tag of its first piece and the mean score of all of its
    pieces, however many continuation pieces follow and wherever the word sits
    in the sentence (including at the very end).

    Args:
        token_predictions: dicts with 'word', 'entity' and 'score' keys, one per
            sub-token, in sentence order.

    Returns:
        List of (word, tag, score) tuples, one per whole word.
    """
    merged = []  # one [word, tag, scores] entry per whole word seen so far
    for prediction in token_predictions:
        piece = prediction['word']
        if piece.startswith('##') and merged:
            # Continuation piece: extend the word that is currently being built
            merged[-1][0] += piece[2:]
            merged[-1][2].append(prediction['score'])
        else:
            # Start of a new word (a leading '##' piece with nothing before it is kept as-is)
            merged.append([piece, prediction['entity'], [prediction['score']]])
    return [(word, tag, float(np.mean(scores))) for word, tag, scores in merged]


final_list = merge_subword_predictions(results)
for word, tag, score in final_list:
  print(f"{word} → {tag} → {score:.2f}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity': 'PRON', 'score': np.float32(0.99934775), 'index': 1, 'word': 'this', 'start': 0, 'end': 4}, {'entity': 'AUX', 'score': np.float32(0.9979938), 'index': 2, 'word': 'is', 'start': 5, 'end': 7}, {'entity': 'DET', 'score': np.float32(0.99942315), 'index': 3, 'word': 'a', 'start': 8, 'end': 9}, {'entity': 'NOUN', 'score': np.float32(0.9961266), 'index': 4, 'word': 'sample', 'start': 10, 'end': 16}, {'entity': 'NOUN', 'score': np.float32(0.99879634), 'index': 5, 'word': 'sentence', 'start': 17, 'end': 25}, {'entity': 'ADP', 'score': np.float32(0.99871266), 'index': 6, 'word': 'for', 'start': 26, 'end': 29}, {'entity': 'NOUN', 'score': np.float32(0.9903418), 'index': 7, 'word': 'po', 'start': 30, 'end': 32}, {'entity': 'NOUN', 'score': np.float32(0.8936688), 'index': 8, 'word': '##s', 'start': 32, 'end': 33}, {'entity': 'NOUN', 'score': np.float32(0.982991), 'index': 9, 'word': 'tag', 'start': 34, 'end': 37}, {'entity': 'NOUN', 'score': np.float32(0.96724784), 'index': 10, 'word': 

In [None]:
#exercise 4: try with ambiguous word statements like "I need to book a flight"

  2.3 Sequence Generation

In [None]:
#decoder only

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Input prompt
prompt = "Hugging face will"

# The encoding carries both input_ids and attention_mask
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
# Passing the whole encoding supplies the attention mask, and pad_token_id is set
# explicitly to the EOS token (the value generate() otherwise falls back to with a warning).
outputs = model.generate(
    **inputs,
    max_length=50,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Hugging face will be a big deal for the team.

"I think it's a good thing for the team," said coach Mike Babcock. "I think it's a good thing for the fans. I think it's a good


In [None]:
#exercise 4: Try generating texts with different prompts

In [None]:
#or you can just use a pipeline for masked language modelling (MLM)
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-cased")
result = fill_mask("The [MASK] is round.")
print(result) # a list of dicts, one per candidate: 'score', 'token', 'token_str', 'sequence'
for r in result:
    print(f"{r['sequence']} (score: {r['score']:.4f})")
# Same sentence template with two different names, to compare the suggested completions
result_1 = fill_mask("Radha is a [MASK].")
print(result_1)
result_2 = fill_mask("Mohan is a [MASK].")
print(result_2)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.10309870541095734, 'token': 11564, 'token_str': 'wingspan', 'sequence': 'The wingspan is round.'}, {'score': 0.049367811530828476, 'token': 15764, 'token_str': 'apex', 'sequence': 'The apex is round.'}, {'score': 0.038944218307733536, 'token': 22769, 'token_str': 'aperture', 'sequence': 'The aperture is round.'}, {'score': 0.03769390657544136, 'token': 8895, 'token_str': 'orbit', 'sequence': 'The orbit is round.'}, {'score': 0.031573500484228134, 'token': 1946, 'token_str': 'seat', 'sequence': 'The seat is round.'}]
The wingspan is round. (score: 0.1031)
The apex is round. (score: 0.0494)
The aperture is round. (score: 0.0389)
The orbit is round. (score: 0.0377)
The seat is round. (score: 0.0316)
[{'score': 0.05080442130565643, 'token': 9227, 'token_str': 'dancer', 'sequence': 'Radha is a dancer.'}, {'score': 0.03955294191837311, 'token': 2483, 'token_str': 'singer', 'sequence': 'Radha is a singer.'}, {'score': 0.036932144314050674, 'token': 3218, 'token_str': 'teacher', '

In [None]:
#exercise 5: explore pipeline for various tasks.
 #Use it to extract the answer to the question "Who is the president of India?" from the context: "Smt. Droupadi Murmu is a very kind president of India"
 #Use it for filling the blanks of sentence: "Radha is a ______." and "Mohan is a ______."

2.4 LLMs

In [None]:
#Text summarization with an open-source LLM using pipeline
#(decoder-only: the model is loaded through the causal "text-generation" pipeline,
# so the summary is produced as a continuation of the prompt)

from transformers import pipeline

# Load any open-source LLM with text-generation
summarizer = pipeline("text-generation", model="tiiuae/falcon-7b-instruct") #can use other LLM like mistral

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# Instruction followed by the text to summarise; the model continues this prompt.
prompt = """
Summarize the following text in 1 sentence:

Artificial Intelligence (AI) is transforming the way businesses operate by automating tasks, enabling smarter decision-making, and creating new markets.
As AI technologies evolve, industries such as healthcare, transportation, and finance are seeing dramatic changes.
Ethical considerations and responsible AI deployment remain critical.
"""

# Cap the number of NEWLY generated tokens. max_length would also count the prompt
# and was overridden by the pipeline's own max_new_tokens setting (see the warning
# this cell used to emit), so it never limited anything.
output = summarizer(prompt, max_new_tokens=100, temperature=0.7)

# generated_text contains the prompt followed by the model's continuation
print(output[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Summarize the following text in 1 sentence:

Artificial Intelligence (AI) is transforming the way businesses operate by automating tasks, enabling smarter decision-making, and creating new markets.
As AI technologies evolve, industries such as healthcare, transportation, and finance are seeing dramatic changes. 
Ethical considerations and responsible AI deployment remain critical.
Incorporating AI into everyday business operations is revolutionizing industries, while ethical considerations and responsible deployment remain vital.


# 3. Different NLP Use Cases

In [None]:
#TTS: install the packages used by the text-to-speech and speech-recognition cells below

!pip install transformers datasets torchaudio soundfile
# scipy provides the WAV writer used to save the generated audio
!pip install transformers scipy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
#TTS: synthesise speech from text with Bark and save it as a WAV file
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile as wavfile
import torch

# Load Bark processor and model
# NOTE: this rebinds the notebook-wide `model` name.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

# Input text
text_prompt = "Hello! Welcome to the Hugging face tutorial."

# Prepare inputs
inputs = processor(text_prompt, return_tensors="pt")

# Generate audio (TTS)
audio_array = model.generate(**inputs)
# Move to CPU, convert to NumPy and keep the first (only) item of the batch
audio_array = audio_array.cpu().numpy()[0]

# Save to file, using the sample rate stored in the model's generation config
wavfile.write("bark_tts.wav", rate=model.generation_config.sample_rate, data=audio_array)




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [None]:
#play the audio file written by the TTS cell
from IPython.display import Audio
# Last expression of the cell, so the notebook renders it as an audio player
Audio("bark_tts.wav")


In [None]:
# ASR : transcribe the audio file generated in the TTS task above
from transformers import pipeline

# Load Whisper ASR model
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Same relative path the TTS cell wrote to and the audio player reads from, so the
# cell also works outside Colab's /content directory.
# If you upload an audio file (File > Upload in Colab), use its path instead:
result = asr("bark_tts.wav")

print("Transcription:", result['text'])


Device set to use cuda:0


Transcription:  Hello! Welcome to the Hugging Face tutorial.


# 4. Prompting Strategies in LLMs

 Zero-Shot Prompting

In [None]:
#load an open-source LLM as a text-generation pipeline
# NOTE: `model` now names a pipeline (callable on a prompt string), not a bare model object.
from transformers import pipeline
model = pipeline("text-generation", model="tiiuae/falcon-7b-instruct") #alternatives: Llama 3.1, Llama 3.2 (presumably the 3B variant)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Zero-shot: the task is stated directly, with no worked examples in the prompt.
prompt = "Write a Python function to calculate the factorial of a number."
# The pipeline returns a list with one dict whose 'generated_text' holds prompt + continuation
response = model(prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


[{'generated_text': 'Write a Python function to calculate the factorial of a number.\ndef factorial(n):\n    if n == 0:\n        return 1\n    else:\n        return n * factorial(n -1)'}]


Few-Shot Prompting

In [None]:
# Few-shot: two solved Input/Output examples precede the real task, and the prompt
# ends on an open "Output:" so the model completes it in the same format.
prompt ="""Task: Write Python functions.

Example 1:
Input: Write a function to add two numbers.
Output:
def add(a, b):
    return a + b

Example 2:
Input: Write a function to compute the square of a number.
Output:
def square(n):
    return n * n

Input: Write a function to check if a number is even.
Output:
"""
# NOTE(review): in the recorded output the model answers the task and then keeps
# inventing further Input/Output pairs; consider capping the generation length
# (e.g. with max_new_tokens) if only the first answer is wanted.
response = model(prompt)
print(response)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


[{'generated_text': 'Task: Write Python functions.\n\nExample 1:\nInput: Write a function to add two numbers.\nOutput:\ndef add(a, b):\n    return a + b\n\nExample 2:\nInput: Write a function to compute the square of a number.\nOutput:\ndef square(n):\n    return n * n\n\nInput: Write a function to check if a number is even.\nOutput:\ndef is_even(n):\n    if n % 2 == 0:\n        return True\n    else:\n        return False\n\nInput: Write a function to print out the first n characters of a string.\nOutput:\ndef print_substring(s, n):\n    return s[:n]\n\nInput: Write a function to reverse a string.\nOutput:\ndef reverse_string(s):\n    return s[::-1]\n\nInput: Write a function to print out a message.\nOutput:\ndef print_msg(msg):\n    print(msg)\n\nInput: Write a function to convert a string to a number.\nOutput:\ndef convert_string_to_num(s):\n    return int(s)\n\nInput: Write a function to find the sum of two numbers.\nOutput:\ndef sum(a, b):\n    return a + b\n\nInput: Write a funct

Chain-of-Thought (CoT)

In [None]:
# Chain-of-thought: the prompt asks for a step-by-step explanation rather than a bare answer.
prompt ="""
Explain step by step what this Python code does:

def is_prime(n):
    if n <= 1:
        return False
    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False
    return True
"""
# NOTE(review): the recorded explanation is partly wrong (it talks about integers
# "between n and n*n", while the loop in the prompt tries divisors from 2 up to
# int(n**0.5)), so step-by-step answers still need to be checked.
response = model(prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


[{'generated_text': "\nExplain step by step what this Python code does:\n\ndef is_prime(n):\n    if n <= 1:\n        return False\n    for i in range(2, int(n**0.5) + 1):\n        if n % i == 0:\n            return False\n    return True\n\n1. Define a function called 'is_prime' which takes an integer n as input.\n2. Use a for loop to check if n is less than or equal to 1.\n3. If n is less than or equal to 1, return False immediately.\n4. Otherwise, check if n is a factor of any integer between n and n*n.\n5. If n is a factor of any integer between n and n*n, return False.\n6. If n is not a factor of any integer between n and n*n, return True.\n7. Return the result of the function called 'is_prime' to indicate whether a given number is prime or not."}]


In [None]:
# Clone IndicTransToolkit and install it in editable mode from inside the checkout.
!git clone https://github.com/VarunGumma/IndicTransToolkit
# %cd changes the working directory (the recorded output shows /content/IndicTransToolkit)
%cd IndicTransToolkit
!pip install --editable . --use-pep517
# in case the install above fails, retry it with this extra option appended:
#  --config-settings editable_mode=compat

Cloning into 'IndicTransToolkit'...
remote: Enumerating objects: 245, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 245 (delta 74), reused 108 (delta 49), pack-reused 95 (from 1)[K
Receiving objects: 100% (245/245), 4.45 MiB | 7.80 MiB/s, done.
Resolving deltas: 100% (102/102), done.
/content/IndicTransToolkit
Obtaining file:///content/IndicTransToolkit
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sacremoses (from IndicTransToolkit==1.0.4)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu (from IndicTransToolkit==1.0.4)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/5

In [None]:
import importlib.util

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

# Translate a small batch of Hindi sentences with IndicTrans2.
# Recommended to run this on a GPU with flash_attn installed.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

src_lang = "hin_Deva"
# Use "eng_Latn" for Hindi -> English; any other target uses the Indic -> Indic model.
tgt_lang = "guj_Gujr"
model_name_en = "ai4bharat/indictrans2-indic-en-1B"
model_name_indic = "ai4bharat/indictrans2-indic-indic-1B"

# The two checkpoints are loaded identically; only the name depends on the target.
translation_model_name = model_name_en if tgt_lang == "eng_Latn" else model_name_indic

# Half precision is requested only on a GPU, and FlashAttention-2 only when the
# flash_attn package is importable, so the cell also runs on CPU-only runtimes
# and on GPUs without flash_attn (falling back to the library defaults).
model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    model_kwargs["torch_dtype"] = torch.float16  # performance might slightly vary for bfloat16
    if importlib.util.find_spec("flash_attn") is not None:
        model_kwargs["attn_implementation"] = "flash_attention_2"

tokenizer = AutoTokenizer.from_pretrained(translation_model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    translation_model_name,
    **model_kwargs,
).to(DEVICE)
ip = IndicProcessor(inference=True)

input_sentences = [
    "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
    "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
    "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
    "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
]

# Preprocess the raw sentences for the chosen source/target language pair
batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

# Tokenize the sentences and generate input encodings
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
).to(DEVICE)

# Generate translations using the model (beam search, inference only)
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs,
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

# Decode the generated tokens into text
# (generated_tokens is rebound from token ids to the decoded strings)
generated_tokens = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

# Postprocess the translations, displaying the source and target sentences
translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)

for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")
# --------------------------
# Hindi -> English Translation
# hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
# eng_Latn: When I was young, I used to go to the park every day.
# hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
# eng_Latn: We saw a new movie last week that was very inspiring.
# hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
# eng_Latn: If you'd given me a pass at that time, we'd have gone out to eat.
# hin_Deva: मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।
# eng_Latn: My friend has invited me to her birthday party, and I'll give her a present.
# ----------------------------
# Hindi -> Gujarati Translation
# hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
# guj_Gujr: જ્યારે હું નાનો હતો ત્યારે હું દરરોજ પાર્કમાં જતો હતો.
# hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
# guj_Gujr: અમે ગયા અઠવાડિયે એક નવી ફિલ્મ જોઈ જે ખૂબ જ પ્રેરણાદાયક હતી.
# hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
# guj_Gujr: જો તમને તે સમયે મને પાસ મળે, તો અમે બહાર જમવા જઈશું.
# hin_Deva: मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।
# guj_Gujr: મારા મિત્રે મને તેના જન્મદિવસની પાર્ટીમાં આમંત્રણ આપ્યું છે અને હું તેને ભેટ આપીશ.
# ----------------------------


hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
guj_Gujr: જ્યારે હું નાનો હતો ત્યારે હું દરરોજ પાર્કમાં જતો હતો.
hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
guj_Gujr: અમે ગયા અઠવાડિયે એક નવી ફિલ્મ જોઈ જે ખૂબ જ પ્રેરણાદાયક હતી.
hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
guj_Gujr: જો તમને તે સમયે મને પાસ મળે, તો અમે બહાર જમવા જઈશું.
hin_Deva: मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।
guj_Gujr: મારા મિત્રે મને તેના જન્મદિવસની પાર્ટીમાં આમંત્રણ આપ્યું છે અને હું તેને ભેટ આપીશ.


In [None]:
# Masked LM for English
import torch
from transformers import pipeline

# Use the first GPU in half precision when CUDA is available; otherwise run
# on the CPU in full precision (a hard-coded device=0 fails without a GPU).
use_gpu = torch.cuda.is_available()

pipeline_en_mask = pipeline(
    task="fill-mask",
    model="google-bert/bert-base-uncased",
    torch_dtype=torch.float16 if use_gpu else torch.float32,
    device=0 if use_gpu else -1,
)
# The last expression is displayed as the cell output: candidate fills for [MASK].
pipeline_en_mask("Plants create [MASK] through a process known as photosynthesis.")

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'score': 0.151123046875,
  'token': 2943,
  'token_str': 'energy',
  'sequence': 'plants create energy through a process known as photosynthesis.'},
 {'score': 0.1453857421875,
  'token': 4870,
  'token_str': 'flowers',
  'sequence': 'plants create flowers through a process known as photosynthesis.'},
 {'score': 0.0821533203125,
  'token': 9325,
  'token_str': 'sunlight',
  'sequence': 'plants create sunlight through a process known as photosynthesis.'},
 {'score': 0.04296875,
  'token': 18670,
  'token_str': 'algae',
  'sequence': 'plants create algae through a process known as photosynthesis.'},
 {'score': 0.037628173828125,
  'token': 12649,
  'token_str': 'atp',
  'sequence': 'plants create atp through a process known as photosynthesis.'}]

In [None]:
# Masked-token prediction for Hindi with MuRIL, done by hand (no pipeline).
# Imports are repeated here so the cell does not depend on earlier cells.
import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('google/muril-base-cased')
model = BertForMaskedLM.from_pretrained('google/muril-base-cased')
# text = "The [MASK] is round."
text = "राम ने [MASK] को मारा ."
inputs = tokenizer(text, return_tensors='pt')

# Inference only: skip gradient tracking, as the translation cell does.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Find index of [MASK] token (second element of torch.where = position in the sequence)
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Get logits for the [MASK] position
mask_token_logits = logits[0, mask_token_index, :]

# Top 5 predicted tokens (row 0 = the first [MASK] position found)
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    word = tokenizer.decode([token])
    print("Predicted word:", word)


Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted word: रावण
Predicted word: लक्ष्मण
Predicted word: सुग्रीव
Predicted word: हनुमान
Predicted word: शत्रुघ्न


In [None]:
# Masked LM for Hindi via the fill-mask pipeline (MuRIL checkpoint).
import torch
from transformers import pipeline

# Use the first GPU in half precision when CUDA is available; otherwise run
# on the CPU in full precision (a hard-coded device=0 fails without a GPU).
use_gpu = torch.cuda.is_available()

pipeline_hin_mask = pipeline(
    task="fill-mask",
    model="google/muril-base-cased",
    torch_dtype=torch.float16 if use_gpu else torch.float32,
    device=0 if use_gpu else -1,
)
outputs = pipeline_hin_mask("राम ने [MASK] को मारा .")
# Each candidate is a dict; print its token, score and the filled-in sentence.
for output in outputs:
  print('Masked token is:', output['token_str'], 'with a score of', output['score'])
  print('Sentence is:', output['sequence'])
  print('----------------')

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Masked token is: रावण with a score of 0.75390625
Sentence is: राम ने रावण को मारा.
----------------
Masked token is: लक्ष्मण with a score of 0.09588623046875
Sentence is: राम ने लक्ष्मण को मारा.
----------------
Masked token is: सुग्रीव with a score of 0.01338958740234375
Sentence is: राम ने सुग्रीव को मारा.
----------------
Masked token is: हनुमान with a score of 0.00864410400390625
Sentence is: राम ने हनुमान को मारा.
----------------
Masked token is: शत्रुघ्न with a score of 0.00824737548828125
Sentence is: राम ने शत्रुघ्न को मारा.
----------------


In [None]:
# Extractive question answering for Hindi.
from transformers import pipeline

model_name = "l3cube-pune/hindi-question-answering-squad-bert"
qa_pipeline = pipeline("question-answering", model=model_name)

context = "भारतीय क्रिकेट टीम के कप्तान रोहित शर्मा हैं।"
# Alternative phrasing to try: "भारतीय क्रिकेट टीम के कप्तान कौन हैं?"
question = "भारतीय क्रिकेट टीम के कप्तान का नाम क्या है?"

result = qa_pipeline(question=question, context=context)
# Raw result dict: score, start/end offsets into the context, and the answer text.
print(result)
# The extracted span can carry surrounding whitespace; strip it for display only.
print(f"Answer: {result['answer'].strip()}")

Device set to use cuda:0


{'score': 0.36955195665359497, 'start': 21, 'end': 34, 'answer': ' कप्तान रोहित'}
Answer:  कप्तान रोहित


In [None]:
# Hindi named-entity tagger: a token-classification pipeline that the
# following cells call on example sentences.
from transformers import pipeline

NER_CHECKPOINT = "Sankalp-Bahad/Monolingual-Hindi-NER-Model"
pipe = pipeline(task="token-classification", model=NER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Tag the sentence used in the masked-LM cells; the cell output lists one
# dict per tagged token (entity label, score, token index, word, char offsets).
ner_text = 'राम ने रावण को मारा .'
pipe(ner_text)

[{'entity': 'B-NEP',
  'score': np.float32(0.99661875),
  'index': 1,
  'word': '▁राम',
  'start': 0,
  'end': 3},
 {'entity': 'B-NEP',
  'score': np.float32(0.996167),
  'index': 3,
  'word': '▁रावण',
  'start': 7,
  'end': 11}]

In [None]:
# A longer sentence; the cell output shows which tokens the tagger labels.
ner_text = 'संस्थान को भारत सरकार द्वारा एम.टेक. और पीएच.डी. के लिए गुणवत्ता सुधार कार्यक्रम के केंद्रों में से एक के रूप में मान्यता दी गई है ।'
pipe(ner_text)

[{'entity': 'B-NEO',
  'score': np.float32(0.9924527),
  'index': 3,
  'word': '▁भारत',
  'start': 11,
  'end': 15},
 {'entity': 'I-NEO',
  'score': np.float32(0.98173517),
  'index': 4,
  'word': '▁सरकार',
  'start': 16,
  'end': 21},
 {'entity': 'B-NEN',
  'score': np.float32(0.999876),
  'index': 26,
  'word': '▁एक',
  'start': 100,
  'end': 102}]

In [None]:
# A sentence containing a date and a quoted phrase (note the escaped quotes
# and the trailing space, both kept as part of the input).
ner_text = 'भारत सरकार ने 4 दिसंबर, 2002 को इसे \' मानद विश्वविद्यालय \' का दर्जा दे दिया । '
pipe(ner_text)

[{'entity': 'B-NEO',
  'score': np.float32(0.95419556),
  'index': 1,
  'word': '▁भारत',
  'start': 0,
  'end': 4},
 {'entity': 'I-NEO',
  'score': np.float32(0.9938014),
  'index': 2,
  'word': '▁सरकार',
  'start': 5,
  'end': 10},
 {'entity': 'B-NETI',
  'score': np.float32(0.99871445),
  'index': 4,
  'word': '▁4',
  'start': 14,
  'end': 15},
 {'entity': 'I-NETI',
  'score': np.float32(0.9993636),
  'index': 5,
  'word': '▁दिसंबर',
  'start': 16,
  'end': 22},
 {'entity': 'I-NETI',
  'score': np.float32(0.9994661),
  'index': 6,
  'word': ',',
  'start': 22,
  'end': 23},
 {'entity': 'I-NETI',
  'score': np.float32(0.99950624),
  'index': 7,
  'word': '▁2002',
  'start': 24,
  'end': 28}]