In [6]:
### Word Embedding

# TF and IDF
# TF - Term Frequency
# IDF - Inverse Document Frequency

# It is a statistical measure that evaluates how important a word is to a document in a collection or corpus

In [7]:
# Term Frequency - Measure how frequently a term report in a document
# Tf(t, d) = Number of times term t appears in document d / Total number of terms in document d

# Inverse Document Frequency (IDF) - Measure how important a term is in the entire corpus (Document)
# IDF (t) = log(N / (df(t)))
# N = Total number of documents in the corpus
# df(t) = Number of documents with term t in it
# 1 is added to avoid division by zero error

# TF -IDF Score
# TF - IDF(t, d) = tf(t, d) * log(N / (df(t))) or "TF - IDF(t, d) = tf(t, d) * IDF(t)"

# Why this is used?
#   - Down weight to common words like "the", "and", "is"
#   - Up-weight rare but informative words like - "machine", "neural", "quantum"

# Examples:-
# D1 = "the cat sat on the mat"
# D2 = "the dog sat on the log"
# D3 = "dogs and cart are great"

# The word "the" has appeared 4 times
# - high TF low IDF == low TF-IDF
# the word "great" has appeared 1 time
# - low TF high IDF == high TF-IDF


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "dogs and cart are great"
]

In [10]:
vectorizer = TfidfVectorizer()
tfidfmatrix = vectorizer.fit_transform(docs)
print(tfidfmatrix.toarray())

[[0.         0.         0.         0.42755362 0.         0.
  0.         0.         0.42755362 0.32516555 0.32516555 0.6503311 ]
 [0.         0.         0.         0.         0.42755362 0.
  0.         0.42755362 0.         0.32516555 0.32516555 0.6503311 ]
 [0.4472136  0.4472136  0.4472136  0.         0.         0.4472136
  0.4472136  0.         0.         0.         0.         0.        ]]


In [11]:
df = pd.DataFrame(tfidfmatrix.toarray())#, columns=vectorizer.get_feature_names_out())
print(df)

         0         1         2         3         4         5         6   \
0  0.000000  0.000000  0.000000  0.427554  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.427554  0.000000  0.000000   
2  0.447214  0.447214  0.447214  0.000000  0.000000  0.447214  0.447214   

         7         8         9         10        11  
0  0.000000  0.427554  0.325166  0.325166  0.650331  
1  0.427554  0.000000  0.325166  0.325166  0.650331  
2  0.000000  0.000000  0.000000  0.000000  0.000000  


In [12]:
# Rows = Documents
# Cols = Words
# Values = TFIDF Score

In [13]:
# another examples

data = {'id': [1,2,3], 'text': ['I love Machine learning',
                                'Natural Language processing is fascinating ',
                                'words embedding represents words in vector']}
df = pd.DataFrame(data)
df

Unnamed: 0,id,text
0,1,I love Machine learning
1,2,Natural Language processing is fascinating
2,3,words embedding represents words in vector


In [14]:
vectorizer = TfidfVectorizer()
X_tfidf_matrix = vectorizer.fit_transform(df['text'])
print(X_tfidf_matrix.toarray())

[[0.         0.         0.         0.         0.         0.57735027
  0.57735027 0.57735027 0.         0.         0.         0.
  0.        ]
 [0.         0.4472136  0.         0.4472136  0.4472136  0.
  0.         0.         0.4472136  0.4472136  0.         0.
  0.        ]
 [0.35355339 0.         0.35355339 0.         0.         0.
  0.         0.         0.         0.         0.35355339 0.35355339
  0.70710678]]


In [15]:
print(pd.DataFrame(X_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()))


   embedding  fascinating        in        is  language  learning     love  \
0   0.000000     0.000000  0.000000  0.000000  0.000000   0.57735  0.57735   
1   0.000000     0.447214  0.000000  0.447214  0.447214   0.00000  0.00000   
2   0.353553     0.000000  0.353553  0.000000  0.000000   0.00000  0.00000   

   machine   natural  processing  represents    vector     words  
0  0.57735  0.000000    0.000000    0.000000  0.000000  0.000000  
1  0.00000  0.447214    0.447214    0.000000  0.000000  0.000000  
2  0.00000  0.000000    0.000000    0.353553  0.353553  0.707107  


In [16]:
### Hugging Face - Most popular transformer Library

## !pip install transformers datasets torch huggingface_hub

In [17]:
## Try to find the sentiment of a sentence  - positive or negative

from transformers import pipeline
classifier = pipeline("sentiment-analysis")
results = classifier(['I love transformer', 'This is terrible'])
print(results)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998635053634644}, {'label': 'NEGATIVE', 'score': 0.9996459484100342}]


In [18]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

# Collect both inputs
sentence1 = input("Enter the first sentence: ")
sentence2 = input("Enter the second sentence: ")

# Pass them together in a single list
results = classifier([sentence1, sentence2])

print(results)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Enter the first sentence: I am scared to go in the dark at night
Enter the second sentence: I love to play outside
[{'label': 'NEGATIVE', 'score': 0.9922244548797607}, {'label': 'POSITIVE', 'score': 0.9995712637901306}]


In [19]:
### NAMED ENTITY RECOGNITION

ner = pipeline('ner', grouped_entities = True)
text = "Elon Musk founded SpaceX in 2024 and later bought Twitter in 2025."
result = ner(text)
print(result)
for i in result:
  print(i)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'entity_group': 'PER', 'score': np.float32(0.9985181), 'word': 'Elon Musk', 'start': 0, 'end': 9}, {'entity_group': 'ORG', 'score': np.float32(0.9992113), 'word': 'SpaceX', 'start': 18, 'end': 24}, {'entity_group': 'ORG', 'score': np.float32(0.99866354), 'word': 'Twitter', 'start': 50, 'end': 57}]
{'entity_group': 'PER', 'score': np.float32(0.9985181), 'word': 'Elon Musk', 'start': 0, 'end': 9}
{'entity_group': 'ORG', 'score': np.float32(0.9992113), 'word': 'SpaceX', 'start': 18, 'end': 24}
{'entity_group': 'ORG', 'score': np.float32(0.99866354), 'word': 'Twitter', 'start': 50, 'end': 57}


In [31]:
text = input("Enter the sentence: ")
result = ner(text)
#print(result)
for i in result:
  print(i)

Enter the sentence: The AI war is comes to an interesting point at this time


In [21]:
## Fill in the blanks
## predict the missing word in the sequence

filled_data = pipeline('fill-mask', model='bert-base-uncased')
sent = "Transformer are a very ([MASK]) technology"
result = filled_data(sent)
print(result)
for i in result:
  print(i)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


[{'score': 0.13967962563037872, 'token': 2047, 'token_str': 'new', 'sequence': 'transformer are a very ( new ) technology'}, {'score': 0.13906064629554749, 'token': 3935, 'token_str': 'advanced', 'sequence': 'transformer are a very ( advanced ) technology'}, {'score': 0.08328744769096375, 'token': 2715, 'token_str': 'modern', 'sequence': 'transformer are a very ( modern ) technology'}, {'score': 0.0361173115670681, 'token': 6450, 'token_str': 'expensive', 'sequence': 'transformer are a very ( expensive ) technology'}, {'score': 0.03445785865187645, 'token': 3522, 'token_str': 'recent', 'sequence': 'transformer are a very ( recent ) technology'}]
{'score': 0.13967962563037872, 'token': 2047, 'token_str': 'new', 'sequence': 'transformer are a very ( new ) technology'}
{'score': 0.13906064629554749, 'token': 3935, 'token_str': 'advanced', 'sequence': 'transformer are a very ( advanced ) technology'}
{'score': 0.08328744769096375, 'token': 2715, 'token_str': 'modern', 'sequence': 'transfor

In [22]:
sent = input("Enter the sentence: ")
result = filled_data(sent)
print(result)
for i in result:
  print(i)


Enter the sentence: The AI war is comes to an ([MASK]) point at this time
[{'score': 0.09553032368421555, 'token': 2590, 'token_str': 'important', 'sequence': 'the ai war is comes to an ( important ) point at this time'}, {'score': 0.058716386556625366, 'token': 6827, 'token_str': 'essential', 'sequence': 'the ai war is comes to an ( essential ) point at this time'}, {'score': 0.040929004549980164, 'token': 6387, 'token_str': 'intense', 'sequence': 'the ai war is comes to an ( intense ) point at this time'}, {'score': 0.033607710152864456, 'token': 15741, 'token_str': 'unprecedented', 'sequence': 'the ai war is comes to an ( unprecedented ) point at this time'}, {'score': 0.032804399728775024, 'token': 11355, 'token_str': 'explosive', 'sequence': 'the ai war is comes to an ( explosive ) point at this time'}]
{'score': 0.09553032368421555, 'token': 2590, 'token_str': 'important', 'sequence': 'the ai war is comes to an ( important ) point at this time'}
{'score': 0.058716386556625366, 't

In [23]:
def fill_in_the_blank_Chatbot():
  print("Fill in the blanks ChatBot")
  print("Type a sentence with ([MASK]) in it or type EXIT to quit")

  while True:
    user_input = input("Enter the sentence: ")
    if user_input.upper() == "EXIT":
      print("Thanks for Using this ChatBot..!! GoodBye!!!")
      break

    if "[MASK]" not in user_input:
      print("No [MASK] found in the sentence")

    try:
      result = filled_data(user_input)
      print("Predictions are here below")
      for i in result[:5]:
        print(i)
    except:
      print("Something went wrong")


In [24]:
fill_in_the_blank_Chatbot()

Fill in the blanks ChatBot
Type a sentence with ([MASK]) in it or type EXIT to quit
Enter the sentence: The AI war is comes to an interesting ([MASK]) at this time
Predictions are here below
{'score': 0.33002418279647827, 'token': 2203, 'token_str': 'end', 'sequence': 'the ai war is comes to an interesting ( end ) at this time'}
{'score': 0.21622614562511444, 'token': 2391, 'token_str': 'point', 'sequence': 'the ai war is comes to an interesting ( point ) at this time'}
{'score': 0.1578916311264038, 'token': 7091, 'token_str': 'conclusion', 'sequence': 'the ai war is comes to an interesting ( conclusion ) at this time'}
{'score': 0.052902814000844955, 'token': 9190, 'token_str': 'halt', 'sequence': 'the ai war is comes to an interesting ( halt ) at this time'}
{'score': 0.04383029043674469, 'token': 2707, 'token_str': 'start', 'sequence': 'the ai war is comes to an interesting ( start ) at this time'}
Enter the sentence: India is very ([MASK]) country.
Predictions are here below
{'scor

In [25]:
## ZERO SHOT classification

## Classify text into user defined Categories

clf = pipeline('zero-shot-classification')
text = "The stock market crashed due to infation and war"
labels = ['Finance', 'Technology', 'Sports', 'Health', 'Politics']
res = clf(text, labels)
print(res)


No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


{'sequence': 'The stock market crashed due to infation and war', 'labels': ['Finance', 'Health', 'Politics', 'Technology', 'Sports'], 'scores': [0.7598903179168701, 0.06476442515850067, 0.06253916025161743, 0.06204197555780411, 0.05076424032449722]}


In [26]:
text = input("Enter the sentence: ")
res = clf(text, labels)

Enter the sentence: The AI war is comes to an interesting point at this time


In [27]:
# Print the sequence
print("Sequence:", res['sequence'])

# Print each label and its score line by line
print("Scores:")
for label, score in zip(res['labels'], res['scores']):
  print(f"- {label}: {score:.4f}")

Sequence: The AI war is comes to an interesting point at this time
Scores:
- Technology: 0.8619
- Health: 0.0415
- Finance: 0.0364
- Sports: 0.0321
- Politics: 0.0281


In [28]:
## TRANSLATION MODELS FROM ENGLISH TO HINDI

mod = pipeline('text2text-generation', model = 'barghavani/English_to_Hindi')

Device set to use cpu


In [29]:
output = mod("The weather is very good today. I think i should make a tea with some pakodas")
print(output)

[{'generated_text': 'आज मौसम बहुत अच्छा है। मुझे लगता है कि मैं कुछ पैसे के साथ चाय बनाना चाहिए'}]


In [30]:
output = mod(input("Enter the sentence: "))
print(output)

Enter the sentence: The AI war is comes to an interesting point at this time
[{'generated_text': 'इस समय इस समय एआई युद्ध एक रोचक बिंदु पर आता है।'}]


In [32]:
from google.colab import output
output.clear()