In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, bigrams, FreqDist

In [2]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is a

True

In [3]:
# Sample text corpus with clear named entities
text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."

In [4]:
# Sentence Tokenization
sentences = sent_tokenize(text)
print("Sentences:\n", sentences, "\n")

# Word Tokenization
words = [w.lower() for w in word_tokenize(text) if w.isalpha()]
print("Words:\n", words, "\n")

# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stems = [stemmer.stem(w) for w in words]
lemmas = [lemmatizer.lemmatize(w) for w in words]
print("Stemming:\n", stems, "\n")
print("Lemmatization:\n", lemmas, "\n")

Sentences:
 ['The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014.', 'India thus became the first country to enter Mars orbit on its first attempt.', 'It was completed at a record low cost of $74 million.'] 

Words:
 ['the', 'mars', 'orbiter', 'mission', 'mom', 'informally', 'known', 'as', 'mangalyaan', 'was', 'launched', 'into', 'earth', 'orbit', 'on', 'november', 'by', 'the', 'indian', 'space', 'research', 'organisation', 'isro', 'and', 'has', 'entered', 'mars', 'orbit', 'on', 'september', 'india', 'thus', 'became', 'the', 'first', 'country', 'to', 'enter', 'mars', 'orbit', 'on', 'its', 'first', 'attempt', 'it', 'was', 'completed', 'at', 'a', 'record', 'low', 'cost', 'of', 'million'] 

Stemming:
 ['the', 'mar', 'orbit', 'mission', 'mom', 'inform', 'known', 'as', 'mangalyaan', 'wa', 'launch', 'into', 'earth', 'orbit', 'on', 'nove

In [5]:
# POS Tagging
pos_tags = pos_tag(words)
print("POS Tags:\n", pos_tags, "\n")

POS Tags:
 [('the', 'DT'), ('mars', 'NNS'), ('orbiter', 'VBP'), ('mission', 'NN'), ('mom', 'NN'), ('informally', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('mangalyaan', 'NN'), ('was', 'VBD'), ('launched', 'VBN'), ('into', 'IN'), ('earth', 'JJ'), ('orbit', 'NN'), ('on', 'IN'), ('november', 'NN'), ('by', 'IN'), ('the', 'DT'), ('indian', 'JJ'), ('space', 'NN'), ('research', 'NN'), ('organisation', 'NN'), ('isro', 'NN'), ('and', 'CC'), ('has', 'VBZ'), ('entered', 'VBN'), ('mars', 'NNS'), ('orbit', 'VBP'), ('on', 'IN'), ('september', 'NN'), ('india', 'JJ'), ('thus', 'RB'), ('became', 'VBD'), ('the', 'DT'), ('first', 'JJ'), ('country', 'NN'), ('to', 'TO'), ('enter', 'VB'), ('mars', 'NNS'), ('orbit', 'RB'), ('on', 'IN'), ('its', 'PRP$'), ('first', 'JJ'), ('attempt', 'NN'), ('it', 'PRP'), ('was', 'VBD'), ('completed', 'VBN'), ('at', 'IN'), ('a', 'DT'), ('record', 'NN'), ('low', 'JJ'), ('cost', 'NN'), ('of', 'IN'), ('million', 'CD')] 



In [6]:
# Named Entity Recognition
import spacy
from spacy import displacy
# Load the English language model
spacy_model = spacy.load("en_core_web_sm")

doc = spacy_model(text)
# Iterate over entities and print their text and label
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: The Mars Orbiter Mission, Label: ORG
Entity: Mangalyaan, Label: PERSON
Entity: Earth, Label: LOC
Entity: 5 November 2013, Label: DATE
Entity: the Indian Space Research Organisation, Label: ORG
Entity: Mars, Label: LOC
Entity: 24 September 2014, Label: DATE
Entity: India, Label: GPE
Entity: first, Label: ORDINAL
Entity: Mars, Label: LOC
Entity: first, Label: ORDINAL
Entity: $74 million, Label: MONEY


In [7]:
# Generate bigrams
bigrams_list = list(bigrams(words))
print(bigrams_list)

# Count bigrams
bigram_freq = FreqDist(bigrams_list)
print(bigram_freq.items())

[('the', 'mars'), ('mars', 'orbiter'), ('orbiter', 'mission'), ('mission', 'mom'), ('mom', 'informally'), ('informally', 'known'), ('known', 'as'), ('as', 'mangalyaan'), ('mangalyaan', 'was'), ('was', 'launched'), ('launched', 'into'), ('into', 'earth'), ('earth', 'orbit'), ('orbit', 'on'), ('on', 'november'), ('november', 'by'), ('by', 'the'), ('the', 'indian'), ('indian', 'space'), ('space', 'research'), ('research', 'organisation'), ('organisation', 'isro'), ('isro', 'and'), ('and', 'has'), ('has', 'entered'), ('entered', 'mars'), ('mars', 'orbit'), ('orbit', 'on'), ('on', 'september'), ('september', 'india'), ('india', 'thus'), ('thus', 'became'), ('became', 'the'), ('the', 'first'), ('first', 'country'), ('country', 'to'), ('to', 'enter'), ('enter', 'mars'), ('mars', 'orbit'), ('orbit', 'on'), ('on', 'its'), ('its', 'first'), ('first', 'attempt'), ('attempt', 'it'), ('it', 'was'), ('was', 'completed'), ('completed', 'at'), ('at', 'a'), ('a', 'record'), ('record', 'low'), ('low', '

In [8]:
import json
# Count how many times each first word occurs in bigrams
first_word_counts = {}
for w1, w2 in bigrams_list:
    if w1 in first_word_counts:
        first_word_counts[w1] += 1
    else:
        first_word_counts[w1] = 1

# Calculate bigram probabilities
bigram_prob = {}
for (w1, w2), count in bigram_freq.items():
    prob = count / first_word_counts[w1]
    bigram_prob[(w1, w2)] = prob

print(bigram_prob)

{('the', 'mars'): 0.3333333333333333, ('mars', 'orbiter'): 0.3333333333333333, ('orbiter', 'mission'): 1.0, ('mission', 'mom'): 1.0, ('mom', 'informally'): 1.0, ('informally', 'known'): 1.0, ('known', 'as'): 1.0, ('as', 'mangalyaan'): 1.0, ('mangalyaan', 'was'): 1.0, ('was', 'launched'): 0.5, ('launched', 'into'): 1.0, ('into', 'earth'): 1.0, ('earth', 'orbit'): 1.0, ('orbit', 'on'): 1.0, ('on', 'november'): 0.3333333333333333, ('november', 'by'): 1.0, ('by', 'the'): 1.0, ('the', 'indian'): 0.3333333333333333, ('indian', 'space'): 1.0, ('space', 'research'): 1.0, ('research', 'organisation'): 1.0, ('organisation', 'isro'): 1.0, ('isro', 'and'): 1.0, ('and', 'has'): 1.0, ('has', 'entered'): 1.0, ('entered', 'mars'): 1.0, ('mars', 'orbit'): 0.6666666666666666, ('on', 'september'): 0.3333333333333333, ('september', 'india'): 1.0, ('india', 'thus'): 1.0, ('thus', 'became'): 1.0, ('became', 'the'): 1.0, ('the', 'first'): 0.3333333333333333, ('first', 'country'): 0.5, ('country', 'to'): 1.0,