In [None]:
import nltk
import numpy as np
# Fetch only the NLTK resources the cells below rely on (sentence/word
# tokenizer models and the English perceptron POS tagger). The 'all'
# collection downloads every corpus and model NLTK ships, which is slow and
# very large. Both the legacy and the newer resource ids are requested so the
# notebook works across NLTK releases; add further ids here if later cells
# need other corpora.
for resource_id in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource_id, quiet=True)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


# Sample paragraph used to demonstrate both tokenizers.
text = """This is the first class of natural language processing.
I like the Natural language processing.
I am interested to learn natural language processing."""

# Split the paragraph into sentences and show the resulting list.
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences, sep="\n")

# Split the same paragraph into word/punctuation tokens.
tokens = word_tokenize(text)
print("\nWord Tokenization:", tokens, sep="\n")


Sentence Tokenization:
['This is the first class of natural language processing.', 'I like the Natural language processing.', 'I am interested to learn natural language processing.']

Word Tokenization:
['This', 'is', 'the', 'first', 'class', 'of', 'natural', 'language', 'processing', '.', 'I', 'like', 'the', 'Natural', 'language', 'processing', '.', 'I', 'am', 'interested', 'to', 'learn', 'natural', 'language', 'processing', '.']


# POS Tagging

In [None]:
import nltk

# Make sure the English perceptron tagger model used by nltk.pos_tag is present.
nltk.download('averaged_perceptron_tagger_eng')

sentences = [
    "Mary Jane can see Will",
    "Spot will see Marry",
    "Marry will pat Spot",
]

# Tokenize and tag each sentence in one step, printing its (word, tag) pairs.
for sentence in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sentence)))
import nltk
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

# Hand-tagged toy corpus: one list of (word, tag) pairs per sentence, using
# the tags N / M / V (presumably noun, modal, verb).
corpus = [
    [("Mary", "N"), ("Jane", "N"), ("can", "M"), ("see", "V"), ("Will", "N")],
    [("Spot", "N"), ("will", "M"), ("see", "V"), ("Marry", "N")],
    [("Marry", "N"), ("will", "M"), ("pat", "V"), ("Spot", "N")]
]

# Pad every sentence with its own <s> ... </s> markers and take bigrams per
# sentence. Building bigrams over one flat tag list would count <s> only once
# and would pair each </s> with the first tag of the following sentence,
# producing a spurious "</s> -> N" transition.
tags = []
bigram_tags = []
for sentence in corpus:
    padded_tags = ["<s>", *(tag for _, tag in sentence), "</s>"]
    tags.extend(padded_tags)
    bigram_tags.extend(nltk.bigrams(padded_tags))

# Transition model P(next_tag | prev_tag), maximum-likelihood estimate from
# the tag bigram counts.
transition_freq = ConditionalFreqDist(bigram_tags)
transition_prob = ConditionalProbDist(transition_freq, MLEProbDist)

# Emission model P(word | tag), maximum-likelihood estimate from the
# (tag, word) counts.
emission_freq = ConditionalFreqDist(
    (tag, word) for sentence in corpus for word, tag in sentence
)
emission_prob = ConditionalProbDist(emission_freq, MLEProbDist)


print("\nTransition Probabilities:")
for prev_tag in transition_prob.conditions():
    for next_tag in transition_prob[prev_tag].samples():
        print(f"P({next_tag} | {prev_tag}) = {transition_prob[prev_tag].prob(next_tag):.4f}")


print("\nEmission Probabilities:")
for tag in emission_prob.conditions():
    for word in emission_prob[tag].samples():
        print(f"P({word} | {tag}) = {emission_prob[tag].prob(word):.4f}")


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('Mary', 'NNP'), ('Jane', 'NNP'), ('can', 'MD'), ('see', 'VB'), ('Will', 'MD')]
[('Spot', 'NN'), ('will', 'MD'), ('see', 'VB'), ('Marry', 'NNP')]
[('Marry', 'NNP'), ('will', 'MD'), ('pat', 'VB'), ('Spot', 'NNP')]

Transition Probabilities:
P(N | <s>) = 1.0000
P(N | N) = 0.1429
P(M | N) = 0.4286
P(</s> | N) = 0.4286
P(V | M) = 1.0000
P(N | V) = 1.0000
P(N | </s>) = 1.0000

Emission Probabilities:
P(Mary | N) = 0.1429
P(Jane | N) = 0.1429
P(Will | N) = 0.1429
P(Spot | N) = 0.2857
P(Marry | N) = 0.2857
P(can | M) = 0.3333
P(will | M) = 0.6667
P(see | V) = 0.6667
P(pat | V) = 0.3333


# Parsing

In [None]:
import nltk
from nltk import RegexpParser


def pos_tagging(text):
    """Tokenize ``text`` into words and return NLTK's list of (word, tag) pairs."""
    return nltk.pos_tag(nltk.word_tokenize(text))


def parse_sentence(pos_tags):
    """Chunk a POS-tagged sentence with a small regular-expression grammar.

    Args:
        pos_tags: Sequence of (word, tag) pairs, e.g. the result of
            ``pos_tagging``.

    Returns:
        The tree produced by ``RegexpParser.parse`` for the NP / VP / PP
        rules defined below.
    """
    # Rules are applied in the order listed, so PP can build on NP chunks.
    chunk_grammar = r"""
      NP: {<DT>?<JJ>*<NN.*>}   # Noun phrase
      VP: {<VB.*><RB>?<VB.*>*}  # Verb phrase
      PP: {<IN><NP>}            # Prepositional phrase
    """
    chunker = RegexpParser(chunk_grammar)
    return chunker.parse(pos_tags)

# Classic ambiguous sentence pair used to exercise the tagger and chunker.
input_text = "Time flies like an arrow; fruit flies like bananas"
tags = pos_tagging(input_text)

# Show one "word: tag" line per token.
print("\nPOS Tags:")
for token, label in tags:
    print(f"{token}: {label}")

# Chunk the tagged tokens and draw the resulting tree as ASCII art.
parsed_tree = parse_sentence(tags)
print("\nParsed Sentence:")
parsed_tree.pretty_print()


POS Tags:
Time: NNP
flies: NNS
like: IN
an: DT
arrow: NN
;: :
fruit: CC
flies: NNS
like: IN
bananas: NNS

Parsed Sentence:
                                   S                                                        
  _________________________________|__________________________________________               
 |     |        |         |               PP                   |              PP            
 |     |        |         |         ______|____                |         _____|_______       
 |     |        NP        NP       |           NP              NP       |             NP    
 |     |        |         |        |       ____|_____          |        |             |      
;/: fruit/CC Time/NNP flies/NNS like/IN an/DT     arrow/NN flies/NNS like/IN     bananas/NNS



In [None]:
import nltk
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np



def extractive_summary(text, num_sentences=3):
    """Return an extractive summary made of the top-ranked sentences of ``text``.

    Sentences are vectorized with TF-IDF, the pairwise dot products of those
    vectors are used as edge weights of a graph, and PageRank scores over that
    graph rank the sentences.

    Args:
        text: Input document as a single string.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The selected sentences joined by single spaces, highest-ranked first.
        An empty string is returned when ``text`` is empty or whitespace-only,
        or when ``num_sentences`` is not positive.
    """
    # Nothing to summarize or nothing requested: return early instead of
    # handing an empty sentence list to the vectorizer or slicing with a
    # negative count.
    if num_sentences <= 0 or not text or not text.strip():
        return ""

    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)

    # Dense sentence-by-sentence similarity matrix used as graph edge weights.
    # NOTE(review): the diagonal (self-similarity) is non-zero, so every node
    # gets a self-loop; confirm whether that is intended for the ranking.
    similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()
    scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))

    # Rank sentence indices by score only; the stable sort keeps document
    # order among sentences with equal scores.
    ranked_indices = sorted(
        range(len(sentences)), key=lambda idx: scores[idx], reverse=True
    )

    return " ".join(sentences[idx] for idx in ranked_indices[:num_sentences])

# Sample paragraph about NLP used to exercise the summarizer.
text = """Natural Language Processing (NLP) is a field of AI that focuses on the interaction between
computers and humans using natural language. The ultimate goal of NLP is to enable computers to understand,
interpret, and generate human language in a way that is meaningful. NLP techniques are used in machine
translation, chatbots, sentiment analysis, and text summarization. Researchers are working to improve NLP
models using deep learning and transformers. The advancements in NLP have made a significant impact in various
industries, making human-computer interactions more seamless and intelligent."""

# Summarize with the default sentence count and show the result.
summary = extractive_summary(text)
print(summary)


The ultimate goal of NLP is to enable computers to understand, 
interpret, and generate human language in a way that is meaningful. Natural Language Processing (NLP) is a field of AI that focuses on the interaction between 
computers and humans using natural language. Researchers are working to improve NLP 
models using deep learning and transformers.
