In [20]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

text = """This is the first class of Natural Language Processing.
I like the Natural Language Processing class.
I am interested to learn Natural Language Models."""

# Corpus, Documents, Word, Vocabulary

In [6]:
corpus = text
print("Corpus:", corpus)

Corpus: This is the first class of Natural Language Processing.
I like the Natural Language Processing class.
I am interested to learn Natural Language Models.


In [7]:
documents = sent_tokenize(text)
print("Documents:", documents)

Documents: ['This is the first class of Natural Language Processing.', 'I like the Natural Language Processing class.', 'I am interested to learn Natural Language Models.']


In [8]:
words = [word_tokenize(doc) for doc in documents]
print("Words (tokenized for each document):", words)

Words (tokenized for each document): [['This', 'is', 'the', 'first', 'class', 'of', 'Natural', 'Language', 'Processing', '.'], ['I', 'like', 'the', 'Natural', 'Language', 'Processing', 'class', '.'], ['I', 'am', 'interested', 'to', 'learn', 'Natural', 'Language', 'Models', '.']]


In [9]:
vocabulary = set(word.lower() for doc in words for word in doc)
print("Vocabulary:", vocabulary)

Vocabulary: {'.', 'first', 'to', 'class', 'this', 'processing', 'i', 'is', 'the', 'interested', 'learn', 'models', 'of', 'language', 'like', 'natural', 'am'}


# Text Representation: One hot Encoding

In [12]:
tokens = word_tokenize(text.lower())
print('Tokens: ', tokens)

Tokens:  ['this', 'is', 'the', 'first', 'class', 'of', 'natural', 'language', 'processing', '.', 'i', 'like', 'the', 'natural', 'language', 'processing', 'class', '.', 'i', 'am', 'interested', 'to', 'learn', 'natural', 'language', 'models', '.']


In [13]:
vocabulary = sorted(set(tokens)) 
print("Vocabulary:", vocabulary)

Vocabulary: ['.', 'am', 'class', 'first', 'i', 'interested', 'is', 'language', 'learn', 'like', 'models', 'natural', 'of', 'processing', 'the', 'this', 'to']


In [14]:
word_to_index = {word: i for i, word in enumerate(vocabulary)}
print("Word-to-Index Mapping:", word_to_index)

Word-to-Index Mapping: {'.': 0, 'am': 1, 'class': 2, 'first': 3, 'i': 4, 'interested': 5, 'is': 6, 'language': 7, 'learn': 8, 'like': 9, 'models': 10, 'natural': 11, 'of': 12, 'processing': 13, 'the': 14, 'this': 15, 'to': 16}


In [19]:
one_hot_encoded = {}
for word in vocabulary:
    vector = np.zeros(len(vocabulary), dtype=int)
    vector[word_to_index[word]] = 1
    one_hot_encoded[word] = vector

print("One-Hot Encoded Vectors:")
one_hot_encoded

One-Hot Encoded Vectors:


{'.': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'am': array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'class': array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'first': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'i': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'interested': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'is': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'language': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'learn': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 'like': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 'models': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 'natural': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 'of': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 'processing': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'the': array([0, 0, 0

# Text Tokenization

In [23]:
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)

Sentence Tokenization: ['This is the first class of Natural Language Processing.', 'I like the Natural Language Processing class.', 'I am interested to learn Natural Language Models.']


In [51]:
print("Word Tokenization:")
for sentence in sentences:
    words = word_tokenize(sentence)
    print("Sentence:", sentence)
    print("Words:", words)

Word Tokenization:
Sentence: This is the first class of Natural Language Processing.
Words: ['This', 'is', 'the', 'first', 'class', 'of', 'Natural', 'Language', 'Processing', '.']
Sentence: I like the Natural Language Processing class.
Words: ['I', 'like', 'the', 'Natural', 'Language', 'Processing', 'class', '.']
Sentence: I am interested to learn Natural Language Models.
Words: ['I', 'am', 'interested', 'to', 'learn', 'Natural', 'Language', 'Models', '.']


# Stemming and Lemmatization

In [26]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [31]:
words = set(word_tokenize(text))
print("Original Words:", words)

Original Words: {'Natural', 'first', 'Processing', 'class', '.', 'to', 'am', 'is', 'the', 'Language', 'interested', 'learn', 'of', 'Models', 'like', 'This', 'I'}


In [32]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]

print("Stemmed Words:", stemmed_words)

Stemmed Words: ['natur', 'first', 'process', 'class', '.', 'to', 'am', 'is', 'the', 'languag', 'interest', 'learn', 'of', 'model', 'like', 'thi', 'i']


In [33]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words]

print("Lemmatized Words:")
print(lemmatized_words)

Lemmatized Words:
['natural', 'first', 'processing', 'class', '.', 'to', 'am', 'is', 'the', 'language', 'interested', 'learn', 'of', 'model', 'like', 'this', 'i']


# Edit Distance

In [38]:
from nltk.metrics import edit_distance

In [39]:
documents = sent_tokenize(text)
print("Documents:", documents)

Documents: ['This is the first class of Natural Language Processing.', 'I like the Natural Language Processing class.', 'I am interested to learn Natural Language Models.']


In [40]:
print("Edit Distance Between Sentences:")
for i in range(len(documents)):
    for j in range(i + 1, len(documents)):
        dist = edit_distance(documents[i], documents[j])
        print(f"Edit distance between:\n  '{documents[i]}'\n  '{documents[j]}'\n  -> {dist}\n")

Edit Distance Between Sentences:
Edit distance between:
  'This is the first class of Natural Language Processing.'
  'I like the Natural Language Processing class.'
  -> 28

Edit distance between:
  'This is the first class of Natural Language Processing.'
  'I am interested to learn Natural Language Models.'
  -> 28

Edit distance between:
  'I like the Natural Language Processing class.'
  'I am interested to learn Natural Language Models.'
  -> 29



# N-gram model

In [55]:
from nltk import ngrams, word_tokenize

In [56]:
words = set(word_tokenize(text))
words

{'.',
 'I',
 'Language',
 'Models',
 'Natural',
 'Processing',
 'This',
 'am',
 'class',
 'first',
 'interested',
 'is',
 'learn',
 'like',
 'of',
 'the',
 'to'}

In [57]:
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n))

In [61]:
unigrams = generate_ngrams(words, 1)
print("Unigrams:", unigrams)

Unigrams: [('Natural',), ('first',), ('Processing',), ('class',), ('.',), ('to',), ('am',), ('is',), ('the',), ('Language',), ('interested',), ('learn',), ('of',), ('Models',), ('like',), ('This',), ('I',)]


In [59]:
bigrams = generate_ngrams(words, 2)
print("Bigrams:", bigrams)

Bigrams: [('Natural', 'first'), ('first', 'Processing'), ('Processing', 'class'), ('class', '.'), ('.', 'to'), ('to', 'am'), ('am', 'is'), ('is', 'the'), ('the', 'Language'), ('Language', 'interested'), ('interested', 'learn'), ('learn', 'of'), ('of', 'Models'), ('Models', 'like'), ('like', 'This'), ('This', 'I')]


In [60]:
trigrams = generate_ngrams(words, 3)
print("Trigrams:", trigrams)

Trigrams: [('Natural', 'first', 'Processing'), ('first', 'Processing', 'class'), ('Processing', 'class', '.'), ('class', '.', 'to'), ('.', 'to', 'am'), ('to', 'am', 'is'), ('am', 'is', 'the'), ('is', 'the', 'Language'), ('the', 'Language', 'interested'), ('Language', 'interested', 'learn'), ('interested', 'learn', 'of'), ('learn', 'of', 'Models'), ('of', 'Models', 'like'), ('Models', 'like', 'This'), ('like', 'This', 'I')]
