In [None]:
# Lower-case the sample so that "Natural" and "natural" compare equal later on.
text = "Natural language Processing is Fun!".lower()
text

'natural language processing is fun!'

In [None]:
import string

# A translation table whose values are all None deletes every character
# listed in string.punctuation in one pass.
text = "Hello world! NLP is exciting.".translate(
    str.maketrans(dict.fromkeys(string.punctuation))
)
text  # Hello world NLP is exciting

'Hello world NLP is exciting'

In [None]:
!pip install nltk

import nltk
from nltk.tokenize import word_tokenize
text = "Tokenize this sentence."

# Download the Punkt tokenizer data packages, then split the sentence into tokens.
nltk.download('punkt')
nltk.download('punkt_tab')

tokens = word_tokenize(text)
tokens



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Tokenize', 'this', 'sentence', '.']

In [None]:
from nltk.corpus import stopwords
# Fetch the stop-word corpus, then keep only the tokens whose lower-cased
# form is not an English stop word.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered = list(filter(lambda token: token.lower() not in stop_words, tokens))
filtered

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['Tokenize', 'sentence', '.']

In [None]:
from nltk.stem import PorterStemmer
# Apply the Porter stemmer to each sample word and print the stems.
words = ["playing", "played", "player"]
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, words))
print(stemmed)

['play', 'play', 'player']


In [None]:
from nltk.stem import WordNetLemmatizer
# Download the WordNet data, then lemmatize each sample word.
nltk.download('wordnet')

words = ["playing", "better", "running"]
lemmatizer = WordNetLemmatizer()
# pos='v' asks for every word to be lemmatized as a verb.
lemmatized = list(map(lambda word: lemmatizer.lemmatize(word, pos='v'), words))
print(lemmatized)

['playing', 'good', 'running']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


*Part B*

In [None]:
# Part B input: the sentence that the following cells preprocess step by step.
sentence = "Natural Language Processing (NLP) is transforming the way humans interact with machines."

In [None]:
# Lower-case the sentence (rebinds `sentence`) and display the result.
sentence = sentence.lower()
sentence

'natural language processing (nlp) is transforming the way humans interact with machines.'

In [None]:
# Delete every character listed in string.punctuation from the sentence
# (rebinds `sentence`) and display the result.
sentence = sentence.translate(str.maketrans(dict.fromkeys(string.punctuation)))
sentence

'natural language processing nlp is transforming the way humans interact with machines'

In [None]:
# Split the cleaned sentence into a list of word tokens and display it.
s_tokens = word_tokenize(sentence)
s_tokens

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'transforming',
 'the',
 'way',
 'humans',
 'interact',
 'with',
 'machines']

In [None]:
# Drop English stop words from the tokenized sentence; the comparison is
# made on the lower-cased token.
stop_words = set(stopwords.words('english'))
stop_words_rid_of_sentence = list(
    filter(lambda token: token.lower() not in stop_words, s_tokens)
)
stop_words_rid_of_sentence

['natural',
 'language',
 'processing',
 'nlp',
 'transforming',
 'way',
 'humans',
 'interact',
 'machines']

In [None]:
# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_of_sentence = list(map(stemmer.stem, stop_words_rid_of_sentence))
stemmed_of_sentence

['natur',
 'languag',
 'process',
 'nlp',
 'transform',
 'way',
 'human',
 'interact',
 'machin']

In [None]:
# Lemmatize the stop-word-filtered tokens rather than the Porter stems.
# Stems such as 'natur', 'languag' or 'machin' are not dictionary words, so
# WordNet cannot match them and hands them back unchanged, which made this
# step a no-op when it was fed `stemmed_of_sentence`.
# pos='v' treats every token as a verb; a token WordNet cannot resolve as a
# verb is returned as-is.
lemmatizer = WordNetLemmatizer()
lemmatized_of_sentence = [
    lemmatizer.lemmatize(token, pos='v') for token in stop_words_rid_of_sentence
]
lemmatized_of_sentence

['natur',
 'languag',
 'process',
 'nlp',
 'transform',
 'way',
 'human',
 'interact',
 'machin']

In [None]:
# Sample sentence for the walkthrough cells that follow (rebinds `text`).
text = "The running dogs were quickly chasing the beautiful cats through the gardens..!"

In [None]:
print("=== TOKENIZATION ===")
# Split the sample text into tokens, then show the input next to the result.
tokens = word_tokenize(text)
print(f"Original: {text}", f"Tokens: {tokens}", sep="\n")

=== TOKENIZATION ===
Original: The running dogs were quickly chasing the beautiful cats through the gardens..!
Tokens: ['The', 'running', 'dogs', 'were', 'quickly', 'chasing', 'the', 'beautiful', 'cats', 'through', 'the', 'gardens', '..', '!']


In [None]:
print("=== STOPWORD REMOVAL ===")
# Keep only the tokens whose lower-cased form is not an English stop word.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda token: token.lower() not in stop_words, tokens))
# A set has no defined order, so the ten words shown are an arbitrary sample.
print(f"Stop words: {list(stop_words)[:10]}...")
print(f"After removing stop words: {filtered_tokens}")

=== STOPWORD REMOVAL ===
Stop words: ["we're", 'while', 'mightn', 'weren', "i'm", 'very', 'can', 'after', "he'll", 'how']...
After removing stop words: ['running', 'dogs', 'quickly', 'chasing', 'beautiful', 'cats', 'gardens', '..', '!']


In [None]:
print("=== STEMMING vs LEMMATIZATION ===")
# Stemming: apply the Porter stemmer to every filtered token.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print(f"Stemmed words: {stemmed}")

=== STEMMING vs LEMMATIZATION ===
Stemmed words: ['run', 'dog', 'quickli', 'chase', 'beauti', 'cat', 'garden', '..', '!']


In [None]:
# Lemmatization: look each filtered token up in WordNet, treating it as a verb.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lambda token: lemmatizer.lemmatize(token, pos='v'), filtered_tokens))
print(f"Lemmatized words: {lemmatized}")

Lemmatized words: ['run', 'dog', 'quickly', 'chase', 'beautiful', 'cat', 'garden', '..', '!']


In [None]:
from nltk.util import ngrams
print("=== N-GRAMS ===")
# Build every run of 2 and of 3 consecutive tokens.
bigrams, trigrams = (list(ngrams(tokens, size)) for size in (2, 3))
print(
    f"Bigrams (2-word sequences): {bigrams}",
    f"Trigrams (3-word sequences): {trigrams}",
    sep="\n",
)

=== N-GRAMS ===
Bigrams (2-word sequences): [('The', 'running'), ('running', 'dogs'), ('dogs', 'were'), ('were', 'quickly'), ('quickly', 'chasing'), ('chasing', 'the'), ('the', 'beautiful'), ('beautiful', 'cats'), ('cats', 'through'), ('through', 'the'), ('the', 'gardens'), ('gardens', '..'), ('..', '!')]
Trigrams (3-word sequences): [('The', 'running', 'dogs'), ('running', 'dogs', 'were'), ('dogs', 'were', 'quickly'), ('were', 'quickly', 'chasing'), ('quickly', 'chasing', 'the'), ('chasing', 'the', 'beautiful'), ('the', 'beautiful', 'cats'), ('beautiful', 'cats', 'through'), ('cats', 'through', 'the'), ('through', 'the', 'gardens'), ('the', 'gardens', '..'), ('gardens', '..', '!')]


In [None]:
from nltk.metrics.distance import edit_distance
!pip install python-Levenshtein
from Levenshtein import distance

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

In [None]:
print("=== STRING DISTANCE METRICS ===")
word1 = "running"
word2 = "runing"  # Intentional typo
word3 = "walking"

# The typo pair is measured with both libraries; the unrelated pair with
# the Levenshtein package only.
nltk_distance, levenshtein_dist = edit_distance(word1, word2), distance(word1, word2)
different_words_dist = distance(word1, word3)

print(
    f"Distance between '{word1}' and '{word2}': {nltk_distance} (NLTK)",
    f"Distance between '{word1}' and '{word2}': {levenshtein_dist} (Levenshtein)",
    f"Distance between '{word1}' and '{word3}': {different_words_dist}",
    sep="\n",
)

=== STRING DISTANCE METRICS ===
Distance between 'running' and 'runing': 1 (NLTK)
Distance between 'running' and 'runing': 1 (Levenshtein)
Distance between 'running' and 'walking': 4


In [None]:
def demonstrate_pos_in_lemmatization():
    """
    Print a walkthrough of how the ``pos`` argument changes WordNet lemmas.

    Three sections are printed: each sample word lemmatized without a POS
    and with each of 'n', 'v', 'a' and 'r'; the word 'running' in two
    example sentences; and the word 'studies' across three short documents.

    Returns:
        None. All results are written to stdout.
    """
    wnl = WordNetLemmatizer()

    # (word, example sentence) pairs; the trailing note names the competing readings.
    samples = (
        ("running", "I am running fast"),           # verb vs adjective/noun
        ("better", "This is better than that"),     # adjective vs adverb
        ("studies", "She studies hard"),            # verb vs noun
        ("leaves", "The tree leaves in autumn"),    # verb vs noun
        ("flies", "Time flies quickly"),            # verb vs noun
    )

    # Literal line prefixes (padding included) paired with the POS code they describe.
    pos_rows = (
        ("  As noun (pos='n'):      ", 'n'),
        ("  As verb (pos='v'):      ", 'v'),
        ("  As adjective (pos='a'): ", 'a'),
        ("  As adverb (pos='r'):    ", 'r'),
    )

    print("=== LEMMATIZATION WITHOUT POS (Default Behavior) ===")
    print("When you don't specify POS, WordNet assumes the word is a NOUN by default")
    print("This is like assuming every word you encounter is a person's name!\n")

    for token, usage in samples:
        # No pos argument: the lemmatizer falls back to its default.
        default_result = wnl.lemmatize(token)
        print(f"Word: '{token}' (in context: '{usage}')")
        print(f"  Default lemmatization (assumes noun): '{token}' → '{default_result}'")

        # Same word under each explicit POS code, printed in table order.
        lemmas = [wnl.lemmatize(token, pos=code) for _, code in pos_rows]
        for (prefix, _), lemma in zip(pos_rows, lemmas):
            print(f"{prefix}'{token}' → '{lemma}'")
        print()

    print("\n=== WHY THIS MATTERS: A CONCRETE EXAMPLE ===")

    # The same surface form plays a different grammatical role in each sentence.
    sentence1 = "The running water was cold"      # "running" is an adjective here
    sentence2 = "I was running to the store"      # "running" is a verb here

    focus = 'running'
    focus_as_adj = wnl.lemmatize(focus, pos='a')
    focus_as_verb = wnl.lemmatize(focus, pos='v')
    focus_default = wnl.lemmatize(focus)

    print(f"Sentence 1: '{sentence1}'")
    print(f"  'running' as adjective: '{focus_as_adj}'")
    print(f"  'running' as verb: '{focus_as_verb}'")
    print(f"  'running' without POS: '{focus_default}'")

    print(f"\nSentence 2: '{sentence2}'")
    print("In this context, 'running' should be lemmatized as a verb!")
    print(f"  Correct lemmatization (verb): '{focus_as_verb}'")
    print(f"  Wrong assumption (default): '{focus_default}'")

    print("\n=== PRACTICAL IMPLICATIONS ===")

    # Three documents that all contain the token "studies".
    corpus = [
        "The company studies market trends carefully",     # "studies" = verb
        "Recent studies show interesting patterns",        # "studies" = noun
        "She studies computer science at university"       # "studies" = verb
    ]

    print("Consider these documents in a search engine:")
    target = "studies"
    for number, doc in enumerate(corpus, start=1):
        # Documents that do not contain the target token are skipped.
        if target not in word_tokenize(doc.lower()):
            continue

        default_lemma = wnl.lemmatize(target)
        verb_lemma = wnl.lemmatize(target, pos='v')
        noun_lemma = wnl.lemmatize(target, pos='n')

        print(f"\nDocument {number}: '{doc}'")
        print(f"  Default lemmatization: '{target}' → '{default_lemma}'")
        print(f"  As verb: '{target}' → '{verb_lemma}'")
        print(f"  As noun: '{target}' → '{noun_lemma}'")

    print("\n" + "=" * 60)
    for line in (
        "KEY INSIGHTS:",
        "• Without POS tags, lemmatization defaults to treating words as nouns",
        "• This can lead to missed connections between related concepts",
        "• In information retrieval, you might miss relevant documents",
        "• For better accuracy, you need POS tagging before lemmatization",
        "• This is why advanced NLP pipelines always include POS tagging!",
    ):
        print(line)

def show_pos_tagging_integration():
    """
    Shows how to properly integrate POS tagging with lemmatization.

    Tokenizes a fixed sample sentence, tags each token with nltk.pos_tag,
    maps every tag to a WordNet POS code and prints the lemma obtained with
    that code. Tokens that are not purely alphabetic are skipped.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n{'='*60}")
    print("PROPER INTEGRATION: POS TAGGING + LEMMATIZATION")

    # First, we need to download the POS tagger model. It is published under
    # two resource names: current NLTK releases load
    # 'averaged_perceptron_tagger_eng', older ones load
    # 'averaged_perceptron_tagger'. Fetching only the old name makes
    # nltk.pos_tag raise LookupError on a current install, so fetch both.
    for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
        nltk.download(resource, quiet=True)

    text = "The running dogs were quickly studying the flying birds"
    tokens = word_tokenize(text)

    # Get POS tags for each word
    pos_tags = nltk.pos_tag(tokens)

    print(f"Original text: '{text}'")
    print(f"Tokens with POS tags: {pos_tags}")

    # First letter of the tagger's tag -> WordNet POS code.
    wordnet_pos_by_initial = {
        'J': 'a',  # adjective
        'V': 'v',  # verb
        'N': 'n',  # noun
        'R': 'r',  # adverb
    }

    def get_wordnet_pos(treebank_tag):
        """
        Converts NLTK's POS tags to WordNet's POS tags.

        Only the first letter of the tag is inspected; any tag that does not
        start with J, V, N or R (including an empty tag) maps to 'n'.
        """
        return wordnet_pos_by_initial.get(treebank_tag[:1], 'n')

    lemmatizer = WordNetLemmatizer()

    print("\nIntelligent lemmatization using POS tags:")
    for word, pos_tag in pos_tags:
        if not word.isalpha():
            # Only process alphabetic words
            continue
        wordnet_pos = get_wordnet_pos(pos_tag)
        lemmatized = lemmatizer.lemmatize(word.lower(), pos=wordnet_pos)

        print(f"  '{word}' (POS: {pos_tag} → {wordnet_pos}) → '{lemmatized}'")

In [None]:
# Run the POS/lemmatization walkthrough; it prints its results and returns nothing.
demonstrate_pos_in_lemmatization()

=== LEMMATIZATION WITHOUT POS (Default Behavior) ===
When you don't specify POS, WordNet assumes the word is a NOUN by default
This is like assuming every word you encounter is a person's name!

Word: 'running' (in context: 'I am running fast')
  Default lemmatization (assumes noun): 'running' → 'running'
  As noun (pos='n'):      'running' → 'running'
  As verb (pos='v'):      'running' → 'run'
  As adjective (pos='a'): 'running' → 'running'
  As adverb (pos='r'):    'running' → 'running'

Word: 'better' (in context: 'This is better than that')
  Default lemmatization (assumes noun): 'better' → 'better'
  As noun (pos='n'):      'better' → 'better'
  As verb (pos='v'):      'better' → 'better'
  As adjective (pos='a'): 'better' → 'good'
  As adverb (pos='r'):    'better' → 'well'

Word: 'studies' (in context: 'She studies hard')
  Default lemmatization (assumes noun): 'studies' → 'study'
  As noun (pos='n'):      'studies' → 'study'
  As verb (pos='v'):      'studies' → 'study'
  As a

In [None]:
import string

def explore_punctuation_removal():
    """
    Deep dive into how str.maketrans() and translate() work together
    to remove punctuation from text efficiently.

    The tour prints, in order: the contents of string.punctuation, the
    translation table built from it, a character-by-character simulation of
    translate(), a timing comparison against two alternative approaches,
    the other maketrans() call forms, and a small tokenization example.

    Returns:
        None. Everything is written to stdout.
    """
    # Only the timing section needs this module.
    import time

    def _speedup(other, baseline):
        # A run can finish within a single clock tick, leaving a 0.0 baseline;
        # report infinity instead of raising ZeroDivisionError.
        return other / baseline if baseline > 0 else float("inf")

    text = "Hello world! NLP is exciting."

    print("=== UNDERSTANDING THE COMPONENTS ===\n")

    # First, let's see what string.punctuation contains
    print("1. What is string.punctuation?")
    print(f"   Contents: '{string.punctuation}'")
    print("   These are all ASCII punctuation characters")
    print(f"   Length: {len(string.punctuation)} characters\n")

    # Now let's understand str.maketrans()
    print("2. Understanding str.maketrans()")
    print("   This function creates a TRANSLATION TABLE - think of it as a dictionary")
    print("   that tells Python how to map characters from one set to another.\n")

    # The three-argument form: maketrans(from, to, delete)
    # maketrans("", "", string.punctuation) means:
    # - from: "" (empty - don't replace anything)
    # - to: "" (empty - don't replace with anything)
    # - delete: string.punctuation (delete these characters)
    translation_table = str.maketrans("", "", string.punctuation)

    print(f"   Translation table type: {type(translation_table)}")
    print("   It's a dictionary that maps character codes to actions")
    print("   Sample entries (first 5 punctuation marks):")

    # Show how the translation table works
    for char in string.punctuation[:5]:
        print(f"     '{char}' (code {ord(char)}) → None (means DELETE)")

    print(f"\n   Total mappings in table: {len(translation_table)}\n")

    # Now let's see translate() in action step by step
    print("3. How translate() applies the translation table")
    print(f"   Original text: '{text}'")
    print("   Processing character by character:\n")

    # Manual simulation of what translate() does. Kept characters are
    # collected in a list and joined once instead of growing a string.
    kept_chars = []
    for i, char in enumerate(text):
        if ord(char) in translation_table:
            # Character is in translation table (it's punctuation)
            print(f"     Position {i}: '{char}' → DELETED (found in table)")
        else:
            # Character is not in translation table (keep it)
            kept_chars.append(char)
            print(f"     Position {i}: '{char}' → KEPT")
    result = "".join(kept_chars)

    print(f"\n   Result after manual processing: '{result}'")

    # Now do it with the actual translate method
    cleaned_text = text.translate(translation_table)
    print(f"   Result using translate(): '{cleaned_text}'")
    print(f"   Both methods produce identical results: {result == cleaned_text}\n")

    print("=== WHY USE THIS APPROACH? EFFICIENCY! ===\n")

    # Compare different approaches to removing punctuation.
    # perf_counter() is the high-resolution clock intended for measuring
    # short durations; time.time() can be too coarse for work this small.
    test_text = text * 1000  # Repeat text 1000 times

    print("Let's compare three different methods on larger text:\n")

    # Method 1: Using translate (table construction is part of the one-liner
    # being measured, so it stays inside the timed region)
    start = time.perf_counter()
    result1 = test_text.translate(str.maketrans("", "", string.punctuation))
    time1 = time.perf_counter() - start
    print(f"Method 1 - translate(): {time1:.6f} seconds")
    print("   Pros: Fastest, concise, happens at C level in Python")
    print("   Cons: Less intuitive at first glance\n")

    # Method 2: Using list comprehension
    start = time.perf_counter()
    result2 = ''.join([char for char in test_text if char not in string.punctuation])
    time2 = time.perf_counter() - start
    print(f"Method 2 - list comprehension: {time2:.6f} seconds")
    print("   Pros: More readable, Pythonic")
    print("   Cons: Slower, creates intermediate list\n")

    # Method 3: Using replace in a loop
    start = time.perf_counter()
    result3 = test_text
    for punct in string.punctuation:
        result3 = result3.replace(punct, "")
    time3 = time.perf_counter() - start
    print(f"Method 3 - multiple replace(): {time3:.6f} seconds")
    print("   Pros: Very intuitive, easy to understand")
    print("   Cons: Slowest, creates many intermediate strings\n")

    print("Speed comparison:")
    print(f"   translate() is {_speedup(time2, time1):.1f}x faster than list comprehension")
    print(f"   translate() is {_speedup(time3, time1):.1f}x faster than multiple replace()\n")

    # Verify all methods produce the same result
    print(f"All methods produce identical output: {result1 == result2 == result3}\n")

    print("=== ALTERNATIVE FORMS OF str.maketrans() ===\n")

    # str.maketrans() can be used in different ways
    print("The maketrans() method has multiple signatures:\n")

    # One argument: dictionary mapping
    print("1. Dictionary form (most flexible):")
    custom_table = str.maketrans({
        '!': None,      # Delete exclamation marks
        '.': None,      # Delete periods
        'o': '0',       # Replace 'o' with '0'
        'l': '1'        # Replace 'l' with '1'
    })
    print(f"   Input:  '{text}'")
    print(f"   Output: '{text.translate(custom_table)}'")
    print("   Used when you want fine control over specific characters\n")

    # Two arguments: character-by-character replacement
    print("2. Two-string form (character substitution):")
    vowel_table = str.maketrans("aeiou", "43!0*")
    sample = "Hello Python"
    print(f"   Input:  '{sample}'")
    print(f"   Output: '{sample.translate(vowel_table)}'")
    print("   Each character in first string maps to corresponding character in second\n")

    # Three arguments: replace and delete (what you're using)
    print("3. Three-argument form (replace + delete):")
    print("   maketrans(from_chars, to_chars, delete_chars)")
    print("   Your code uses: maketrans('', '', string.punctuation)")
    print("   Meaning: replace nothing, delete all punctuation\n")

    print("=== WHY THIS MATTERS IN NLP ===\n")

    documents = [
        "Hello, world! How are you?",
        "Machine learning is amazing!!!",
        "Natural Language Processing (NLP) transforms text."
    ]

    print("Consider these documents for text analysis:")
    for i, doc in enumerate(documents, 1):
        # The deletion table does not depend on the document, so reuse it.
        clean = doc.translate(translation_table)
        tokens = clean.split()
        print(f"\nDocument {i}: '{doc}'")
        print(f"  Cleaned: '{clean}'")
        print(f"  Tokens: {tokens}")
        print("  Without punctuation removal, '!' and '?' would be tokens")
        print("  This would pollute your vocabulary and skew word frequencies")

    print("\n=== KEY TAKEAWAYS ===")
    print("• str.maketrans() creates a character mapping table (dictionary)")
    print("• translate() applies that table efficiently in one pass")
    print("• This is the fastest way to remove or replace characters in Python")
    print("• Essential for text preprocessing in NLP pipelines")
    print("• Removes noise (punctuation) to focus on meaningful content (words)")

In [None]:
# Run the punctuation-removal walkthrough; it prints its results and returns nothing.
explore_punctuation_removal()

=== UNDERSTANDING THE COMPONENTS ===

1. What is string.punctuation?
   Contents: '!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'
   These are all ASCII punctuation characters
   Length: 32 characters

2. Understanding str.maketrans()
   This function creates a TRANSLATION TABLE - think of it as a dictionary
   that tells Python how to map characters from one set to another.

   Translation table type: <class 'dict'>
   It's a dictionary that maps character codes to actions
   Sample entries (first 5 punctuation marks):
     '!' (code 33) → None (means DELETE)
     '"' (code 34) → None (means DELETE)
     '#' (code 35) → None (means DELETE)
     '$' (code 36) → None (means DELETE)
     '%' (code 37) → None (means DELETE)

   Total mappings in table: 32

3. How translate() applies the translation table
   Original text: 'Hello world! NLP is exciting.'
   Processing character by character:

     Position 0: 'H' → KEPT
     Position 1: 'e' → KEPT
     Position 2: 'l' → KEPT
     Position 3: 'l' → KEPT