In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from collections import Counter
import re

In [3]:

# --- NLTK Resource Download ---
# The following lines download the necessary NLTK data.
# You only need to run this once per environment. If you have them downloaded,
# you can comment these lines out.

In [None]:

# --- Helper Function for Part-of-Speech (POS) Tagging ---
def get_wordnet_pos(treebank_tag):
    """
    Maps treebank POS tags to WordNet POS tags.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # By default, return NOUN if the tag is not recognized
        return wordnet.NOUN

# --- Main Function to Process Text ---
def get_root_word_frequencies(text):

    # 1. Clean and Normalize Text
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    lower_text = cleaned_text.lower()

    # 2. Tokenization
    tokens = word_tokenize(lower_text)

    # 3. Part-of-Speech (POS) Tagging
    pos_tagged_tokens = nltk.pos_tag(tokens)

    # 4. Lemmatization (Finding the Root Word)
    lemmatizer = nltk.WordNetLemmatizer()
    root_words = []
    for word, tag in pos_tagged_tokens:
        pos = get_wordnet_pos(tag)
        root_word = lemmatizer.lemmatize(word, pos=pos)
        
        # If the word is a noun or adjective and didn't change,
        # try lemmatizing it again as a verb. This helps find the
        # core concept (e.g., 'interaction' -> 'interact').
        if (pos == wordnet.NOUN or pos == wordnet.ADJ) and root_word == word:
            root_word = lemmatizer.lemmatize(word, pos=wordnet.VERB)

        root_words.append(root_word)
        
    # 5. Frequency Counting
    frequency_counts = Counter(root_words)
    
    return frequency_counts

# --- Example Usage ---
# The script will now ask for user input when the cell is run.
print("Please paste the text you want to analyze below and press Enter.")
user_text = input()

# Get the frequency of root words from the user's text
word_frequencies = get_root_word_frequencies(user_text)

# --- Display Results ---
print("\n--- Root Word Frequencies ---")
print(f"Found {len(word_frequencies)} unique root words.\n")
    
# Sort the results by frequency in descending order for better readability
sorted_frequencies = word_frequencies.most_common()

for word, count in sorted_frequencies:
    print(f"- '{word}': {count}")


Please paste the text you want to analyze below and press Enter.


 They are considered different base words by the dictionary. While conceptually related, forcing the script to change a noun into its adjective form is outside the scope of standard lemmatization and could cause other words to be converted incorrectly. For this project, sticking to the dictionary root is the most reliable approach.


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\vipra/nltk_data'
    - 'C:\\Users\\vipra\\anaconda3\\nltk_data'
    - 'C:\\Users\\vipra\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\vipra\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\vipra\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [15]:
# It's a good practice to import all libraries at the top of the cell
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from collections import Counter
import re

# --- NLTK Resource Download ---
# This is a more robust way to ensure the necessary NLTK data is downloaded.
# It will open a downloader window if the packages are not found.
# If you are in a non-graphical environment, this might hang.
# In that case, run these lines in a separate Python script first.
print("Checking for NLTK resources...")
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
print("NLTK resources are ready.")


# --- Helper Function for Part-of-Speech (POS) Tagging ---
def get_wordnet_pos(treebank_tag):
    """
    Maps treebank POS tags to WordNet POS tags.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # By default, return NOUN if the tag is not recognized
        return wordnet.NOUN

# --- Main Function to Process Text ---
def get_root_word_frequencies(text):

    # 1. Clean and Normalize Text
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    lower_text = cleaned_text.lower()

    # 2. Tokenization
    tokens = word_tokenize(lower_text)

    # 3. Part-of-Speech (POS) Tagging
    pos_tagged_tokens = nltk.pos_tag(tokens)

    # 4. Lemmatization (Finding the Root Word)
    lemmatizer = nltk.WordNetLemmatizer()
    root_words = []
    for word, tag in pos_tagged_tokens:
        pos = get_wordnet_pos(tag)
        root_word = lemmatizer.lemmatize(word, pos=pos)
        
        # If the word is a noun or adjective and didn't change,
        # try lemmatizing it again as a verb. This helps find the
        # core concept (e.g., 'interaction' -> 'interact').
        if (pos == wordnet.NOUN or pos == wordnet.ADJ) and root_word == word:
            root_word = lemmatizer.lemmatize(word, pos=wordnet.VERB)

        root_words.append(root_word)
        
    # 5. Frequency Counting
    frequency_counts = Counter(root_words)
    
    return frequency_counts

# --- Example Usage ---
# The script will now ask for user input when the cell is run.
print("\nPlease paste the text you want to analyze below and press Enter.")
user_text = input()

# Get the frequency of root words from the user's text
word_frequencies = get_root_word_frequencies(user_text)

# --- Display Results ---
print("\n--- Root Word Frequencies ---")
print(f"Found {len(word_frequencies)} unique root words.\n")
    
# Sort the results by frequency in descending order for better readability
sorted_frequencies = word_frequencies.most_common()

for word, count in sorted_frequencies:

    print(f"- '{word}': {count}")


Checking for NLTK resources...
NLTK resources are ready.

Please paste the text you want to analyze below and press Enter.


 The unstructured nature of the scene required a complete reconstruction of the actor's process; his actions and reactions were critical for the construction of a believable interaction, but his tendency for overacting ultimately led to the scene's destruction.



--- Root Word Frequencies ---
Found 28 unique root words.

- 'the': 5
- 'of': 3
- 'scene': 2
- 'a': 2
- 'his': 2
- 'for': 2
- 'unstructured': 1
- 'nature': 1
- 'require': 1
- 'complete': 1
- 'reconstruction': 1
- 'actor': 1
- 'process': 1
- 'action': 1
- 'and': 1
- 'reaction': 1
- 'be': 1
- 'critical': 1
- 'construction': 1
- 'believable': 1
- 'interaction': 1
- 'but': 1
- 'tendency': 1
- 'overact': 1
- 'ultimately': 1
- 'lead': 1
- 'to': 1
- 'destruction': 1


In [19]:
# It's a good practice to import all libraries at the top of the cell
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer # Importing the Porter Stemmer
from collections import Counter
import re

# --- NLTK Resource Download ---
# This ensures the 'punkt' tokenizer is available.
print("Checking for NLTK resources...")
nltk.download('punkt', quiet=True)
print("NLTK resources are ready.")


# --- Main Function to Process Text using Stemming ---
def get_stemmed_word_frequencies(text):


    # 1. Clean and Normalize Text
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    lower_text = cleaned_text.lower()

    # 2. Tokenization
    tokens = word_tokenize(lower_text)

    # 3. Stemming (Finding the Root Stem)
    # This is more aggressive than lemmatization and will chop off
    # prefixes and suffixes.
    stemmer = PorterStemmer()
    stemmed_words = []
    for word in tokens:
        stem = stemmer.stem(word)
        stemmed_words.append(stem)
        
    # 4. Frequency Counting
    frequency_counts = Counter(stemmed_words)
    
    return frequency_counts

# --- Example Usage ---
# The script will now ask for user input when the cell is run.
print("\nPlease paste the text you want to analyze below and press Enter.")
user_text = input()

# Get the frequency of root words from the user's text
word_frequencies = get_stemmed_word_frequencies(user_text)

# --- Display Results ---
print("\n--- Stemmed Word Frequencies ---")
print(f"Found {len(word_frequencies)} unique word stems.\n")
    
# Sort the results by frequency in descending order for better readability
sorted_frequencies = word_frequencies.most_common()

for word, count in sorted_frequencies:
    print(f"- '{word}': {count}")


Checking for NLTK resources...
NLTK resources are ready.

Please paste the text you want to analyze below and press Enter.


 The disagreeable hater's deep-seated hatred was not well understood by the loving, lovable child, who only loves everyone. For true understanding, one must have understood the core disagreement, as simply hating is not an agreeable solution. The final compilation required a complete recompilation of all previously compiled modules



--- Stemmed Word Frequencies ---
Found 41 unique word stems.

- 'the': 4
- 'not': 2
- 'understood': 2
- 'love': 2
- 'compil': 2
- 'disagre': 1
- 'hater': 1
- 'deepseat': 1
- 'hatr': 1
- 'wa': 1
- 'well': 1
- 'by': 1
- 'lovabl': 1
- 'child': 1
- 'who': 1
- 'onli': 1
- 'everyon': 1
- 'for': 1
- 'true': 1
- 'understand': 1
- 'one': 1
- 'must': 1
- 'have': 1
- 'core': 1
- 'disagr': 1
- 'as': 1
- 'simpli': 1
- 'hate': 1
- 'is': 1
- 'an': 1
- 'agreeabl': 1
- 'solut': 1
- 'final': 1
- 'requir': 1
- 'a': 1
- 'complet': 1
- 'recompil': 1
- 'of': 1
- 'all': 1
- 'previous': 1
- 'modul': 1


In [None]:
# It's a good practice to import all libraries at the top of the cell
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer # Importing the Porter Stemmer
from collections import Counter
import re

# --- NLTK Resource Download ---
# This ensures the 'punkt' tokenizer is available.
print("Checking for NLTK resources...")
nltk.download('punkt', quiet=True)
print("NLTK resources are ready.")


# --- Main Function to Process Text using Stemming ---
def get_stemmed_word_frequencies(text):

    # 1. Clean and Normalize Text
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    lower_text = cleaned_text.lower()

    # 2. Tokenization
    tokens = word_tokenize(lower_text)

    # 3. Stemming (Finding the Root Stem)
    # This is more aggressive than lemmatization and will chop off
    # prefixes and suffixes.
    stemmer = PorterStemmer()
    stemmed_words = []
    for word in tokens:
        stem = stemmer.stem(word)
        stemmed_words.append(stem)
        
    # 4. Frequency Counting
    frequency_counts = Counter(stemmed_words)
    
    return frequency_counts

# --- Example Usage ---
# The script will now ask for user input when the cell is run.
print("\nPlease paste the text you want to analyze below and press Enter.")
user_text = input()

# Get the frequency of root words from the user's text
word_frequencies = get_stemmed_word_frequencies(user_text)

# --- Display Results ---
print("\n--- Stemmed Word Frequencies ---")
print(f"Found {len(word_frequencies)} unique word stems.\n")
    
# Sort the results by frequency in descending order for better readability
sorted_frequencies = word_frequencies.most_common()

for word, count in sorted_frequencies:
    print(f"- '{word}': {count}")


Checking for NLTK resources...
NLTK resources are ready.

Please paste the text you want to analyze below and press Enter.


 You have an incredibly sharp eye! Your observations are spot on, and they highlight the most important characteristic of stemming.  You are correct:  unstructured became unstructur  believable became believ  tendency became tendenc  ultimately became ultim  This is not an error in the code, but rather the exact, expected behavior of the Porter Stemmer algorithm we are using.  Here's why:  A stemmer's only goal is to aggressively chop down words so that related words end up with the exact same stem. It follows a strict set of rules and doesn't care if the final stem is a real dictionary word.  It ensures that believes, believing, and believable all get chopped down to the same root: believ.  It ensures that tendency and tendencies both get chopped down to tendenc.  For the purpose of counting word frequencies, this works perfectly because it groups all variations under one common identifier, even if that identifier looks a bit strange to us.  So, while the output might look "wrong" fro


--- Stemmed Word Frequencies ---
Found 113 unique word stems.

- 'the': 11
- 'believ': 6
- 'a': 6
- 'word': 6
- 'and': 5
- 'of': 5
- 'tendenc': 5
- 'it': 5
- 'stem': 4
- 'becam': 4
- 'to': 4
- 'that': 4
- 'are': 3
- 'is': 3
- 'chop': 3
- 'down': 3
- 'you': 2
- 'an': 2
- 'correct': 2
- 'unstructur': 2
- 'ultim': 2
- 'thi': 2
- 'not': 2
- 'exact': 2
- 'stemmer': 2
- 'algorithm': 2
- 'goal': 2
- 'aggress': 2
- 'so': 2
- 'same': 2
- 'if': 2
- 'real': 2
- 'ensur': 2
- 'all': 2
- 'get': 2
- 'for': 2
- 'group': 2
- 'identifi': 2
- 'look': 2
- 'alway': 2
- 'have': 1
- 'incred': 1
- 'sharp': 1
- 'eye': 1
- 'your': 1
- 'observ': 1
- 'spot': 1
- 'on': 1
- 'they': 1
- 'highlight': 1
- 'most': 1
- 'import': 1
- 'characterist': 1
- 'error': 1
- 'in': 1
- 'code': 1
- 'but': 1
- 'rather': 1
- 'expect': 1
- 'behavior': 1
- 'porter': 1
- 'we': 1
- 'use': 1
- 'here': 1
- 'whi': 1
- 'onli': 1
- 'relat': 1
- 'end': 1
- 'up': 1
- 'with': 1
- 'follow': 1
- 'strict': 1
- 'set': 1
- 'rule': 1
- 'doesnt': 1
- 