In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import string

# Ensure the tokenizer models needed by word_tokenize are available.
# Older NLTK releases load the 'punkt' package, while recent releases look
# up 'punkt_tab' instead; requesting both keeps the cell working on either.
# nltk.download reports (rather than raises on) a package it cannot fetch,
# and skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

def calculate_term_frequency(file_path, top_n=5):
    """
    Return the most frequent alphanumeric tokens in a text file.

    The file is read as UTF-8, split into tokens with NLTK's word_tokenize,
    lowercased, and counted. Only tokens for which str.isalnum() is true are
    kept, so punctuation marks AND any token containing a non-alphanumeric
    character (e.g. "n't", "well-known") are excluded from the counts.

    Parameters:
    file_path (str): The path to the text file.
    top_n (int): The number of top frequent tokens to return.

    Returns:
    list: A list of (token, frequency) tuples, most frequent first. An empty
          list is returned (and a message is printed) when the file does
          not exist.

    Raises:
    UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Keep the try body limited to the file access so that only a missing
    # input file is reported as "File not found"; errors raised later by the
    # tokenizer are not silently misattributed to the input path.
    try:
        # An explicit encoding avoids depending on the platform's locale
        # default, which differs between operating systems.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []

    # Tokenize the text
    tokens = word_tokenize(text)

    # Normalize: lowercase, and keep purely alphanumeric tokens only
    tokens = [token.lower() for token in tokens if token.isalnum()]

    # Count occurrences and return the top_n most frequent tokens
    return FreqDist(tokens).most_common(top_n)

# Test the function
file_path = 'sample.txt'  # Replace with the path to your text file
top_n = 5  # Used for both the query and the header so they cannot drift apart
top_tokens = calculate_term_frequency(file_path, top_n)

# Display the results (the list is empty when the file could not be found)
print(f"Top {top_n} Most Frequent Tokens:")
for token, frequency in top_tokens:
    print(f"{token}: {frequency}")


File not found: sample.txt
Top 5 Most Frequent Tokens:


[nltk_data] Downloading package punkt to C:\Users\SHAIK
[nltk_data]     TOUFIQSAHEB\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
