Write a Python program to load a text file, perform tokenization, calculate the term frequency (TF) of each token, and display the top 5 most frequent tokens.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import string

# Function to calculate term frequency (TF)
def calculate_tf(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens (for example a list of words).

    Returns:
        A ``Counter`` mapping each distinct token to its number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies

# Function to read a text file and perform tokenization
def process_text(file_path):
    """Read a text file and return its lowercase alphabetic tokens.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        A list of lowercased tokens, keeping only those tokens produced by
        ``word_tokenize`` for which ``str.isalpha()`` is true. An empty list
        is returned (after printing an error message) when the file cannot
        be read.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return []
    except (OSError, UnicodeDecodeError) as error:
        # Directories, permission problems and undecodable bytes are reported
        # and mapped to "no tokens" instead of crashing with a traceback.
        print(f"Error: Could not read file: {error}")
        return []

    # Tokenize, then drop punctuation/numeric tokens and normalize case.
    tokens = word_tokenize(text)
    return [word.lower() for word in tokens if word.isalpha()]

def main():
    """Print the five most frequent tokens found in ``sample.txt``.

    Nothing is printed here when ``process_text`` returns no tokens.
    """
    file_path = "sample.txt"  # Replace with your file path

    tokens = process_text(file_path)
    if not tokens:
        # process_text returned an empty list, so there is nothing to rank.
        return

    print("Top 5 most frequent tokens:")
    top_five = calculate_tf(tokens).most_common(5)
    for token, frequency in top_five:
        print(f"{token}: {frequency}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Error: File not found. Please check the file path.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Function to load text from a file
def load_text(file_path):
    """Return the contents of a text file, or an empty string on failure.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        The full file contents as a string. If the file cannot be read, an
        error message is printed and ``""`` is returned.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
    except (OSError, UnicodeDecodeError) as error:
        # Directories, permission problems and undecodable bytes are reported
        # and mapped to "no text" instead of crashing with a traceback.
        print(f"Error: Could not read file: {error}")
    return ""

# Function to perform tokenization and calculate term frequency
def calculate_tf(text):
    """Return the five highest-weighted terms of ``text``.

    The text is treated as a single document and vectorized with
    ``TfidfVectorizer`` configured with English stop words removed and IDF
    weighting disabled.

    Args:
        text: The document to analyze, as one string.

    Returns:
        A list of at most five ``(term, value)`` tuples, ordered by value
        from highest to lowest.
        NOTE(review): the values are whatever ``TfidfVectorizer`` emits with
        ``use_idf=False``; presumably normalized weights rather than raw
        counts - confirm against the scikit-learn documentation.
    """
    vectorizer = TfidfVectorizer(stop_words='english', use_idf=False)

    # A one-document corpus yields a single-row matrix; flatten it to 1-D so
    # each value lines up with its entry in the vocabulary.
    weights = vectorizer.fit_transform([text]).toarray().flatten()
    vocabulary = vectorizer.get_feature_names_out()

    ranked = sorted(
        zip(vocabulary, weights),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:5]

def main():
    """Print the five top-ranked tokens of ``sample.txt``.

    Nothing is printed here when ``load_text`` returns an empty string.
    """
    file_path = "sample.txt"  # Replace with your file path

    text = load_text(file_path)
    if not text:
        # load_text returned an empty string, so there is nothing to rank.
        return

    top_tokens = calculate_tf(text)

    print("Top 5 most frequent tokens:")
    for token, frequency in top_tokens:
        print(f"{token}: {frequency}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Error: File not found. Please check the file path.
