In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

# Fetch the NLTK data packages this script relies on: the Punkt tokenizer
# models (used by word_tokenize) and the stop-word lists.
# NOTE(review): recent NLTK releases may look up a 'punkt_tab' resource for
# word_tokenize instead -- confirm against the installed NLTK version.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

def preprocess_text(text):
    """Turn raw text into a list of stemmed, lower-cased content tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric (punctuation, tokens containing
    apostrophes or hyphens, ...) and English stop words are discarded, and
    each surviving token is reduced to its Porter stem.

    Args:
        text: The input string to normalise.

    Returns:
        A list of stemmed tokens in their original order; duplicates are
        kept so callers can count frequencies.
    """
    raw_tokens = word_tokenize(text.lower())

    # A set gives O(1) membership tests while filtering.
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Filter and stem in a single pass; the filter runs on the unstemmed
    # token, so stop-word matching is done before stemming.
    return [
        porter.stem(token)
        for token in raw_tokens
        if token.isalnum() and token not in english_stops
    ]

def generate_representative_doc(text, num_terms=50):
    """Summarise *text* as a space-separated string of its most frequent stems.

    The text is run through ``preprocess_text`` and the resulting stems are
    counted. The ``num_terms`` most frequent stems are joined with single
    spaces, most frequent first; stems with equal counts keep the order in
    which they first appeared in the text.

    Args:
        text: The raw input text.
        num_terms: Maximum number of distinct stems to include. Defaults to
            50; pass ``None`` to include every stem.

    Returns:
        The representative document as a single string. It is empty when
        preprocessing leaves no tokens.
    """
    term_counts = Counter(preprocess_text(text))

    # most_common() orders by descending count and keeps first-seen order
    # for ties, matching a stable sort with reverse=True followed by a slice.
    top_terms = term_counts.most_common(num_terms)

    return ' '.join(term for term, _count in top_terms)

def main(input_path='Lab_1.txt', output_path='output_representative_doc.txt'):
    """Build the representative document for a text file, print it and save it.

    Args:
        input_path: Path of the text file to summarise.
        output_path: Path of the file the representative document is
            written to (overwritten if it already exists).

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # Use an explicit encoding: the platform default differs between systems
    # (e.g. cp1252 on Windows), which makes results non-portable and can make
    # writing non-ASCII stems fail.
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Generate the representative document
    representative_doc = generate_representative_doc(text)

    # Output the result
    print("Representative Document:")
    print(representative_doc)

    # Also save the output to a file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(representative_doc)

# Run main() only when this code is executed directly, not when it is imported
# as a module.
if __name__ == '__main__':
    main()


Representative Document:
learn machin use data predict tool make autom appli healthcar financ market detect field artifici intellig focus algorithm imit way human power identifi pattern process wide rang area transport patient outcom diseas person treatment plan fraud trade invest decis also target advertis segment custom sale trend rise big becom essenti


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
