In [8]:
import os
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk


# Fetch the NLTK data used by the pipeline below.
# word_tokenize relies on the Punkt sentence tokenizer: older NLTK releases
# load it from the 'punkt' package, while newer ones (3.8.2+) look for
# 'punkt_tab' instead, so both are requested to work on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNetLemmatizer needs the WordNet corpus.
nltk.download('wordnet')


def preprocess_text(file_path):
    """Run a small NLP demo pipeline over a text file.

    The file is read as UTF-8, stopwords are stripped with gensim's
    ``remove_stopwords``, the remaining text is split with NLTK's
    ``word_tokenize``, and the tokens are then both Porter-stemmed and
    WordNet-lemmatized. The result of every stage is printed.

    Args:
        file_path: Path of the text file to process.

    Returns:
        The list of lemmatized tokens, or ``None`` when the file does not
        exist (a "not found" message is printed in that case).
    """
    # Open the file directly rather than testing os.path.exists() first:
    # that avoids the window in which the file could vanish between the
    # check and the open, and keeps the "missing file" handling in one place.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None

    print("Original Text:\n", text)

    # Stopwords are removed from the raw string, i.e. before tokenization.
    # NOTE(review): gensim's matching looks case-sensitive (a capitalised
    # "It" is kept) -- confirm whether the text should be lower-cased first.
    text = remove_stopwords(text)
    print("\nText after removing stopwords:\n", text)

    tokens = word_tokenize(text)
    print("\nTokenized Text:\n", tokens)

    # Stemming is shown for comparison only; the stems are not returned.
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    print("\nStemmed Tokens:\n", stemmed_tokens)

    # Lemmatization runs on the original tokens (not the stems), and no
    # part-of-speech tag is passed to the lemmatizer.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    print("\nLemmatized Tokens:\n", lemmatized_tokens)
    return lemmatized_tokens


# Demo driver: make sure a sample input file exists, then run the pipeline.
file_name = "sample.txt"
try:
    # Mode 'x' creates the file only if it is absent, so an existing
    # sample.txt is never overwritten and there is no gap between the
    # existence check and the write.
    with open(file_name, 'x', encoding='utf-8') as file:
        # Build the text from explicit lines so that source-code indentation
        # does not leak into the sample file as stray leading spaces.
        file.write(
            "Gensim is a robust library for unsupervised topic modeling "
            "and natural language processing.\n"
            "It is user-friendly and has efficient implementations."
        )
except FileExistsError:
    pass  # A sample file is already present; reuse it as-is.
else:
    print(f"Sample text file '{file_name}' created.")

processed_tokens = preprocess_text(file_name)
# preprocess_text returns None for a missing file; an empty token list is
# skipped here as well.
if processed_tokens:
    print("\nFinal Processed Tokens:\n", processed_tokens)


Original Text:
 Gensim is a robust library for unsupervised topic modeling and natural language processing.
    It is user-friendly and has efficient implementations.

Text after removing stopwords:
 Gensim robust library unsupervised topic modeling natural language processing. It user-friendly efficient implementations.

Tokenized Text:
 ['Gensim', 'robust', 'library', 'unsupervised', 'topic', 'modeling', 'natural', 'language', 'processing', '.', 'It', 'user-friendly', 'efficient', 'implementations', '.']

Stemmed Tokens:
 ['gensim', 'robust', 'librari', 'unsupervis', 'topic', 'model', 'natur', 'languag', 'process', '.', 'it', 'user-friendli', 'effici', 'implement', '.']

Lemmatized Tokens:
 ['Gensim', 'robust', 'library', 'unsupervised', 'topic', 'modeling', 'natural', 'language', 'processing', '.', 'It', 'user-friendly', 'efficient', 'implementation', '.']

Final Processed Tokens:
 ['Gensim', 'robust', 'library', 'unsupervised', 'topic', 'modeling', 'natural', 'language', 'processing', '.', 'It', 'user-friendly', 'efficient', 'implementation', '.']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
