In [None]:
# Conflation Algorithm: Generate Document Representative of a Text File
# Using Stemming and Lemmatization

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string


# Fetch the NLTK data packages used in this cell (only needed once per environment).
# The packages are requested one after another, in the order listed.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Step 1: Load the whole input document into memory
filename = "Conflation.txt"   # path of the text file to process; change as needed
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

separator = "-" * 80
print("Original Text:\n", text)
print(separator)

# Step 2: Lower-case the text and split it into tokens
raw_tokens = word_tokenize(text.lower())

# Step 3: Keep purely alphabetic tokens that are not English stopwords
# (isalpha() rejects punctuation as well as tokens containing digits)
stop_words = set(stopwords.words('english'))
tokens = [tok for tok in raw_tokens if tok.isalpha() and tok not in stop_words]

# Step 4: Reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))

# Step 5: Lemmatize the same tokens with WordNet
# (lemmatize() receives only the word, so its default part of speech applies)
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))

# Step 6: Show both conflated forms of the document
stemmed_text = ' '.join(stemmed_words)
lemmatized_text = ' '.join(lemmatized_words)
print("After Stemming:\n", stemmed_text)
print(separator)
print("After Lemmatization:\n", lemmatized_text)
print(separator)

# Step 7: Build the document representative: the ten most frequent stems
from collections import Counter
freq = Counter(stemmed_words)
print("Document Representative (Word Frequency):")
for word, count in freq.most_common(10):
    print(f"{word}: {count}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:
 Information retrieval is the process of obtaining information from large collections of text.
It involves searching, indexing, and ranking documents based on user queries.

--------------------------------------------------------------------------------
After Stemming:
 inform retriev process obtain inform larg collect text involv search index rank document base user queri
--------------------------------------------------------------------------------
After Lemmatization:
 information retrieval process obtaining information large collection text involves searching indexing ranking document based user query
--------------------------------------------------------------------------------
Document Representative (Word Frequency):
inform: 2
retriev: 1
process: 1
obtain: 1
larg: 1
collect: 1
text: 1
involv: 1
search: 1
index: 1


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

# Download resources (run once).
# 'punkt_tab' is fetched alongside 'punkt' because newer NLTK releases load the
# tokenizer data for word_tokenize from that package; without it this cell
# only works when the package has already been downloaded elsewhere.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# Load the document in lower case. The with-block closes the file handle as
# soon as the read completes instead of leaving an open handle behind.
with open("Conflation.txt", "r", encoding="utf-8") as source:
    text = source.read().lower()

# Tokenize, then drop stopwords and every token that is not purely alphabetic
# (this removes punctuation and tokens containing digits).
stop = set(stopwords.words("english"))
words = [w for w in word_tokenize(text) if w.isalpha() and w not in stop]

# Stemming: reduce each remaining word to its Porter stem
stem = PorterStemmer()
stemmed = [stem.stem(w) for w in words]

# Lemmatization: only the word is passed, so lemmatize() uses its default
# part of speech
lemma = WordNetLemmatizer()
lemmatized = [lemma.lemmatize(w) for w in words]

# Output: both conflated forms, then the ten most frequent stems as the
# document representative
print("Stemming:\n", " ".join(stemmed))
print("\nLemmatization:\n", " ".join(lemmatized))
print("\nDocument Representative (Top Words):")
for w, c in Counter(stemmed).most_common(10):
    print(w, ":", c)
