In [2]:
# Conflation Algorithm: Generate Document Representative of a Text File
# Using Stemming and Lemmatization

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string


# Fetch the NLTK data packages used below, in the same order as before:
# Punkt tokenizer data, its table files, the stopword lists and WordNet.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Step 1: Load the whole input document into memory
filename = "Conflation.txt"   # point this at the text file you want to analyse
with open(filename, mode='r', encoding='utf-8') as file:
    text = file.read()

divider = "-" * 80   # horizontal rule printed between the report sections
print("Original Text:\n", text)
print(divider)

# Step 2: Lowercase the text and tokenize it
tokens = word_tokenize(text.lower())

# Step 3: Keep only purely alphabetic tokens that are not English stopwords
# (isalpha() discards punctuation and tokens containing digits)
stop_words = set(stopwords.words('english'))
tokens = list(filter(lambda token: token.isalpha() and token not in stop_words, tokens))

# Step 4: Reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))

# Step 5: Lemmatize the same tokens with WordNet.
# lemmatize() is called with its default part of speech, so tokens are
# looked up as nouns.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))

# Step 6: Show both conflated versions of the document
stemmed_line = ' '.join(stemmed_words)
lemmatized_line = ' '.join(lemmatized_words)
print("After Stemming:\n", stemmed_line)
print(divider)
print("After Lemmatization:\n", lemmatized_line)
print(divider)

# Step 7: Document representative = the ten most frequent stems
from collections import Counter
freq = Counter(stemmed_words)
top_terms = freq.most_common(10)
print("Document Representative (Word Frequency):")
for word, count in top_terms:
    print(f"{word}: {count}")


Original Text:
 Natural Language Processing is an important field of Computer Science. 
It focuses on enabling computers to understand human language. 
People speak and write in different ways, so words often appear in various forms. 
To analyze text effectively, these word variations need to be reduced to a common base form. 
This process of reducing related words to a single representative form is known as conflation.

--------------------------------------------------------------------------------
After Stemming:
 natur languag process import field comput scienc focus enabl comput understand human languag peopl speak write differ way word often appear variou form analyz text effect word variat need reduc common base form process reduc relat word singl repres form known conflat
--------------------------------------------------------------------------------
After Lemmatization:
 natural language processing important field computer science focus enabling computer understand human lang

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

# Download the NLTK resources this cell needs (a no-op once they are cached).
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' tables; without
# this download the cell only works when another cell has already cached them.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Read the document and lowercase it. The context manager closes the file
# handle as soon as the text is read instead of leaving it open.
with open("Conflation.txt", "r", encoding="utf-8") as source:
    text = source.read().lower()

# Tokenize, then keep only alphabetic tokens that are not English stopwords
# (isalpha() drops punctuation and tokens containing digits).
stop = set(stopwords.words("english"))
words = [w for w in word_tokenize(text) if w.isalpha() and w not in stop]

# Stemming: reduce each word to its Porter stem
stem = PorterStemmer()
stemmed = [stem.stem(w) for w in words]

# Lemmatization: WordNet lookup with the default part of speech (noun), so
# verb forms such as "reducing" are left unchanged.
lemma = WordNetLemmatizer()
lemmatized = [lemma.lemmatize(w) for w in words]

# Output: both conflated forms, then the ten most frequent stems, which serve
# as the document representative.
print("Stemming:\n", " ".join(stemmed))
print("\nLemmatization:\n", " ".join(lemmatized))
print("\nDocument Representative (Top Words):")
for w, c in Counter(stemmed).most_common(10):
    print(w, ":", c)


Stemming:
 natur languag process import field comput scienc focus enabl comput understand human languag peopl speak write differ way word often appear variou form analyz text effect word variat need reduc common base form process reduc relat word singl repres form known conflat

Lemmatization:
 natural language processing important field computer science focus enabling computer understand human language people speak write different way word often appear various form analyze text effectively word variation need reduced common base form process reducing related word single representative form known conflation

Document Representative (Top Words):
word : 3
form : 3
languag : 2
process : 2
comput : 2
reduc : 2
natur : 1
import : 1
field : 1
scienc : 1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# ==============================================
# Conflation Algorithm: Generate Document Representative of a Text File
# Using Stemming and Lemmatization
# ==============================================

# ====== Install Required Libraries ======
!pip install nltk 

# ====== Import Libraries ======
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from collections import Counter

# ====== Download NLTK Resources ======
# Tokenizer data, tokenization tables, stopword lists and the WordNet corpus,
# downloaded in this order.
required_packages = ['punkt', 'punkt_tab', 'stopwords', 'wordnet']
for package in required_packages:
    nltk.download(package)

# ====== Step 1: Read Input Text File ======
filename = "Conflation.txt"   # replace with the path of your own text file
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

rule = "-" * 80   # separator line reused between the printed sections
print("Original Text:\n", text)
print(rule)

# ====== Step 2: Tokenization ======
# Case-fold first so that capitalised and lowercase spellings are conflated.
lowered = text.lower()
tokens = word_tokenize(lowered)

# ====== Step 3: Remove Punctuation and Stopwords ======
stop_words = set(stopwords.words('english'))
kept = []
for candidate in tokens:
    # Skip anything that is not purely alphabetic (punctuation, numbers)
    # and anything on the English stopword list.
    if not candidate.isalpha() or candidate in stop_words:
        continue
    kept.append(candidate)
tokens = kept

# ====== Step 4: Apply Stemming ======
stemmer = PorterStemmer()
stemmed_words = []
for token in tokens:
    stemmed_words.append(stemmer.stem(token))

# ====== Step 5: Apply Lemmatization ======
# lemmatize() is used with its default part of speech (noun).
lemmatizer = WordNetLemmatizer()
lemmatized_words = []
for token in tokens:
    lemmatized_words.append(lemmatizer.lemmatize(token))

# ====== Step 6: Display Results ======
print("After Stemming:\n", ' '.join(stemmed_words))
print(rule)
print("After Lemmatization:\n", ' '.join(lemmatized_words))
print(rule)

# ====== Step 7: Create Document Representative (Word Frequency) ======
# The ten most frequent stems stand in for the document.
freq = Counter(stemmed_words)
print("Document Representative (Word Frequency):")
ranked = freq.most_common(10)
for word, count in ranked:
    print(f"{word}: {count}")


Original Text:
 Natural Language Processing is an important field of Computer Science. 
It focuses on enabling computers to understand human language. 
People speak and write in different ways, so words often appear in various forms. 
To analyze text effectively, these word variations need to be reduced to a common base form. 
This process of reducing related words to a single representative form is known as conflation.

--------------------------------------------------------------------------------
After Stemming:
 natur languag process import field comput scienc focus enabl comput understand human languag peopl speak write differ way word often appear variou form analyz text effect word variat need reduc common base form process reduc relat word singl repres form known conflat
--------------------------------------------------------------------------------
After Lemmatization:
 natural language processing important field computer science focus enabling computer understand human lang


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
