<a href="https://colab.research.google.com/github/uumair327/natural_language_processing/blob/main/NLP03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



*   PorterStemmer
*   Lancaster
*   WordNetLemmatizer
*   Spacy




**Stop Word Removal**

In [8]:
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

In [9]:
example_sent = """This is a sample sentence, showing off the stop words filtration."""
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sentence = []

In [10]:
for w in word_tokens:
  if w not in stop_words:
    filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [13]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()
stemmed_sentence = [porter.stem(word) for word in filtered_sentence]
print(stemmed_sentence)

['thi', 'sampl', 'sentenc', ',', 'show', 'stop', 'word', 'filtrat', '.']


**Lemma**

In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_sentence = [lemmatizer.lemmatize(word) for word in stemmed_sentence]
print(lemmatized_sentence)

['thi', 'sampl', 'sentenc', ',', 'show', 'stop', 'word', 'filtrat', '.']


**Lab Manual Example**

In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Example sentence
example_sent = "This is a sample sentence, showing off the stop words filtration."

# Get English stop words
stop_words = set(stopwords.words('english'))

# Tokenize the sentence
word_tokens = word_tokenize(example_sent)

# Remove stop words
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

print("Original Sentence:")
print(word_tokens)
print("\nFiltered Sentence:")
print(filtered_sentence)


Original Sentence:
['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']

Filtered Sentence:
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [17]:
from nltk.stem import WordNetLemmatizer

# Create a lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize words
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
print("better :", lemmatizer.lemmatize("better", pos='a'))  # 'pos' denotes part of speech


rocks : rock
corpora : corpus
better : good


In [18]:
from nltk.stem import PorterStemmer

# Create a stemmer
stemmer = PorterStemmer()

# Stem words
print("running :", stemmer.stem("running"))
print("flies :", stemmer.stem("flies"))
print("fishing :", stemmer.stem("fishing"))


running : run
flies : fli
fishing : fish


**Lancaster**

In [19]:
import nltk
from nltk.stem import LancasterStemmer

# Initialize the Lancaster Stemmer
lancaster_stemmer = LancasterStemmer()

# List of words to stem
words = ["running", "flies", "fishing", "easily", "happily"]

# Apply stemming to each word
stemmed_words = [lancaster_stemmer.stem(word) for word in words]

print("Original Words:", words)
print("Stemmed Words:", stemmed_words)


Original Words: ['running', 'flies', 'fishing', 'easily', 'happily']
Stemmed Words: ['run', 'fli', 'fish', 'easy', 'happy']


**Example using Spacy**

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
import spacy

nltk.download('punkt')
nltk.download('stopwords')

# Initialize Spacy model
nlp = spacy.load("en_core_web_sm")

# Example sentence
example_sent = "This is a sample sentence, showing off the stop words filtration."

# Get English stop words
stop_words = set(stopwords.words('english'))

# Tokenize the sentence
word_tokens = word_tokenize(example_sent)

# Remove stop words
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
print("Original Sentence:")
print(word_tokens)
print("\nFiltered Sentence:")
print(filtered_sentence)

Original Sentence:
['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']

Filtered Sentence:
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


**Stemming with Porter Stemmer**

In [25]:
porter = PorterStemmer()
stemmed_sentence = [porter.stem(word) for word in filtered_sentence]

print("\nStemmed Sentence (Porter):")
print(stemmed_sentence)


Stemmed Sentence (Porter):
['sampl', 'sentenc', ',', 'show', 'stop', 'word', 'filtrat', '.']


**Lemmatization with Spacy**

In [26]:
doc = nlp(" ".join(filtered_sentence))
lemmatized_sentence_spacy = [token.lemma_ for token in doc]

print("\nLemmatized Sentence (Spacy):")
print(lemmatized_sentence_spacy)




Lemmatized Sentence (Spacy):
['sample', 'sentence', ',', 'show', 'stop', 'word', 'filtration', '.']


**Stemming with Lancaster Stemmer**

In [27]:
lancaster_stemmer = LancasterStemmer()
words_to_stem = ["running", "flies", "fishing", "easily", "happily"]
stemmed_words_lancaster = [lancaster_stemmer.stem(word) for word in words_to_stem]

print("\nLancaster Stemmer Examples:")
print("Original Words:", words_to_stem)
print("Stemmed Words:", stemmed_words_lancaster)


Lancaster Stemmer Examples:
Original Words: ['running', 'flies', 'fishing', 'easily', 'happily']
Stemmed Words: ['run', 'fli', 'fish', 'easy', 'happy']


**Spacy for lemmatization**

In [28]:
additional_words = ["rocks", "corpora", "better"]
doc = nlp(" ".join(additional_words))
lemmatized_additional_words = [token.lemma_ for token in doc]

print("\nAdditional Lemmatization Examples (Spacy):")
print("Original Words:", additional_words)
print("Lemmatized Words:", lemmatized_additional_words)


Additional Lemmatization Examples (Spacy):
Original Words: ['rocks', 'corpora', 'better']
Lemmatized Words: ['rocks', 'corpora', 'well']
