In [10]:
# Install the NLTK (Natural Language Toolkit) library
!pip install nltk



In [11]:
# Import the NLTK library
import nltk

# Download the 'punkt' tokenizer models, used for tokenizing text into sentences or words
nltk.download('punkt')

# Download the 'stopwords' corpus, which contains lists of common stopwords for various languages
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords  # Import the stopwords module from NLTK (Natural Language Toolkit)
from nltk.tokenize import word_tokenize  # Import the word_tokenize function from NLTK

# Define a sample text string
sample_text = "This is a sample sentence, showing off the stop words filtration."

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Tokenize the sample text into words
word_tokens = word_tokenize(sample_text)

# Initialize an empty list to store the filtered words
filtered_sentence = []

for w in word_tokens:  # Iterate over each word in the tokenized words
    if w not in stop_words:  # Check if the word is not a stopword
        filtered_sentence.append(w)  # If not a stopword, append the word to the filtered_sentence list

print(word_tokens)  # Print the list of tokenized words
print(filtered_sentence)  # Print the list of words after stopword removal


['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## **Text Stemming**

In [8]:
from nltk.stem import PorterStemmer  # Import the PorterStemmer class from NLTK for stemming words

from nltk.tokenize import word_tokenize  # Import the word_tokenize function from NLT
import time

porter_stemmer = PorterStemmer()  # Create an instance of the PorterStemmer for stemming words

# Sample text for stemming
sample_text = """The children are playing outside. They have been playing for hours,
                running around the park. Their laughter and running echo through the trees.
                As they play, their playful shouts and screams fill the air.
                Even the dogs are running and playing with them.
                Watching them play brings joy to everyone around."""

token = word_tokenize(sample_text)  # Tokenize the sample text into words

# Perform stemming on each word
stemmed_words = [porter_stemmer.stem(word) for word in token]

stemmed_text = " ".join(stemmed_words)  # Join the stemmed words back into a single string

print(f"Original Text: {sample_text}")  # Print the original text
print(f"Stemmed Text: {stemmed_text}")  # Print the stemmed text

Original Text: The children are playing outside. They have been playing for hours, 
                running around the park. Their laughter and running echo through the trees.
                As they play, their playful shouts and screams fill the air.
                Even the dogs are running and playing with them.
                Watching them play brings joy to everyone around.
Stemmed Text: the children are play outsid . they have been play for hour , run around the park . their laughter and run echo through the tree . as they play , their play shout and scream fill the air . even the dog are run and play with them . watch them play bring joy to everyon around .


##  **Lemmatization**

In [16]:
# Import the NLTK (Natural Language Toolkit) library
import nltk

# Download the 'wordnet' dataset, which is a lexical database for the English language
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Import the WordNetLemmatizer class from NLTK for lemmatizing words
from nltk.stem import WordNetLemmatizer

# Import the word_tokenize function from NLTK to tokenize the text
from nltk.tokenize import word_tokenize

# Import the wordnet module from NLTK to provide word sense information for lemmatization
from nltk.corpus import wordnet

# Create an instance of the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define a sample text to be lemmatized
text = "The children are playing outside. They have been playing for hours, running around the park."

# Tokenize the sample text into words
token = word_tokenize(text)

# Apply lemmatization to each word in the tokenized list, using the VERB part-of-speech tag
lemmatized_words = [lemmatizer.lemmatize(word, wordnet.VERB) for word in token]

# Join the lemmatized words back into a single string
lemmatized_text = " ".join(lemmatized_words)

# Print the original text
print(f"Original Text: {text}")

# Print the lemmatized text
print(f"Lemmatized Text: {lemmatized_text}")


Original Text: The children are playing outside. They have been playing for hours, running around the park.
Lemmatized Text: The children be play outside . They have be play for hours , run around the park .


##  **Regex**

In [21]:
# Import the regular expression module
import re

# Define a function to clean the text using regular expressions
def regex_magic(text):
    # Remove all punctuation characters from the text
    text = re.sub(r'[^\w\s]', '', text)
    # Replace all sequences of digits with a space
    text = re.sub(r'\d+', ' ', text)
    return text

# Define a sample text to be processed
text = "Alex, a 25-year-old Sub Engineer (Electrical), graduated in 2024 from the Polytech Institute. His expertise includes AC Generators, Three Phase/Star Delta Connections, and AC/DC Motors."

# Print the original text
print(f"Original Text: {text}")

# Apply the regex_magic function to the text and print the cleaned text
print(f"Regex Magic Text: {regex_magic(text)}")


Original Text: Alex, a 25-year-old Sub Engineer (Electrical), graduated in 2024 from the Polytech Institute. His expertise includes AC Generators, Three Phase/Star Delta Connections, and AC/DC Motors.
Regex Magic Text: Alex a  yearold Sub Engineer Electrical graduated in   from the Polytech Institute His expertise includes AC Generators Three PhaseStar Delta Connections and ACDC Motors
