## Import NLTK & tokenizers

In [1]:
import nltk
from nltk.tokenize import word_tokenize , sent_tokenize

## To Lowercase

In [2]:
# Sample sentence containing mixed-case words.
text = "Hello World! This is some Sample Text."

# str.lower() returns a lowercased copy; `text` itself is left unchanged.
text.lower()

'hello world! this is some sample text.'

## Stemming

Stemming is the process of reducing a word to its base or root form, such as converting "running" to "run". NLTK provides several stemmers, including the Porter stemmer, which is based on the Porter stemming algorithm.

In [2]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

text = "The boys are running and the leaves are falling."

# Split the sentence into word and punctuation tokens.
words = word_tokenize(text)

# Run every token through the Porter stemmer.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))

print(stemmed_words)

['the', 'boy', 'are', 'run', 'and', 'the', 'leav', 'are', 'fall', '.']


For stemming Arabic text, it’s recommended to use stemmers specifically designed for Arabic, such as the ISRI Stemmer or the Snowball Stemmer with Arabic support.

In [4]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for Arabic.
# Note: this rebinds `stemmer`, which held the PorterStemmer in the previous cell.
stemmer = SnowballStemmer("arabic")

# Stem a word
print(stemmer.stem("العاملون"))

عامل


## Part-of-speech Tagging

Part-of-speech (POS) tagging is the process of labeling the words in a sentence with their corresponding part of speech, such as noun, verb, adjective, etc.

In [5]:
from nltk import pos_tag

text = "This is an example sentence, showing off the part-of-speech tagging process."

# Split the sentence into tokens, then label each token with its part of speech.
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)

# Display the resulting (token, tag) pairs as the cell output.
tagged_tokens

[('This', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('example', 'NN'),
 ('sentence', 'NN'),
 (',', ','),
 ('showing', 'VBG'),
 ('off', 'RP'),
 ('the', 'DT'),
 ('part-of-speech', 'JJ'),
 ('tagging', 'NN'),
 ('process', 'NN'),
 ('.', '.')]

## Lemmatization

Lemmatization is similar to stemming, but it produces a valid dictionary word, known as the lemma, rather than a possibly truncated root form. For example, "running" is lemmatized to "run" when it is tagged as a verb, and "better" is lemmatized to "good" when it is tagged as an adjective (without a part-of-speech tag, "better" is left unchanged). NLTK provides the WordNetLemmatizer class, which uses the part of speech of each word to pick the correct lemma.

In [6]:
def get_wordnet_pos(tag_parameter):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of the tag is inspected, case-insensitively:
    "J" -> adjective, "N" -> noun, "V" -> verb, "R" -> adverb.

    Args:
        tag_parameter: POS tag string such as "NN", "VBG" or "JJ".

    Returns:
        wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV. Falls back to
        wordnet.NOUN when the tag is empty, None, or starts with any other
        character.

    Note:
        `wordnet` is looked up when the function is called, so
        `from nltk.corpus import wordnet` must have been executed by then.
    """
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Slice instead of indexing so an empty (or missing) tag falls back to
    # NOUN rather than raising IndexError/TypeError.
    tag = (tag_parameter or "")[:1].upper()

    return tag_dict.get(tag, wordnet.NOUN)

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

text = "The boys are running and the leaves are falling."

# Tokenize the sentence, then attach a part-of-speech tag to every token.
words = word_tokenize(text)
pos_tags = pos_tag(words)

# Lemmatize each token, translating its tag into a WordNet POS constant first.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_label))
    for token, pos_label in pos_tags
]

lemmatized_words

['The', 'boy', 'be', 'run', 'and', 'the', 'leaf', 'be', 'fall', '.']

## Spell checking

In [8]:
%pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 3.2 MB/s eta 0:00:00
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
from spellchecker import SpellChecker

In [10]:
from nltk.tokenize import word_tokenize
from typing import List  # Import the List type from the typing module

def _restore_case(original: str, corrected: str) -> str:
    """Re-apply the capitalisation pattern of `original` to `corrected`."""
    if len(original) > 1 and original.isupper():
        return corrected.upper()
    if original[:1].isupper():
        return corrected[:1].upper() + corrected[1:]
    return corrected


def correct_sentence_spelling(tokens: List[str], spell: "SpellChecker | None" = None) -> List[str]:
    """Return a copy of `tokens` with misspelled words replaced by their best correction.

    Args:
        tokens: Word tokens of one sentence. The list is not modified.
        spell: Spell checker to use; any object offering `unknown(words)` and
            `correction(word)` works. When omitted, a default `SpellChecker()`
            is created. Pass a shared instance to avoid constructing a new
            checker on every call.

    Returns:
        A new list with the same length as `tokens`. Tokens the checker knows,
        and tokens for which it has no suggestion (`correction()` returns
        None), are kept as they are. Corrected tokens keep the capitalisation
        (Title-case or ALL-CAPS) of the token they replace.
    """
    corrected_tokens = list(tokens)
    if not corrected_tokens:
        # Nothing to check; skip building a checker.
        return corrected_tokens

    if spell is None:
        spell = SpellChecker()

    misspelled = spell.unknown(corrected_tokens)
    for i, token in enumerate(corrected_tokens):
        # A case-insensitive checker reports unknown words in lowercase, so a
        # capitalised token (e.g. at the start of a sentence) must also be
        # looked up in its lowercased form.
        if token not in misspelled and token.lower() not in misspelled:
            continue
        corrected = spell.correction(token)
        if corrected is None:
            continue
        corrected_tokens[i] = _restore_case(token, corrected)
    return corrected_tokens

# Sentence with deliberate misspellings to exercise the corrector.
text = "This is a sampli sentinse withh speling erors."
words = word_tokenize(text)
# The returned token list is displayed as the cell output.
correct_sentence_spelling(words)

['This', 'is', 'a', 'sample', 'sentence', 'with', 'spelling', 'errors', '.']

## Stopwords

In [11]:
from nltk.corpus import stopwords

In [12]:
# NLTK stopword file ids are lowercase; 'English' only resolves on
# case-insensitive filesystems, so use the canonical 'english'.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

## Remove stopwords from the text

In [13]:
text = "We are going to the ancient city of Damascus."

# Load the stopword list once, as a set, instead of re-reading it for every
# token. The file id is lowercase 'english' so it also resolves on
# case-sensitive filesystems.
english_stopwords = set(stopwords.words('english'))

# The comparison is case-sensitive and the NLTK list is all lowercase, so a
# capitalised stopword such as 'We' is kept; compare `word.lower()` instead
# if those should be removed as well.
filtered_text = [word for word in word_tokenize(text) if word not in english_stopwords]

filtered_text

['We', 'going', 'ancient', 'city', 'Damascus', '.']

## Remove Punctuation

In [14]:
import string

text = "Hello, world! This is some sample' text."

# Build the deletion table once; it is identical for every token.
punctuation_table = str.maketrans('', '', string.punctuation)

# Strip punctuation from each whitespace-separated token and drop tokens that
# consisted only of punctuation (they would otherwise be kept as '').
new_tokens = [
    stripped
    for stripped in (token.translate(punctuation_table) for token in text.split())
    if stripped
]

print(new_tokens)

['Hello', 'world', 'This', 'is', 'some', 'sample', 'text']
