In [4]:
# Perform the preprocessing tasks listed below, in the proper order for your dataset, using torch or tensorflow.
# Emoji Removal
# Lemmatization
# Stemming
# Word Tokenization
# Grammar Correction
# HTTP Links Removal
# Stop Words Removal
# Sentence Tokenization
# Lower casing
# Remove white spaces
# Text Normalization
# Part of speech tagging

In [5]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob

In [8]:
# Fetch every NLTK resource the preprocessing cell relies on, so the notebook
# also runs on a machine whose nltk_data directory is still empty.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the first two ids below -
# confirm against the installed NLTK version.
nltk.download('punkt')                       # models used by word_tokenize / sent_tokenize
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag
nltk.download('stopwords')                   # word lists read by stopwords.words('english')
# WordNetLemmatizer reads the WordNet corpus; without this download the
# lemmatization step raises LookupError on a fresh environment.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
text_data = "Your text data goes here."

text_data = text_data.encode('ascii', 'ignore').decode('ascii')

text_data = re.sub(r'http\S+', '', text_data)

#Lower casing
text_data = text_data.lower()

#Remove white spaces
text_data = ' '.join(text_data.split())

# Word Tokenization
words = word_tokenize(text_data)

#Sentence Tokenization
sentences = sent_tokenize(text_data)

#Part of speech tagging
pos_tags = nltk.pos_tag(words)

stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_words]

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

corrected_text = str(TextBlob(text_data).correct())

In [10]:

print("Original Text:", text_data)
print("Emoji Removed Text:", text_data)
print("Http Links Removed Text:", text_data)
print("Lowercased Text:", text_data)
print("Text with White Spaces Removed:", text_data)
print("Word Tokenization:", words)
print("Sentence Tokenization:", sentences)
print("Part of Speech Tagging:", pos_tags)
print("Stop Words Removed:", filtered_words)
print("Stemming:", stemmed_words)
print("Lemmatization:", lemmatized_words)
print("Grammr Corrected Text:", corrected_text)

Original Text: your text data goes here.
Emoji Removed Text: your text data goes here.
Http Links Removed Text: your text data goes here.
Lowercased Text: your text data goes here.
Text with White Spaces Removed: your text data goes here.
Word Tokenization: ['your', 'text', 'data', 'goes', 'here', '.']
Sentence Tokenization: ['your text data goes here.']
Part of Speech Tagging: [('your', 'PRP$'), ('text', 'NN'), ('data', 'NN'), ('goes', 'VBZ'), ('here', 'RB'), ('.', '.')]
Stop Words Removed: ['text', 'data', 'goes', '.']
Stemming: ['text', 'data', 'goe', '.']
Lemmatization: ['text', 'data', 'go', '.']
Grammr Corrected Text: your text data goes here.
