# Data Cleaning in NLP

In [66]:
import re
import numpy as np
import pandas as pd

In [67]:
# lOWERCASING -- lOWER CASE ALL THE LETTERS IN ANY DATASET SO THAT EVERY SAME WORD IS NOT COUNTED TWICE 

In [68]:
import pandas as pd

# Sample dataframe with a column containing text data
data = {'Text': ["The Quick Brown Fox", "Jumps Over The Lazy Dog", "Hello World"]}
df = pd.DataFrame(data)

# Lowercase the entire 'Text' column
df['Text'] = df['Text'].str.lower()

# Print the dataframe to see the result
print(df)

                      Text
0      the quick brown fox
1  jumps over the lazy dog
2              hello world


In [69]:
# Remove html tags -- they do not provide any meaning to a sentence

In [70]:
import re

# Sample HTML text
html_text = "<p>This is <strong>HTML</strong> text with <a href='#'>tags</a>.</p>"

# Remove HTML tags using regular expressions
plain_text = re.sub(r'<[^>]+>', '', html_text)

# Print the plain text without HTML tags
print(plain_text)

This is HTML text with tags.


In [71]:
# Can try the below function as well

In [72]:
# Sample dataframe with a column containing HTML text
data = {'HTML_Text': ["<p>This is <strong>HTML</strong> text with <a href='#'>tags</a>.</p>",
                      "<p>Another <em>example</em> with <a href='#'>links</a>.</p>",
                      "<p>Yet <u>another</u> <strong>example</strong>.</p>"]}
df = pd.DataFrame(data)

# Function to remove HTML tags from a string
def remove_html_tags(text):
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text

# Apply remove_html_tags function to the entire 'HTML_Text' column
df['Plain_Text'] = df['HTML_Text'].apply(remove_html_tags)

# Print the dataframe with plain text
print(df)

                                           HTML_Text  \
0  <p>This is <strong>HTML</strong> text with <a ...   
1  <p>Another <em>example</em> with <a href='#'>l...   
2  <p>Yet <u>another</u> <strong>example</strong>...   

                     Plain_Text  
0  This is HTML text with tags.  
1   Another example with links.  
2          Yet another example.  


In [73]:
# Remove urls 

In [74]:
# Sample dataframe with a column containing text with URLs
data = {'Text_with_URLs': ["Check out this website: https://www.example.com",
                           "Visit our site at www.another-example.com",
                           "More info available at http://example.org"]}

df = pd.DataFrame(data)

# Function to remove URLs from a string
def remove_urls(text):
    url_pattern = r'https?://\S+|www\.\S+'
    clean_text = re.sub(url_pattern, '', text)
    return clean_text

# Apply remove_urls function to the entire column
df['Text_without_URLs'] = df['Text_with_URLs'].apply(remove_urls)

# Print the dataframe with text without URLs
print(df)

                                    Text_with_URLs         Text_without_URLs
0  Check out this website: https://www.example.com  Check out this website: 
1        Visit our site at www.another-example.com        Visit our site at 
2        More info available at http://example.org   More info available at 


In [75]:
# Remove Punctuations 

In [76]:
import string

# Sample text containing punctuation
text_with_punctuation = "Hello, world! This is a sample text."

# Remove punctuation using string.punctuation
text_without_punctuation = ''.join(char for char in text_with_punctuation if char not in string.punctuation)

# Print text without punctuation
print(text_without_punctuation)

Hello world This is a sample text


In [77]:
import pandas as pd
import string

# Sample DataFrame with a column containing text with punctuation
data = {'Text_with_Punctuation': ["Hello, world!", "This is a sample text with punctuation!", "How are you?"]}
df = pd.DataFrame(data)

# Define a function to remove punctuation from a string
def remove_punctuation(text):
    return ''.join(char for char in text if char not in string.punctuation)

# Apply the remove_punctuation function to the entire column
df['Text_without_Punctuation'] = df['Text_with_Punctuation'].apply(remove_punctuation)

# Print the DataFrame with the updated column
print(df)

                     Text_with_Punctuation  \
0                            Hello, world!   
1  This is a sample text with punctuation!   
2                             How are you?   

                 Text_without_Punctuation  
0                             Hello world  
1  This is a sample text with punctuation  
2                             How are you  


In [78]:
# Chat Word Treatment

In [79]:
chat_words = {'AFK': 'Away from Keyboard', 'FYI': 'For your Information','u2' : 'you too'}

In [80]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [81]:
chat_conversion('FYI')

'For your Information'

In [87]:
# Unicode Normalisation

In [82]:
emogi_text = "What does the 🍆 💦 emoji mean?"


In [83]:
emogi_text

'What does the 🍆 💦 emoji mean?'

In [84]:
emogi_text.encode('utf-8')

b'What does the \xf0\x9f\x8d\x86 \xf0\x9f\x92\xa6 emoji mean?'

In [85]:
# Spellings_Check

In [86]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.




In [36]:
from spellchecker import SpellChecker

# Sample DataFrame with a column containing text with spelling errors
data = {'Text_with_Errors': ["Thiss is a samplee sentence withh speling errrors.",
                              "I havvee somme speleng mistakess in this sentencc."]}
df = pd.DataFrame(data)

# Initialize spell checker
spell = SpellChecker()

# Define a function to correct spelling errors in a string
def correct_spelling(text):
    corrected_text = []
    for word in text.split():
        corrected_text.append(spell.correction(word))
    return ' '.join(corrected_text)

# Apply the correct_spelling function to the entire column
df['Corrected_Text'] = df['Text_with_Errors'].apply(correct_spelling)

# Print the DataFrame with the corrected column
print(df)

                                    Text_with_Errors  \
0  Thiss is a samplee sentence withh speling errr...   
1  I havvee somme speleng mistakess in this sente...   

                                   Corrected_Text  
0  this is a sample sentence with spelling errors  
1  I have some spelling mistakes in this sentence  


In [37]:
# Another library for spelling checks -- text blob

In [42]:
from textblob import TextBlob
import pandas as pd

# Sample DataFrame with a column containing text with spelling errors
data = {'Text_with_Errors': ["Thiss is a samplee sentence withh speling errrors.",
                              "I havvee somme speleng mistakess in this sentencc."]}

df = pd.DataFrame(data)

# Define a function to correct spelling errors in a string
def correct_spelling(text):
    corrected_text = []
    for word in text.split():
        corrected_text.append(str(TextBlob(word).correct()))
    return ' '.join(corrected_text)

# Apply the correct_spelling function to the entire column
df['Corrected_Text'] = df['Text_with_Errors'].apply(correct_spelling)

# Print the DataFrame with the corrected column
print(df)

                                    Text_with_Errors  \
0  Thiss is a samplee sentence withh speling errr...   
1  I havvee somme speleng mistakess in this sente...   

                                    Corrected_Text  
0  Hiss is a sample sentence with spelling errors.  
1    I have some spleen mistakes in this sentence.  


In [43]:
# Remove Stopwords : Stop words are common words that are often filtered out during text preprocessing in natural language processing (NLP) tasks. These words are considered to be non-informative or redundant in the context of analyzing text data because they occur frequently across documents and do not carry significant meaning. Examples of stop words include articles (e.g., "a", "an", "the"), prepositions (e.g., "in", "on", "at"), conjunctions (e.g., "and", "or", "but"), and common verbs (e.g., "is", "are", "have").

In [48]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample DataFrame with a column containing text data
data = {'Text': ["This is a sample sentence demonstrating the removal of stop words.",
                 "Stop words are common words that are filtered out during text preprocessing.",
                 "Removing stop words helps reduce dimensionality and improve efficiency in NLP tasks."]}
df = pd.DataFrame(data)

# Download NLTK stopwords (you need to do this only once)
nltk.download('stopwords')
nltk.download('punkt')

# Tokenize function with stop word removal
stop_words = set(stopwords.words('english'))
tokenize_and_remove_stopwords = lambda text: ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words])

# Apply tokenize_and_remove_stopwords function to the entire column
df['Text_without_stopwords'] = df['Text'].apply(tokenize_and_remove_stopwords)

# Print the DataFrame with the column containing text without stop words
print(df)

                                                Text  \
0  This is a sample sentence demonstrating the re...   
1  Stop words are common words that are filtered ...   
2  Removing stop words helps reduce dimensionalit...   

                              Text_without_stopwords  
0  sample sentence demonstrating removal stop wor...  
1  Stop words common words filtered text preproce...  
2  Removing stop words helps reduce dimensionalit...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Emogi --- To remove

In [50]:
import pandas as pd
import re

# Sample DataFrame with a column containing text data with emojis
data = {'Text_with_Emojis': ["I love Python! 😍🐍", "Just received a gift 🎁", "Feeling happy today 😀"]}
df = pd.DataFrame(data)

# Define a function to remove emojis from a string
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Apply the remove_emojis function to the entire column
df['Text_without_Emojis'] = df['Text_with_Emojis'].apply(remove_emojis)

# Print the DataFrame with the new column containing text without emojis
print(df)

         Text_with_Emojis    Text_without_Emojis
0       I love Python! 😍🐍        I love Python! 
1  Just received a gift 🎁  Just received a gift 
2   Feeling happy today 😀   Feeling happy today 


In [51]:
# Or can keep emoji in terms of text
import emoji
print(emoji.demojize("What does the 🍆 💦 emoji mean? 🔥 Fire Emoji"))

What does the :eggplant: :sweat_droplets: emoji mean? :fire: Fire Emoji


In [52]:
# Tokenisation : Tokenization is the process of breaking down text into smaller units, typically words or subwords, which are called tokens. These tokens serve as the basic units of analysis in natural language processing (NLP) tasks. Tokenization is a fundamental step in NLP preprocessing pipelines, as it allows computers to understand and process human language.

In [35]:
def tokenize(text):
    split=re.split("\W+",text) 
    return split
tokenize("What does the emoji mean?")

['What', 'does', 'the', 'emoji', 'mean', '']

# Tokenisation

In [40]:
#1. Use split function directly

In [41]:
sent = "I am goint to delhi"
sent.split()

['I', 'am', 'goint', 'to', 'delhi']

In [45]:
#2. Sentence Split
sent2 = "I am going to delhi. I am work at an office. I am yash"
sent2.split('.')

['I am going to delhi', ' I am work at an office', ' I am yash']

In [46]:
# Problems with Split Function

In [47]:
sent3 = "I am going to New Delhi!!!"
sent3.split()

['I', 'am', 'going', 'to', 'New', 'Delhi!!!']

In [48]:
sent4 = 'Where do you go ? How r u'
sent4.split(".")

['Where do you go ? How r u']

In [49]:
#2. Regular Expression

In [51]:
import re
sent = "I am going to# delhi!"
tokens = re.findall("[\w']+", sent)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [52]:
#3. Using ibrary

In [53]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [55]:
sent = "I am going to delhi !!!"
word_tokenize(sent)

['I', 'am', 'going', 'to', 'delhi', '!', '!', '!']

In [56]:
sent = "Hi Mr. I am going to do P.Hd in A.I"
word_tokenize(sent)

['Hi', 'Mr', '.', 'I', 'am', 'going', 'to', 'do', 'P.Hd', 'in', 'A.I']

In [57]:
sent = "Give me 100$"
word_tokenize(sent)

['Give', 'me', '100', '$']

In [58]:
sent2 = "I am here to help, mail me at xyz@gmail.com"
word_tokenize(sent2)

['I',
 'am',
 'here',
 'to',
 'help',
 ',',
 'mail',
 'me',
 'at',
 'xyz',
 '@',
 'gmail.com']

In [62]:
sent3 = "A 50km ride costs something around $50"
word_tokenize(sent3)

['A', '50km', 'ride', 'costs', 'something', 'around', '$', '50']

In [None]:
# 4. Best library name Spacy -- recommended

In [59]:
import pandas as pd
import spacy

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Sample DataFrame with a column containing text data
data = {'Text': ["Tokenization is the process of breaking down text into smaller units.",
                 "Each unit, typically words or subwords, is called a token."]}
df = pd.DataFrame(data)

# Function for tokenization
def tokenize_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Extract tokens from the processed document
    tokens = [token.text for token in doc]
    return tokens

# Apply tokenization function to the entire column
df['Tokens'] = df['Text'].apply(tokenize_text)

# Print the DataFrame with the new column containing tokens
print(df)

                                                Text  \
0  Tokenization is the process of breaking down t...   
1  Each unit, typically words or subwords, is cal...   

                                              Tokens  
0  [Tokenization, is, the, process, of, breaking,...  
1  [Each, unit, ,, typically, words, or, subwords...  


Stemming : Stemming is the process of reducing a word to its base or root form, known as the stem. It involves removing suffixes or prefixes from words to extract the core meaning. The goal of stemming is to normalize words so that variations of the same word are treated as the same word, regardless of their grammatical forms.

Inflection : In very simple terms, inflection is when we change a word to show different meanings or uses. For example, when we add "ed" to "walk" to make "walked" to show that it happened in the past. Or when we add "s" to "cat" to make "cats" to show there's more than one. It's like changing the shape or form of a word to fit different situations or roles in a sentence.

Used mostly in information retrival

It is not 100 % correct so if we want more accuracy we can use lemmatisation

In [88]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer

# Sample DataFrame with a column containing text data
data = {'Text': ["running fishes", "cats playing","I will start walking a bit fast tomorrow because walking is good for health."]}
df = pd.DataFrame(data)

# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Function for stemming
def stem_text(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Stem each word in the text
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the stemmed words into a single string
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

# Apply stemming function to the entire column
df['Stemmed_Text'] = df['Text'].apply(stem_text)

# Print the DataFrame with the new column containing stemmed text
print(df)

                                                Text  \
0                                     running fishes   
1                                       cats playing   
2  I will start walking a bit fast tomorrow becau...   

                                        Stemmed_Text  
0                                           run fish  
1                                           cat play  
2  i will start walk a bit fast tomorrow becaus w...  


 Lammetiser -- Slower than stemming but more accurate

 Lemmatisation is the process of reducing words to their base or dictionary form, known as the lemma. It's like finding the root word for variations of the same word.

Whether lemmatization or stemming is better depends on the specific task and the requirements of your NLP application. Here's a comparison of both:

Lemmatization:

Lemmatization generally produces more accurate results compared to stemming because it maps words to their dictionary forms (lemmas), which are valid words.
Lemmatization takes into account the context of the word and its part of speech, resulting in more meaningful lemmas.
However, lemmatization is computationally more expensive and slower compared to stemming.
Stemming:

Stemming is faster and computationally less expensive compared to lemmatization because it applies simple rules to chop off prefixes or suffixes.
Stemming may produce stems that are not actual words, leading to ambiguity or loss of meaning.
Stemming is less accurate compared to lemmatization, especially in languages with complex morphology.

In general, if you need higher precision and accuracy in your NLP task, and you can afford the computational cost, lemmatization is a better choice. However, if speed and efficiency are more critical, and you can tolerate some loss of precision, stemming may be sufficient.

Ultimately, it's essential to experiment with both techniques and evaluate their performance in the context of your specific NLP task to determine which one works better for your application. Additionally, you may also consider hybrid approaches that combine both stemming and lemmatization to achieve a balance between accuracy and efficiency.







In [89]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer

# Sample DataFrame with a column containing text data
data = {'Text': ["running fishes", "cats playing"]}
df = pd.DataFrame(data)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function for lemmatization
def lemmatize_text(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Lemmatize each word in the text
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Join the lemmatized words into a single string
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

# Apply lemmatization function to the entire column
df['Lemmatized_Text'] = df['Text'].apply(lemmatize_text)

# Print the DataFrame with the new column containing lemmatized text
print(df)

             Text Lemmatized_Text
0  running fishes    running fish
1    cats playing     cat playing
