Read Source Data

In [1]:
# Load dataset
import pandas as pd
file_path = "Review.csv"
df = pd.read_csv(file_path)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df)

                                                                           Review
0   The product arrived on time. Packaging was great, and the quality is amazing!
1                                        THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2     I bought this phone for $799, and it has a 120Hz display. Totally worth it!
3                         Wow!!! This product is awesome... but a bit expensive??
4                                             The laptop works perfectly fine.   
5    Check out the full product details here: https://example.com/product-details
6         <div><h2>Great Purchase!</h2><p>I am happy with this product.</p></div>
7                The battry life is excelent, but the chargin cable is too short.
8                       I can't believe it's so good! Didn't expect such quality.
9                   Love this product! ???? Fast delivery ??, amazing quality! ??
10                       TBH, I wasn't expecting much, but OMG, this is awesome!!
11              

Perform Text Pre-Processing Steps

In [2]:
# Lowercase conversion
def convert_to_lowercase(text):
    return text.lower()

df["lowercased"] = df["Review"].apply(convert_to_lowercase)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["lowercased"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5      check out the full product details here: https://example.com/product-details
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [3]:
# Removal of URLs
import re

# remove any URLs that start with "http" or "www" from the text
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text) 

df["urls_removed"] = df["lowercased"].apply(remove_urls)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["urls_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [4]:
# Removal of HTML tags
from bs4 import BeautifulSoup

# extracts only the text, removing all HTML tags
def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["html_removed"] = df["urls_removed"].apply(remove_html_tags)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["html_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [5]:
# Removal of emojis (if any)
import emoji

# replace emoji with ''
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df["emojis_removed"] = df["html_removed"].apply(remove_emojis)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["emojis_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [6]:
# Replace internet slang/chat words
# Dictionary of slang words and their replacements
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

# Function to replace slang words
def replace_slang(text):
    # Create a list of escaped slang words
    escaped_slang_words = []  # Empty list to store escaped slang words

    for word in slang_dict.keys():
        escaped_word = re.escape(word)  # Ensure special characters are escaped
        escaped_slang_words.append(escaped_word)  # Add to list

    # Join the words using '|'
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'

    # Define a replacement function
    def replace_match(match):
        slang_word = match.group(0)  # Extract matched slang word
        return slang_dict[slang_word.lower()]  # Replace with full form

    # Use regex to replace slang words with full forms
    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)

    return replaced_text


# Apply the function to the column
df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["slangs_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10          to be honest, i wasn't expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [7]:
# Replace Contractions
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

# Build the regex pattern for contractions
escaped_contractions = []  # List to store escaped contractions

for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)  # Escape special characters (e.g., apostrophes)
    escaped_contractions.append(escaped_contraction)  # Add to list

# Join the escaped contractions with '|'
joined_contractions = "|".join(escaped_contractions)

# Create a regex pattern with word boundaries (\b)
contractions_pattern = r'\b(' + joined_contractions + r')\b'

# Compile the regex
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

# Define a function to replace contractions
def replace_contractions(text):
    # Function to handle each match found
    def replace_match(match):
        matched_word = match.group(0)  # Extract matched contraction
        lower_matched_word = matched_word.lower()  # Convert to lowercase
        expanded_form = contractions_dict[lower_matched_word]  # Get full form from dictionary
        return expanded_form  # Return the expanded form

    # Apply regex substitution
    expanded_text = compiled_pattern.sub(replace_match, text)

    return expanded_text  # Return modified text

# Apply the function to a DataFrame column
df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["contractions_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                      i cannot believe it is so good! did not expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10         to be honest, i was not expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [8]:
# Remove punctuations and special characters
import string

# Function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the column
df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["punctuations_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2        i bought this phone for 799 and it has a 120hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [9]:
# Remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)  # Removes all numeric characters

# Apply the function to the column
df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["numbers_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2              i bought this phone for  and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [10]:
pip install autocorrect

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Correct spelling mistakes
from autocorrect import Speller

# Initialize spell checker
spell = Speller(lang='en')

# Function to correct spelling
def correct_spelling(text):
    return spell(text)  # Apply correction

# Apply the function to the column
df["spelling_corrected"] = df["numbers_removed"].apply(correct_spelling)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["spelling_corrected"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2              i bought this phone for  and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchased am happy with this product
7              the battery life is excellent but the charging cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [12]:
# Remove stopwords
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define stopwords list
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    words = text.split()  # Split text into words
    filtered_words = []  # Create an empty list to store words after stopword removal

    for word in words:  # Loop through each word in the list of words
        lower_word = word.lower()  # Convert the word to lowercase for uniform comparison
    
        if lower_word not in stop_words:  # Check if the lowercase word is NOT in the stopwords list
            filtered_words.append(word)  # If it's not a stopword, add it to the filtered list

    return " ".join(filtered_words)  # Join words back into a sentence

# Apply the function to the column
df["stopwords_removed"] = df["spelling_corrected"].apply(remove_stopwords)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["stopwords_removed"])

0          product arrived time packaging great quality amazing
1                                          product amazing love
2                         bought phone hz display totally worth
3                             wow product awesome bit expensive
4                                   laptop works perfectly fine
5                                    check full product details
6                                 great purchased happy product
7                   battery life excellent charging cable short
8                            cannot believe good expect quality
9                    love product fast delivery amazing quality
10                         honest expecting much oh god awesome
11                                  best product ever used life
12    shoes comfortable fitting nicely worked perfectly jogging
Name: stopwords_removed, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Stemming - reduces words to their base root by chopping off suffixes
from nltk.stem import PorterStemmer

# Initialize the stemmer
stemmer = PorterStemmer()

# Function to apply stemming
def stem_text(text):
    if not isinstance(text, str):
        return ""

    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming
    return " ".join(stemmed_words)

# Apply the function
df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["stemmed_words"])

0     product arriv time packag great qualiti amaz
1                                product amaz love
2              bought phone hz display total worth
3                    wow product awesom bit expens
4                       laptop work perfectli fine
5                        check full product detail
6                      great purchas happi product
7              batteri life excel charg cabl short
8                cannot believ good expect qualiti
9          love product fast deliveri amaz qualiti
10                honest expect much oh god awesom
11                      best product ever use life
12        shoe comfort fit nice work perfectli jog
Name: stemmed_words, dtype: object


In [23]:
import nltk

# Download the required resources
nltk.download('wordnet')                    # For lemmatization
nltk.download('omw-1.4')                     # WordNet lexical database
nltk.download('averaged_perceptron_tagger_eng')  # For POS tagging
nltk.download('punkt_tab')                       # For tokenization

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
# Lemmatization - reduces words to their base dictionary form (lemma)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):  # Verb
        return wordnet.VERB
    elif nltk_tag.startswith('N'):  # Noun
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

# Function to lemmatize text with POS tagging
def lemmatize_text(text):
    if not isinstance(text, str):  # Ensure input is a string
        return ""

    words = word_tokenize(text)  # Tokenize text into words
    pos_tags = pos_tag(words)  # Get POS tags
    
    # Lemmatize each word with its correct POS tag
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    
    return " ".join(lemmatized_words)  # Join words back into a sentence

# Apply the function to the column
df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["lemmatized"])

0     product arrive time packaging great quality amazing
1                                      product amaze love
2                      buy phone hz display totally worth
3                       wow product awesome bit expensive
4                              laptop work perfectly fine
5                               check full product detail
6                            great purchase happy product
7               battery life excellent charge cable short
8                     can not believe good expect quality
9              love product fast delivery amazing quality
10                      honest expect much oh god awesome
11                             best product ever use life
12         shoe comfortable fit nicely work perfectly jog
Name: lemmatized, dtype: object


In [29]:
import nltk
from nltk.tokenize import word_tokenize

# Download tokenizer if not already available
nltk.download('punkt')

# Function to tokenize text
def tokenize_text(text):
    if not isinstance(text, str):  # Ensure the input is a string
        return []
    return word_tokenize(text)  # Tokenize text into words

# Apply tokenization to the column
df["tokenized"] = df["lemmatized"].apply(tokenize_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["tokenized"])

0     [product, arrive, time, packaging, great, quality, amazing]
1                                          [product, amaze, love]
2                       [buy, phone, hz, display, totally, worth]
3                         [wow, product, awesome, bit, expensive]
4                                 [laptop, work, perfectly, fine]
5                                  [check, full, product, detail]
6                               [great, purchase, happy, product]
7                [battery, life, excellent, charge, cable, short]
8                      [can, not, believe, good, expect, quality]
9               [love, product, fast, delivery, amazing, quality]
10                       [honest, expect, much, oh, god, awesome]
11                               [best, product, ever, use, life]
12         [shoe, comfortable, fit, nicely, work, perfectly, jog]
Name: tokenized, dtype: object


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Save to file

In [None]:
df.to_csv("Processed_Reviews.csv", index=False)  # Saves without the index column