# Pre-processing steps of the Collected Data

### 1. Load Datasets

In [30]:
# Load Datasets
# Reads the scraped reviews from a CSV located relative to the notebook's
# working directory into the DataFrame `df`, which every later cell uses.
import pandas as pd
file_path = "extracted-reviews.csv"
df = pd.read_csv(file_path)

# Display column content without truncation.
# This is a session-wide pandas display option, so it also affects every
# later cell that prints a DataFrame.
pd.set_option('display.max_colwidth', None) #Set to None for unlimited width
# Prints the frame as plain text (pandas abbreviates the middle rows with "..").
print(df)

                                                                                                                   google-link  \
0          https://lh3.googleusercontent.com/a-/ALV-UjWIyPjByq2ksozdvz5_SdR8zYvdRhhMjwrQldI-8FFQdEuWIE3G=w36-h36-p-rp-mo-br100   
1       https://lh3.googleusercontent.com/a-/ALV-UjUPeZrhls5ywqZt9CQ_7kpBOD54Kss5P5CusE-KCrwlX8Z2Lfw=w36-h36-p-rp-mo-ba6-br100   
2    https://lh3.googleusercontent.com/a-/ALV-UjXh6FqgpYfmdCPetLMXu069LnqtRmGlTIJf9U05p9dg2622uZDviw=w36-h36-p-rp-mo-ba3-br100   
3      https://lh3.googleusercontent.com/a-/ALV-UjW0r7gMvGVS3BCUSmbdy4mzlfSdJsEL1_9J9oU4Je5IABcT5umI=w36-h36-p-rp-mo-ba3-br100   
4      https://lh3.googleusercontent.com/a-/ALV-UjVhbz11TIeUOx9qQwiPyOozuBIZWVKLRqJDdip5Z0ahnhjivqCa=w36-h36-p-rp-mo-ba2-br100   
..                                                                                                                         ...   
355         https://lh3.googleusercontent.com/a-/ALV-UjX9XxhvJp1Ci0qtZO8zFDqHyd0oVoIcs6qkj

In [31]:
# Remove the reviewer avatar URL column ("google-link"); it carries no review text.
# The column is dropped by name rather than by position so the cell is
# idempotent: dropping `df.columns[0]` would remove "username" (and then
# further columns) every time the cell is re-run. `errors='ignore'` makes a
# re-run a no-op once the column is already gone.
df = df.drop(columns=['google-link'], errors='ignore')
print(df)

                username                               Past Action  \
0    Gayathiri Sivakumar                      3 reviews · 5 photos   
1             Azam Kamal  Local Guide · 205 reviews · 1,571 photos   
2          Malihah Yusof      Local Guide · 30 reviews · 71 photos   
3     Nurul Azida (Zida)      Local Guide · 16 reviews · 58 photos   
4         Najihah Pazaer       Local Guide · 5 reviews · 19 photos   
..                   ...                                       ...   
355        nida fathinah                       8 reviews · 1 photo   
356        Zafirah Ishak       Local Guide · 19 reviews · 7 photos   
357          Syad Hamran      Local Guide · 12 reviews · 45 photos   
358   Nurul humairah Moe                       3 reviews · 1 photo   
359            alep jaww                                13 reviews   

             Time  \
0    2 months ago   
1    2 months ago   
2    3 months ago   
3    4 months ago   
4    2 months ago   
..            ...   
355   2 year

### 2. Importing relevant modules and functions.

In [32]:
import pandas as pd 
import re 
import emoji 
import string 
import nltk 
 
from bs4 import BeautifulSoup 
from autocorrect import Speller 
from nltk.corpus import stopwords, wordnet 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag 
 
# Download required NLTK resources.
# Each call checks the local nltk_data directory and only fetches what is
# missing (the captured output below reports "already up-to-date").
nltk.download('stopwords')                   # Stop-word lists used by remove_stopwords
nltk.download('wordnet')                    # For lemmatization 
nltk.download('omw-1.4')                     # 'omw-1.4' resource (Open Multilingual Wordnet) - presumably required by the WordNet lemmatizer; confirm
nltk.download('averaged_perceptron_tagger_eng')  # For POS tagging 
nltk.download('punkt_tab')                       # For tokenization 

# Initialize tools (shared, module-level objects used by the helper functions below)
spell = Speller(lang='en')                     # English spelling corrector, called as spell(text)
stop_words = set(stopwords.words('english'))   # a set, so membership tests are fast
lemmatizer = WordNetLemmatizer()               # used by lemmatize_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [33]:
# Dictionary of slang words and their replacements.
# Keys must be lowercase: replace_slang matches them case-insensitively as
# whole words and looks the match up via `.lower()`.
# The "idk" expansion contains "don't"; the pipeline runs the contraction step
# after the slang step, so it is subsequently expanded to "do not".
slang_dict = { 
    "tbh": "to be honest", 
    "omg": "oh my god", 
    "lol": "laugh out loud", 
    "idk": "I don't know", 
    "brb": "be right back", 
    "btw": "by the way", 
    "imo": "in my opinion", 
    "smh": "shaking my head", 
    "fyi": "for your information", 
    "np": "no problem", 
    "ikr": "I know right", 
    "asap": "as soon as possible", 
    "bff": "best friend forever", 
    "gg": "good game", 
    "hmu": "hit me up", 
    "rofl": "rolling on the floor laughing",
    # NOTE(review): a single-letter key matches every standalone "w" token
    # (e.g. the "w" in "w/"), not only the slang usage - confirm this is intended.
    "w": "win"
} 
 
# Contractions dictionary: lowercase contraction -> expanded form.
# Keys are written with the straight ASCII apostrophe ('); text that uses the
# typographic apostrophe (’) only matches if the matching code accounts for it.
# NOTE(review): some expansions are ambiguous in English ("he's" may be
# "he has", "i'd" may be "i had"); the mapping always picks one reading.
contractions_dict = { 
    "wasn't": "was not", 
    "isn't": "is not", 
    "aren't": "are not", 
    "weren't": "were not", 
    "doesn't": "does not", 
    "don't": "do not", 
    "didn't": "did not", 
    "can't": "cannot", 
    "couldn't": "could not", 
    "shouldn't": "should not", 
    "wouldn't": "would not", 
    "won't": "will not", 
    "haven't": "have not", 
    "hasn't": "has not", 
    "hadn't": "had not", 
    "i'm": "i am", 
    "you're": "you are", 
    "he's": "he is", 
    "she's": "she is", 
    "it's": "it is", 
    "we're": "we are", 
    "they're": "they are", 
    "i've": "i have", 
    "you've": "you have", 
    "we've": "we have", 
    "they've": "they have", 
    "i'd": "i would", 
    "you'd": "you would", 
    "he'd": "he would", 
    "she'd": "she would", 
    "we'd": "we would", 
    "they'd": "they would", 
    "i'll": "i will", 
    "you'll": "you will", 
    "he'll": "he will", 
    "she'll": "she will", 
    "we'll": "we will", 
    "they'll": "they will", 
    "let's": "let us", 
    "that's": "that is", 
    "who's": "who is", 
    "what's": "what is", 
    "where's": "where is", 
    "when's": "when is", 
    "why's": "why is" 
}

#### The preprocessing pipeline applies the following steps in order; each step's output is stored in the column named below.
#### 1. lowercase: Convert the string to lowercase
#### 2. no_urls: Remove URLs from the string
#### 3. no_html: Remove HTML tags from the string
#### 4. no_emojis: Remove emojis from the string
#### 5. slang_replaced: Replace slang abbreviations with their full forms
#### 6. contractions_replaced: Expand contractions into their full forms
#### 7. no_punctuation: Remove punctuation from the string
#### 8. no_numbers: Remove numbers from the string
#### 9. spelling_corrected: Correct any misspellings in the string
#### 10. no_stopwords: Remove all stopwords from the string
#### 11. lemmatized: Lemmatize each word in the string
#### 12. tokenized: Split the string into a list of tokens

In [34]:
# Remove any URLs that start with "http://", "https://" or "www." from the text
def remove_urls(text):
    """Strip web addresses from ``text``.

    A URL is recognised only when it begins at a word boundary with
    ``http://``, ``https://`` or ``www.`` and runs up to the next whitespace.
    Anchoring the match this way prevents ordinary words that merely contain
    the letters "www" or "http" (e.g. "awwww") from being partially deleted,
    which the unanchored pattern ``http\\S+|www\\S+`` did.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every URL replaced by the empty string. Surrounding
        whitespace is left untouched.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

In [35]:
# Keep only the textual content, discarding every HTML tag
def remove_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its plain text."""
    parsed_markup = BeautifulSoup(text, "html.parser")
    plain_text = parsed_markup.get_text()
    return plain_text

In [36]:
# Strip emojis by substituting each one with the empty string
def remove_emojis(text):
    """Return ``text`` with every emoji replaced by ''."""
    emoji_free_text = emoji.replace_emoji(text, replace='')
    return emoji_free_text

In [37]:
# Expand internet slang / chat abbreviations using slang_dict
def replace_slang(text):
    """Replace every whole-word slang term in ``text`` with its full form.

    Matching is case-insensitive; the matched word is lowercased before the
    lookup in ``slang_dict``.
    """
    # One alternation of all slang keys, each escaped so it is matched literally
    alternation = '|'.join(re.escape(term) for term in slang_dict.keys())
    slang_pattern = r'\b(' + alternation + r')\b'

    # Substitute each hit with the dictionary entry for its lowercase form
    return re.sub(
        slang_pattern,
        lambda hit: slang_dict[hit.group(0).lower()],
        text,
        flags=re.IGNORECASE,
    )

In [38]:
# Function to expand contractions
# Build the regex pattern for contractions.
# The dictionary keys use the straight ASCII apostrophe ('), but review text
# frequently contains the typographic apostrophe (’, U+2019). Each key is
# therefore turned into a pattern whose apostrophe position accepts either
# character; the remaining parts of the key are escaped so they match literally.
escaped_contractions = [
    "['\u2019]".join(re.escape(part) for part in contraction.split("'"))
    for contraction in contractions_dict.keys()
]

# Join the per-contraction patterns with '|'
joined_contractions = "|".join(escaped_contractions)

# Create a regex pattern with word boundaries (\b)
contractions_pattern = r'\b(' + joined_contractions + r')\b'

# Compile once at definition time; the function below reuses it on every call
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

# Define a function to replace contractions
def replace_contractions(text):
    """Expand the contractions listed in ``contractions_dict``.

    Matching is case-insensitive and accepts both the straight (') and the
    typographic (’) apostrophe. Text that is not a known contraction,
    including any other apostrophes, is left unchanged.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every recognised contraction replaced by the expanded
        form stored in ``contractions_dict``.
    """
    # Function to handle each match found
    def replace_match(match):
        # Normalise case and apostrophe style so the lookup hits the dictionary key
        lookup_key = match.group(0).lower().replace("\u2019", "'")
        return contractions_dict[lookup_key]

    # Apply regex substitution
    return compiled_pattern.sub(replace_match, text)

In [39]:
# Function to remove punctuation
def remove_punctuation(text):
    """Remove ASCII and Unicode punctuation from ``text``.

    ``string.punctuation`` only lists ASCII characters, so typographic marks
    such as ’ “ ” … · and — were previously left in the text. This version
    handles them as follows:

    * every character in ``string.punctuation`` is deleted (unchanged from
      the ASCII-only behaviour, e.g. "well-known" -> "wellknown");
    * non-ASCII dash characters (Unicode category ``Pd``, e.g. the em dash)
      become a single space, because they usually separate two words that
      are written without surrounding spaces ("sweet—after");
    * every other character whose Unicode category starts with ``P`` is
      deleted.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The text without punctuation characters.
    """
    import unicodedata  # local import: only this function needs it

    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue  # ASCII punctuation: drop, exactly as before
        category = unicodedata.category(char)
        if category == "Pd":
            kept_chars.append(" ")  # Unicode dash: keep the word separation
        elif not category.startswith("P"):
            kept_chars.append(char)
    return "".join(kept_chars)

In [40]:
# Strip every run of digits from the text
def remove_numbers(text):
    """Return ``text`` with all digit sequences deleted (nothing is inserted in their place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [41]:
# Spelling correction via the shared AutoCorrect Speller instance
def correct_spelling(text):
    """Run ``text`` through the module-level ``spell`` object and return its result."""
    corrected_text = spell(text)
    return corrected_text

In [42]:
# Drop stopwords from a whitespace-separated string
def remove_stopwords(text):
    """Return ``text`` without the words found in ``stop_words``.

    The text is split on whitespace, each word is compared in lowercase
    against ``stop_words``, and the surviving words are joined with single
    spaces.
    """
    kept_words = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [43]:
# Translate an NLTK POS tag into the matching WordNet POS constant
def get_wordnet_pos(nltk_tag):
    """Map ``nltk_tag`` to a WordNet part of speech by its leading letter.

    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb; any other tag
    falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

In [44]:
# POS-aware lemmatization of a whole string
def lemmatize_text(text):
    """Lemmatize every token of ``text`` using its POS tag.

    Non-string input yields an empty string. Otherwise the text is tokenized,
    POS-tagged, each token is lemmatized with the WordNet POS derived from its
    tag, and the lemmas are joined with single spaces.
    """
    if not isinstance(text, str):
        return ""

    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

In [45]:
# Split text into word tokens
def tokenize_text(text):
    """Return the ``word_tokenize`` tokens of ``text``, or an empty list for non-string input."""
    return word_tokenize(text) if isinstance(text, str) else []

In [46]:
# Apply preprocessing pipeline
# Every stage reads the previous stage's column and writes its own, so each
# intermediate result stays available for inspection.
#
# Missing reviews are normalised to empty strings before the first stage:
# `.str.lower()` passes NaN through unchanged, and the regex-based helpers
# that follow raise TypeError on a non-string value. `astype(str)` likewise
# guards against any non-text cell. Rows that already hold text are unaffected.
df['lowercase'] = df['Review'].fillna('').astype(str).str.lower()
df['no_urls'] = df['lowercase'].apply(remove_urls)
df['no_html'] = df['no_urls'].apply(remove_html)
df['no_emojis'] = df['no_html'].apply(remove_emojis)
# Slang is expanded before contractions so that contractions introduced by a
# slang expansion are expanded as well.
df['slang_replaced'] = df['no_emojis'].apply(replace_slang)
df['contractions_replaced'] = df['slang_replaced'].apply(replace_contractions)
# Punctuation is removed only after contractions are expanded, because the
# contraction patterns rely on the apostrophe.
df['no_punctuation'] = df['contractions_replaced'].apply(remove_punctuation)
df['no_numbers'] = df['no_punctuation'].apply(remove_numbers)
df['spelling_corrected'] = df['no_numbers'].apply(correct_spelling)
df['no_stopwords'] = df['spelling_corrected'].apply(remove_stopwords)
df['lemmatized'] = df['no_stopwords'].apply(lemmatize_text)
df['tokenized'] = df['lemmatized'].apply(tokenize_text)
# Final output column: the token list produced by the last stage
df['processed_review'] = df['tokenized']

  return BeautifulSoup(text, "html.parser").get_text()


In [47]:
# List the columns now present: the original fields plus one column per preprocessing stage
print(df.columns)

Index(['username', 'Past Action', 'Time', 'Review', 'lowercase', 'no_urls',
       'no_html', 'no_emojis', 'slang_replaced', 'contractions_replaced',
       'no_punctuation', 'no_numbers', 'spelling_corrected', 'no_stopwords',
       'lemmatized', 'tokenized', 'processed_review'],
      dtype='object')


In [48]:
# Save the cleaned dataset.
# Writes every column of df (original fields and all intermediate stages) to a
# CSV in the working directory, without the row index.
# NOTE(review): 'tokenized' and 'processed_review' come from tokenize_text and
# so presumably hold Python lists, which a CSV stores as text - confirm how
# downstream code reads them back.
df.to_csv("processed-reviewsv2.csv", index=False) 

In [50]:
# Display the first few rows: raw review text side by side with its final
# processed form, as a quick visual check of the pipeline.
print(df[["Review", "processed_review"]].head()) 

                                                                                                                                                                                                                                                                          Review  \
0                             I recently visited Suka Desert and tried a variety of their offerings. The pancake was exceptionally fluffy and enjoyable, while the vanilla 3-phase ice cream was delightful overall, though the melted chocolate phase was overly sweet for my …   
1                                                                                                                                                                     My review is simple because I love chocolate. Of course, it’s sweet—after all, that’s what chocolate is! …   
2                             Not sure why people are so hype about this. They sure have various dessert but the cakes taste just like the ones you'd get at regular bakery.