In [11]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Load file
df = pd.read_csv("UNITENReview.csv", encoding="utf-")

# NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("punkt")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def remove_urls(t):
    return re.sub(r"http\S+|www\S+", "", t)

def remove_html(t):
    return BeautifulSoup(t, "html.parser").get_text()

def remove_emojis(t):
    return emoji.replace_emoji(t, replace="")

slang = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "i don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
}

def replace_slang(t):
    pattern = r"\b(" + "|".join(map(re.escape, slang.keys())) + r")\b"
    return re.sub(pattern, lambda m: slang[m.group(0).lower()], t, flags=re.IGNORECASE)

def normalize_apostrophe(t):
    return t.replace("â€™", "'")

contractions = {
    "i'm": "i am",
    "it's": "it is",
    "that's": "that is",
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
}

def replace_contractions(t):
    t = normalize_apostrophe(t)
    pattern = r"\b(" + "|".join(map(re.escape, contractions.keys())) + r")\b"
    return re.sub(
        pattern,
        lambda m: contractions[m.group(0).lower()],
        t,
        flags=re.IGNORECASE,
    )

def fix_letter_comma_letter(t):
    # "transportation,wifi" -> "transportation wifi"
    return re.sub(r"([a-zA-Z])[,;/]([a-zA-Z])", r"\1 \2", t)

def remove_punct(t):
    return t.translate(str.maketrans("", "", string.punctuation))

def remove_numbers(t):
    return re.sub(r"\d+", "", t)

def remove_stopwords(t):
    return " ".join([w for w in t.split() if w not in stop_words])

def get_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_text(t):
    words = word_tokenize(t)
    tags = pos_tag(words)
    return " ".join([lemmatizer.lemmatize(w, get_pos(tag)) for w, tag in tags])

def clean(t):
    if not isinstance(t, str):
        return ""
    t = t.lower()
    t = remove_urls(t)
    t = remove_html(t)
    t = remove_emojis(t)
    t = replace_slang(t)
    t = replace_contractions(t)
    t = fix_letter_comma_letter(t)
    t = remove_punct(t)
    t = remove_numbers(t)
    t = remove_stopwords(t)
    t = lemmatize_text(t)
    return t

df["final"] = df["Review"].apply(clean)

# print like the lab example
for i, text in enumerate(df["final"]):
    print(i, text)

# save only final cleaned text
df[["final"]].to_csv("Processed_UNITENReview.csv", index=False)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 12564: character maps to <undefined>