In [1]:
import json
import re

import polars as pl
import string

from textblob import TextBlob
import emoji

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pl.read_csv("https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv")

In [6]:
print(data)

shape: (50_000, 2)
┌─────────────────────────────────┬───────────┐
│ review                          ┆ sentiment │
│ ---                             ┆ ---       │
│ str                             ┆ str       │
╞═════════════════════════════════╪═══════════╡
│ One of the other reviewers has… ┆ positive  │
│ A wonderful little production.… ┆ positive  │
│ I thought this was a wonderful… ┆ positive  │
│ Basically there's a family whe… ┆ negative  │
│ Petter Mattei's "Love in the T… ┆ positive  │
│ …                               ┆ …         │
│ I thought this movie did a dow… ┆ positive  │
│ Bad plot, bad dialogue, bad ac… ┆ negative  │
│ I am a Catholic taught in paro… ┆ negative  │
│ I'm going to have to disagree … ┆ negative  │
│ No one expects the Star Trek m… ┆ negative  │
└─────────────────────────────────┴───────────┘


In [9]:
data.glimpse()

Rows: 50000
Columns: 2
$ review    <str> "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the

In [3]:
data.head()

review,sentiment
str,str
"""One of the other reviewers has…","""positive"""
"""A wonderful little production.…","""positive"""
"""I thought this was a wonderful…","""positive"""
"""Basically there's a family whe…","""negative"""
"""Petter Mattei's ""Love in the T…","""positive"""


In [None]:
data.columns

list

In [5]:
data.describe()

statistic,review,sentiment
str,str,str
"""count""","""50000""","""50000"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""A Turkish Bath sequence in…","""negative"""
"""25%""",,
"""50%""",,
"""75%""",,
"""max""","""ý thýnk uzak ýs the one of the…","""positive"""


### Tasks list for data cleanup
- tokenization
- uppercase to lowercase
- emoji removal
- panctuation removal
- html / url tag removal
- stopword removal
- abbreviation or slang correction
- stemming and lemmetization
- spelling correction
- whitespace removal

In [4]:
type(data["review"])

polars.series.series.Series

#### Test dataframe to check functionality

In [None]:
data_bkup = data.clone() # create backup in case you want to go back to original data

In [2]:
punctuation_marks = string.punctuation
punctuation_pattern = f"[{re.escape(punctuation_marks)}]"

In [21]:
# Example DataFrame
data = pl.DataFrame({
    "review": [
        "This is a  great product!!!  ",
        "Check   it  out...  It works well, right?  ",
        "LMAO this product is gr8",
        "afaik, it wrks for me but ATM I have no idea.",
        "AFAIK, this is the best prodcut, ATM it works fine.",
        "I love this product! 😊👍",
        None
    ],
    "comments": [
        "😍Great service!! Highly recommend... ",
        "Not bad... could be improved!! ",
        None,
        "I want it asap",
        "Not great, very disappointed 😞",
        "A3, can't wait to see moer.",
        "I will be AFK for a while, but ASAP I'll respnd.",
    ]
})
with open('../slang_output.json', 'r') as file:
    chat_slang = json.load(file)

In [4]:
chat_slang.keys()

dict_keys(['AFAIK', 'AFK', 'ASAP', 'ATK', 'ATM', 'A3', 'BAK', 'BBL', 'BBS', 'BFN', 'B4N', 'BRB', 'BRT', 'BTW', 'B4', 'CU', 'CUL8R', 'CYA', 'FAQ', 'FC', 'FWIW', 'FYI', 'GAL', 'GG', 'GN', 'GMTA', 'GR8', 'G9', 'IC', 'ICQ', 'ILU', 'IMHO', 'IMO', 'IOW', 'IRL', 'KISS', 'LDR', 'LMAO', 'LOL', 'LTNS', 'L8R', 'MTE', 'M8', 'NRN', 'OIC', 'PITA', 'PRT', 'PRW', 'ROFL', 'ROFLOL', 'ROTFLMAO', 'SK8', 'STATS', 'ASL', 'THX', 'TTFN', 'TTYL', 'U', 'U2', 'U4E', 'WB', 'WTF', 'WTG', 'WUF', 'W8', '7K', 'TFW', 'MFW', 'MRW', 'IFYP', 'TNTL', 'JK', 'IDC', 'ILY', 'IMU', 'ADIH', 'ZZZ', 'WYWH', 'TIME', 'BAE', 'FIMH', 'BSAAW', 'BWL', 'BFF', 'CSL'])

In [5]:
def replace_abbreviations(column, slang_dict):
    # Function to replace abbreviations in a column (case-sensitive)
    for slang, full_form in slang_dict.items():
        column = column.str.replace_all(rf"\b{slang}\b", full_form)
    return column

In [6]:
def correct_spelling(text):
    # Function to correct spelling using TextBlob
    if text:
        blob = TextBlob(text)
        return str(blob.correct().string)
    return text

In [7]:
# List of columns to clean
columns_to_clean = ["review", "comments"]

In [8]:

def convert_emoji_to_text(text):
    return emoji.demojize(text)

In [9]:
def clean_column(column):
    # Function to clean text in each column
    # step 1: replace chat slang with abbreviation
    column = replace_abbreviations(column, chat_slang)
    # step 2: data cleanup
    column = (
        column.str.replace_all(r'https?://\S+|www\.\S+', '') # remove url
    .str.replace_all(r'<.*?>', '')  # remove html tags
    .str.replace_all(punctuation_pattern, '')  # remove punctuation marks
    .str.replace_all(r"[^\x00-\x7F]+", "")  # Optionally remove any non-ASCII characters
    .str.to_lowercase() # convert to lower case
    )
    return column

In [27]:
# Apply the cleaning function to each column
data = data.with_columns([
    clean_column(pl.col(col)).alias(col) for col in columns_to_clean
])

In [10]:
def remove_stop_words(text):
    new_text = []
    for words in text.split():
        if words in stopwords.words('english'):
            new_text.append("")
        else:
            new_text.append(words.strip())
    return " ".join(new_text)

In [14]:
def clean_text_column(data: pl.DataFrame, columns_to_clean: list) -> pl.DataFrame:
    for col in columns_to_clean:
        data = data.with_columns([
            pl.col(col).map_elements(
                lambda x: remove_stop_words(x) if x else x,
                return_dtype=pl.Utf8
            ).alias(f"{col}_cleaned_stops"),
            pl.col(f"{col}_cleaned_stops").map_elements(
                lambda x: convert_emoji_to_text(x) if x else x,
                return_dtype=pl.Utf8
            ).alias(col)
        ])
    return data

In [22]:
print(data)

shape: (7, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ review                          ┆ comments                        │
│ ---                             ┆ ---                             │
│ str                             ┆ str                             │
╞═════════════════════════════════╪═════════════════════════════════╡
│ This is a  great product!!!     ┆ 😍Great service!! Highly        │
│                                 ┆ recomm…                         │
│ Check   it  out...  It works w… ┆ Not bad... could be improved!!… │
│ LMAO this product is gr8        ┆ null                            │
│ afaik, it wrks for me but ATM … ┆ I want it asap                  │
│ AFAIK, this is the best prodcu… ┆ Not great, very disappointed 😞 │
│ I love this product! 😊👍       ┆ A3, can't wait to see moer.     │
│ null                            ┆ I will be AFK for a while, but… │
└─────────────────────────────────┴─────────────────────────────────┘


In [23]:
for col in columns_to_clean:
    data = data.with_columns([
        pl.col("review").map_elements(
            lambda x: remove_stop_words(x) if x else x,
            return_dtype=pl.Utf8
        ).alias("review"),
    ])

In [25]:
for col in columns_to_clean:
    data = data.with_columns([
        pl.col("review").map_elements(
            lambda x: convert_emoji_to_text(x) if x else x,
            return_dtype=pl.Utf8
        ).alias("review"),
    ])

In [26]:
print(data)

shape: (7, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ review                          ┆ comments                        │
│ ---                             ┆ ---                             │
│ str                             ┆ str                             │
╞═════════════════════════════════╪═════════════════════════════════╡
│ This great product!!!           ┆ 😍Great service!! Highly        │
│                                 ┆ recomm…                         │
│ Check out... It works well, ri… ┆ Not bad... could be improved!!… │
│ LMAO product gr8                ┆ null                            │
│ afaik, wrks ATM I idea.         ┆ I want it asap                  │
│ AFAIK, best prodcut, ATM works… ┆ Not great, very disappointed 😞 │
│ I love product! :smiling_face_… ┆ A3, can't wait to see moer.     │
│ null                            ┆ I will be AFK for a while, but… │
└─────────────────────────────────┴─────────────────────────────────┘


# if there is a functionality which is not supported by polars you can conver Polars Dataframe to Pandas Dataframe and use Pandas functionality

In [None]:
data_pandas = data.to_pandas()
for col in columns_to_clean:
    data_pandas[col] = data_pandas[col].apply(lambda x: remove_stop_words(x) if x else x)
    data_pandas[col] = data_pandas[col].apply(lambda x: convert_emoji_to_text(x) if x else x)
data = pl.from_pandas(data_pandas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
print(data)

shape: (7, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ review                          ┆ comments                        │
│ ---                             ┆ ---                             │
│ str                             ┆ str                             │
╞═════════════════════════════════╪═════════════════════════════════╡
│ this great product              ┆ great service highly recommend… │
│ check out it works well right   ┆ not bad could be improved       │
│ laughing my a off product gr8   ┆ null                            │
│ afaik wrks at the moment i ide… ┆ i want it asap                  │
│ as far as i know best prodcut … ┆ not great very disappointed     │
│ i love product smilingfacewith… ┆ anytime anywhere anyplace cant… │
│ null                            ┆ i will be away from keyboard f… │
└─────────────────────────────────┴─────────────────────────────────┘


In [146]:
data

review,comments
str,str
"""this great product""","""smilingfacewithhearteyesgreat …"
"""check out it works well right""","""not bad could improved"""
"""lmao product gr8""",
"""afaik wrks atm i idea""","""i want asap"""
"""afaik best prodcut atm works f…","""not great disappointed disappo…"
"""i love product smilingfacewith…","""a3 cant wait see moer"""
,"""i afk while asap ill respnd"""
