In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [41]:
# Load the company reviews CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="sg_companies_reviews.csv")

In [42]:
# Count the missing values in each column (isnull is an alias of isna).
df.isnull().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title        66
Job Title            5
Job Details          0
Location          2763
Pros                 0
Cons                 0
dtype: int64

In [43]:
def format_date(x, input_format = "%d-%b-%y", ignore_errors = False):
    """Parse a date string into a ``datetime``.

    Args:
        x: Value to parse; it is handed straight to ``datetime.strptime``.
        input_format: ``strptime`` format string (default ``"%d-%b-%y"``).
        ignore_errors: When true, any failure while parsing returns ``x``
            unchanged instead of raising.

    Returns:
        The parsed ``datetime``, or ``x`` itself when parsing failed and
        ``ignore_errors`` is true.

    Raises:
        Exception: Whatever ``strptime`` raised, when ``ignore_errors`` is
            false.
    """
    try:
        parsed = datetime.strptime(x, input_format)
    except Exception:
        # Strict mode: let the original parsing error propagate.
        if not ignore_errors:
            raise
        # Lenient mode: hand the unparsed value back untouched.
        return x
    return parsed

def clean_df_dates(df, date_col, **args):
    """Run ``format_date`` over every value of one DataFrame column.

    The column is overwritten on the DataFrame that was passed in (no copy is
    made), and that same DataFrame is returned.

    Args:
        df: DataFrame holding the column to convert.
        date_col: Name of the column to convert.
        **args: Keyword arguments forwarded to ``format_date``
            (``input_format``, ``ignore_errors``).

    Returns:
        ``df``, with ``date_col`` replaced by the converted values.
    """
    def _parse(value):
        return format_date(value, **args)

    df[date_col] = df[date_col].apply(_parse)
    return df

# Parse "Review Date" into datetimes; with ignore_errors=True any value that
# fails to parse is left in the column unchanged.
df = clean_df_dates(df, date_col="Review Date", ignore_errors=True)

In [52]:
from cleantext import clean
import emoji

def convert_emoji_to_text(emoji_text):
    """Replace emoji in a string with ``<alias>`` text markers.

    For example, the pile-of-poo emoji becomes ``<pile_of_poo>``.

    Args:
        emoji_text: Text that may contain emoji. Non-string values (e.g. NaN
            for a missing cell) are returned unchanged.

    Returns:
        The text with each emoji replaced by its alias wrapped in ``<`` and
        ``>``, or the input itself when it is not a string or the conversion
        fails.
    """
    # Nothing to convert for missing / non-text values.
    if not isinstance(emoji_text, str):
        return emoji_text
    try:
        return emoji.demojize(emoji_text, delimiters=("<", ">"))
    except Exception:
        # Best effort: keep the original text if conversion fails. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long-running cell can still be interrupted.
        return emoji_text

def is_ascii(s):
    """Tell whether a string consists solely of ASCII characters.

    Args:
        s: The string to check.

    Returns:
        True if every character of ``s`` can be encoded as ASCII (the empty
        string counts as ASCII), False otherwise.
    """
    try:
        # Encoding straight to ASCII fails on any character outside the ASCII
        # range, including lone surrogates that cannot be encoded as UTF-8.
        s.encode("ascii")
    except UnicodeError:
        return False
    return True

def get_non_ascii_strings(df, col):
    """Select the rows whose value in ``col`` contains non-ASCII characters.

    Each value is converted with ``str()`` before being checked, so
    non-string cells are judged by their string form.

    Args:
        df: DataFrame to filter.
        col: Name of the column to inspect.

    Returns:
        The subset of ``df`` rows where ``col`` is not pure ASCII.
    """
    def _has_non_ascii(value):
        return not is_ascii(str(value))

    # Boolean mask, one entry per row of the inspected column.
    flagged_rows = df[col].apply(_has_non_ascii)
    return df[flagged_rows]

def clean_text(df, col):
    """Return a copy of ``df`` with one text column cleaned.

    Each non-missing value is passed through ``clean`` (emoji kept,
    punctuation removed) and then through ``convert_emoji_to_text``.
    Missing values (NaN/None) are left untouched so they remain detectable
    with ``isna()`` afterwards.

    Args:
        df: Source DataFrame; it is not modified.
        col: Name of the text column to clean.

    Returns:
        A new DataFrame in which ``col`` holds the cleaned values.
    """
    def _clean_value(value):
        # Skip missing cells: clean() would otherwise turn them into text
        # (presumably the literal "nan", since it stringifies its input;
        # confirm against the clean-text source).
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        cleaned = clean(value, no_emoji=False, no_punct=True)
        return convert_emoji_to_text(cleaned)

    df = df.copy()
    df[col] = df[col].apply(_clean_value)
    return df

# Clean each free-text column in turn, then report how many rows of that
# column still contain non-ASCII characters (and show them, if any).
text_columns = ['Review Title', 'Job Title', 'Job Details', 'Pros', 'Cons']
df_clean = df.copy()
for column_to_check in text_columns:
    df_clean = clean_text(df_clean, column_to_check)
    result = get_non_ascii_strings(df_clean, column_to_check)
    print(f"Non-ASCII strings in '{column_to_check}':", len(result), sep="\n")
    if len(result):
        display(result)


Non-ASCII strings in 'Review Title':
0
Non-ASCII strings in 'Job Title':
0
Non-ASCII strings in 'Job Details':
0
Non-ASCII strings in 'Pros':
0
Non-ASCII strings in 'Cons':
0


In [59]:
# Spot-check one review title before and after cleaning.
print(df['Review Title'].iloc[4088])
print("==========================", "After cleaning", "==========================", sep="\n")
print(df_clean['Review Title'].iloc[4088])

💩Jaipur sitapura uper management
After cleaning
<pile_of_poo>jaipur sitapura uper management
