In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Load the raw review export that the cells below clean and re-save.
raw_reviews_csv = "data/new_sg_companies_reviews.csv"
df = pd.read_csv(raw_reviews_csv)

In [None]:
def format_date(x, input_format = "%d-%b-%y", ignore_errors = False):
    """Parse ``x`` into a ``datetime`` using ``input_format``.

    Parameters
    ----------
    x : value to parse; handed straight to ``datetime.strptime``.
    input_format : ``strptime`` format string, ``"%d-%b-%y"`` by default.
    ignore_errors : when true, any parsing failure returns ``x`` unchanged
        instead of raising.

    Returns
    -------
    The parsed ``datetime``, or the original ``x`` when parsing failed and
    ``ignore_errors`` is true.

    Raises
    ------
    Whatever ``datetime.strptime`` raised, when ``ignore_errors`` is false.
    """
    try:
        parsed = datetime.strptime(x, input_format)
    except Exception:
        if not ignore_errors:
            raise
        # Hand the value back untouched so a later pass can try another format.
        return x
    return parsed
    
def format_non_conventional_dates(x):
    """Turn a numeric day count into a timestamp; pass datetimes through.

    Parameters
    ----------
    x : a ``datetime`` (returned as is), a float, or anything ``float()``
        accepts (for example a numeric string).

    Returns
    -------
    * ``x`` itself when it is already a ``datetime``.
    * ``datetime(1899, 12, 30)`` plus ``x`` days when ``x`` is, or converts
      to, a float.
    * ``x`` unchanged, after printing a notice, when it cannot be converted
      to a float.
    """
    if isinstance(x, datetime):
        return x
    if not isinstance(x, float):
        try:
            x = float(x)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are treated as "unknown"; interrupts and
            # other unrelated errors are no longer swallowed by a bare except.
            print(f"{x} unknown date type")
            return x
    # 1899-12-30 is presumably the spreadsheet serial-date origin -- TODO confirm
    # against the tool that produced the CSV.
    return pd.Timedelta(x, unit='d') + datetime(1899, 12, 30)

def clean_df_dates(df, date_col, **args):
    """Parse the ``date_col`` column of ``df`` in three successive passes.

    The column is first run through ``format_date`` with ``"%d-%b-%y"``, then
    with ``"%b %d, %Y"``, and finally through
    ``format_non_conventional_dates``. Extra keyword arguments are forwarded
    to both ``format_date`` passes.

    The column is reassigned on ``df`` itself, and that same frame is returned.
    """
    for known_format in ("%d-%b-%y", "%b %d, %Y"):
        df[date_col] = df[date_col].apply(
            lambda value: format_date(value, input_format = known_format, **args)
        )
    # Last pass: handle whatever the two string formats did not parse.
    df[date_col] = df[date_col].apply(format_non_conventional_dates)
    return df

# Parse "Review Date". ignore_errors=True makes each format pass return values it
# cannot parse unchanged, so the following pass still gets a chance at them.
df = clean_df_dates(df, date_col="Review Date", ignore_errors=True)

In [None]:
# Move the current row index into an "id" column, then overwrite it with a
# string made of review date + review title + that original index value.
df = df.reset_index(names="id")


def _compose_review_uid(row):
    """Concatenate the row's date, title and numeric id into one string."""
    return f"{row['Review Date']}{row['Review Title']}{row['id']}"


df['id'] = df.apply(_compose_review_uid, axis=1)
df.to_csv("data/new_sg_companies_reviews_UID.csv", index=False)

In [None]:
from cleantext import clean
import emoji
import re

def clean_yj_style(df):
    """Normalise placeholders, bullets and line breaks across ``df``.

    The frame is modified in place (every ``replace`` uses ``inplace=True``
    and the ``Pros``/``Cons`` columns are reassigned) and the same object is
    returned.

    Steps, in order:

    1. Cells that are exactly ``'#NAME?'`` or ``''`` become ``pd.NA``.
    2. Every bullet character (U+2022) is removed and every right single
       quote (U+2019) becomes ``'``.
    3. A line break (``\\r\\n``, ``\\n\\r``, ``\\n`` or ``\\r``) immediately
       followed by ``-`` or ``*`` becomes a single space.
    4. Any remaining ``\\n``, ``\\r`` or ``\\t`` becomes a space.
    5. Leading ``'-'``/``' '`` characters, then leading ``'*'``/``' '``
       characters, are stripped from string values in ``Pros`` and ``Cons``.

    Parameters
    ----------
    df : DataFrame that must contain ``Pros`` and ``Cons`` columns.

    Returns
    -------
    The same ``df`` object, cleaned.
    """
    # Whole-cell matches only (no regex): spreadsheet error marker and empty text.
    df.replace('#NAME?', pd.NA, inplace=True)
    df.replace('', pd.NA, inplace=True)

    # Substring replacements from here on, hence regex=True.
    df.replace('\u2022', '', regex=True, inplace=True)
    df.replace('\u2019', "'", regex=True, inplace=True)

    # Line break + list marker -> one space. Previously the '\r-' / '\r*' cases
    # lacked regex=True (so they never matched inside text) and '\n*' was
    # replaced with '' which glued the neighbouring words together. Bullets
    # need no entry here because they were all removed above.
    df.replace(r'(?:\r\n|\n\r|\n|\r)[-*]', ' ', regex=True, inplace=True)

    # Remaining line breaks and tabs.
    df.replace(r'[\n\r\t]', ' ', regex=True, inplace=True)

    # Drop list markers left at the very start of the free-text fields.
    for col in ('Pros', 'Cons'):
        df[col] = df[col].map(
            lambda x: x.lstrip('- ').lstrip('* ') if isinstance(x, str) else x
        )
    return df

def convert_emoji_to_text(emoji_text):
    """Replace emoji in ``emoji_text`` with their text alias.

    ``emoji.demojize`` is called with delimiters ``("", "_emoji ")``, so each
    alias is emitted without a prefix and followed by ``"_emoji "``.

    This is best-effort: if ``demojize`` raises an ordinary exception (for
    instance because the value is not a string), the input is returned
    unchanged.

    Returns
    -------
    The demojized text, or ``emoji_text`` itself on failure.
    """
    try:
        return emoji.demojize(emoji_text, delimiters=("", "_emoji "))
    except Exception:
        # Was a bare ``except:``, which also trapped KeyboardInterrupt and
        # SystemExit; only regular errors are treated as "leave value alone".
        return emoji_text

def is_ascii(s):
    """Return ``True`` when every character of ``s`` is an ASCII character.

    Uses ``str.isascii`` rather than an encode/decode round trip, so strings
    that cannot be UTF-8 encoded (lone surrogates) yield ``False`` instead of
    raising ``UnicodeEncodeError``. The empty string counts as ASCII.
    """
    return s.isascii()

def get_non_ascii_strings(df, col):
    """Return the rows of ``df`` whose ``col`` value is not pure ASCII.

    Each value is converted with ``str()`` before the check, so non-string
    values are tested on their string representation.
    """
    def _has_non_ascii(value):
        return not is_ascii(str(value))

    # Boolean Series, one entry per row, used to filter the frame.
    flagged_rows = df[col].apply(_has_non_ascii)
    return df[flagged_rows]

def clean_text(df, col):
    """Return a copy of ``df`` with the text in ``col`` normalised.

    Four passes are applied to the column, in order:

    1. ``clean(..., no_emoji=False, no_punct=True)`` on string values.
    2. ``convert_emoji_to_text`` on every value.
    3. ``str.strip`` on string values.
    4. Runs of two or more spaces collapsed to one on string values.

    The input frame is not modified.
    """
    def _on_strings(transform):
        # Wrap a transform so that non-string values pass through untouched.
        return lambda value: transform(value) if isinstance(value, str) else value

    cleaned = df.copy()
    passes = (
        _on_strings(lambda text: clean(text, no_emoji=False, no_punct=True)),
        convert_emoji_to_text,
        _on_strings(lambda text: text.strip()),
        _on_strings(lambda text: re.sub(r' {2,}', ' ', text)),
    )
    for step in passes:
        cleaned[col] = cleaned[col].apply(step)
    return cleaned


In [None]:
# Set the identifier column aside so the text cleaning below never touches it.
index_column = ['id']
df_index = df[index_column]
df = df.drop(columns=index_column)

# Free-text columns that are normalised and then checked for non-ASCII leftovers.
text_columns = ['Review Title', 'Job Title', 'Job Details', 'Pros', 'Cons']
# NOTE: clean_yj_style modifies `df` in place and returns it, so `df_clean`
# and `df` refer to the same frame after this line.
df_clean = clean_yj_style(df)
for column_to_check in text_columns:
    # clean_text works on a copy, so rebind df_clean on every pass.
    df_clean = clean_text(df_clean, column_to_check)
    # Report how many rows still hold non-ASCII characters in this column,
    # and show them when there are any.
    result = get_non_ascii_strings(df_clean, column_to_check)
    print(f"Non-ASCII strings in '{column_to_check}':")
    print(len(result))
    if len(result) > 0:
        display(result)

# Put the identifier column back in front of the cleaned columns.
df_clean = pd.concat([df_index, df_clean], axis = 1)

In [None]:
# Rows count as duplicates when every column except 'id' matches; the first
# occurrence is kept and the index is renumbered before saving.
df_clean = (
    df_clean
    .drop_duplicates(subset=[column for column in df_clean.columns if column != 'id'])
    .reset_index(drop=True)
)
df_clean.to_csv("data/new_sg_companies_reviews_clean_UID.csv", index=False)

### Example

In [None]:
# One synthetic review exercising emoji, bullets, line breaks and shouting.
test_df = pd.DataFrame({
    "Review Title": ["this company is so bad. STAY AWAY 🤬"],
    "Job Title": ["Example job title"],
    "Job Details": ["Example job details"],
    "Pros": ["the pros are: \n- there are none"],
    "Cons": ["cons: \n• it pays poorly \n• the benefits are 💩 \n• MANAGEMENT SUCKS!!!"]
})

# Clean a copy so the untouched frame stays available for the comparison.
text_columns = ['Review Title', 'Pros', 'Cons']
test_df_clean = clean_yj_style(test_df.copy())
for column_to_check in text_columns:
    test_df_clean = clean_text(test_df_clean, column_to_check)

# Print the raw value next to the cleaned value for each text column.
banner = "=========================="
before_row = test_df.iloc[0]
after_row = test_df_clean.iloc[0]
print("\n############### Before -> after comparison ###############\n")
for column_to_check in text_columns:
    print(banner, column_to_check, banner, sep="\n")
    print(before_row[column_to_check], "->", after_row[column_to_check])
    print("\n")


############### Before -> after comparison ###############

Review Title
this company is so bad. STAY AWAY 🤬 -> this company is so bad stay away face_with_symbols_on_mouth_emoji


Pros
the pros are: 
- there are none -> the pros are there are none


Cons
cons: 
• it pays poorly 
• the benefits are 💩 
• MANAGEMENT SUCKS!!! -> cons it pays poorly the benefits are pile_of_poo_emoji management sucks


