In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [49]:
# Load the company-review CSV into a DataFrame.
df = pd.read_csv("sg_companies_reviews.csv")

In [50]:
# Count the missing values in each column.
df.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title        66
Job Title            5
Job Details          0
Location          2763
Pros                 0
Cons                 0
dtype: int64

In [51]:
# Display the loaded frame to inspect the column values.
df

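## Dates such as 45215 are Excel serial day numbers: convert them by counting days from the Excel epoch (1899-12-30, the effective origin of the 1900 date system)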


Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
0,Amazon,5,12-Feb-24,"Good pay, hard work",Area Manager II,"Former Employee, more than 3 years","Romulus, MI","Pay, autonomy, plenty of benefits",It was hard work in a warehouse. Many days of ...
1,Amazon,5,10-Jan-16,You Get What You Put In,Anonymous Employee,Current Employee,"Seattle, WA","Really smart people, a lot of opportunity for ...",You have to be self motivated. NO ONE will hol...
2,Amazon,3,20-Feb-16,"Exciting Work, Abusive Culture",Senior Engineering Manager,"Current Employee, more than 8 years","Seattle, WA","Jeff Bezos and his ""S-Team"" are brilliant and ...","The management process is abusive, and I'm cur..."
3,Amazon,4,17-Dec-13,"Can be amazing for some people, horrible for o...",Software Development Manager,"Current Employee, more than 3 years","Seattle, WA",Amazon is doing lot's of cool stuff...but lots...,- You're responsible for your own career progr...
4,Amazon,5,23-Feb-18,An Amazing Place to Work,Software Development Manager,"Current Employee, less than 1 year","Phoenix, AZ","I've been at Amazon for a month now, and I've ...","No cons, so far - seriously. Like I said, I'm ..."
...,...,...,...,...,...,...,...,...,...
10030,NCR,4,45215,"Good, but could be better",Customer Engineer,"Former Employee, more than 1 year","Miami, FL","Management was good, solid training, opportuni...","Inconsistent schedule, growth within the compa..."
10031,NCR,5,45222,Great starting position,Sr. Software Support Engineer,Former Employee,"Atlanta, GA",Learn a lot Develop skills,There was a lot of overtime
10032,NCR,4,45143,"Great Company, on call rotation",ATM Technician,"Former Employee, more than 1 year","Springfield, MO","Great Company. Provided a vehicle, and covered...","Lots of travel, and rotating ""On call"". Very l..."
10033,NCR,1,45282,Na,Analyst,"Former Employee, more than 3 years","Atlanta, GA",Located in midtown and nice building,"No pay increase, revolving door of managers, a..."


In [52]:
def format_date(x, input_format = "%d-%b-%y", ignore_errors = False):
    """Parse a date string into a ``datetime``.

    Args:
        x: Value to parse; expected to be a string matching ``input_format``.
        input_format: ``strptime`` format string. The default matches values
            such as ``12-Feb-24``.
        ignore_errors: When True, a value that cannot be parsed is handed back
            unchanged instead of raising.

    Returns:
        The parsed ``datetime``, or ``x`` itself when parsing failed and
        ``ignore_errors`` is True.

    Raises:
        Exception: Whatever ``datetime.strptime`` raised, when parsing fails
            and ``ignore_errors`` is False.
    """
    try:
        parsed = datetime.strptime(x, input_format)
    except Exception:
        if not ignore_errors:
            # Re-raise the original parsing error untouched.
            raise
        return x
    return parsed
    
def format_non_conventional_dates(x):
    """Convert an Excel serial day number into a timestamp.

    Values that are already ``datetime`` instances are returned untouched.
    Anything else is coerced to ``float`` and interpreted as a number of days
    since 1899-12-30.

    Args:
        x: A ``datetime``, a number, or a numeric string such as ``"45215"``.

    Returns:
        ``x`` itself when it is already a ``datetime``; otherwise the result of
        adding ``x`` days to 1899-12-30. A value that cannot be converted to
        ``float`` is reported on stdout and returned unchanged.
    """
    if isinstance(x, datetime):
        return x
    if not isinstance(x, float):
        try:
            x = float(x)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; a bare ``except``
            # would also swallow KeyboardInterrupt / SystemExit.
            print(f"{x} unknown date type")
            return x
    # 1899-12-30 (not 1900-01-01) is the origin: Excel counts 1 Jan 1900 as
    # day 1 and also counts a non-existent 29 Feb 1900.
    return pd.Timedelta(x, unit='d') + datetime(1899, 12, 30)

def clean_df_dates(df, date_col, **args):
    """Normalise one date column of ``df`` in two passes.

    The first pass runs ``format_date`` on every value (forwarding ``args``);
    the second pass runs ``format_non_conventional_dates`` on the outcome.

    Args:
        df: Frame whose ``date_col`` column is overwritten.
        date_col: Name of the column holding the dates.
        **args: Keyword arguments forwarded to ``format_date``.

    Returns:
        The same ``df`` object, with ``date_col`` replaced.
    """
    def _parse_text_date(value):
        return format_date(value, **args)

    first_pass = df[date_col].apply(_parse_text_date)
    df[date_col] = first_pass.apply(format_non_conventional_dates)
    return df

# ignore_errors=True makes the first pass leave values that do not match "%d-%b-%y"
# untouched, so the second pass can interpret them as Excel serial day numbers.
df = clean_df_dates(df, "Review Date", ignore_errors = True)

In [53]:
def clean_yj_style(df):
    """Strip spreadsheet artefacts, bullet markers and line breaks from ``df``.

    Steps, in order:

    1. Cells equal to ``'#NAME?'`` or ``''`` become ``pd.NA``.
    2. The typographic apostrophe (U+2019) becomes ``'``.
    3. A line break (``\\r\\n``, ``\\n\\r``, ``\\n`` or ``\\r``) directly followed
       by a list marker (``-``, ``*`` or U+2022) becomes a single space.
    4. Any remaining U+2022 bullet is removed.
    5. Remaining ``\\n``, ``\\r`` and ``\\t`` characters each become a space.
    6. Leading ``-``, ``*`` and space characters are stripped from the
       ``Pros`` and ``Cons`` columns.

    Args:
        df: Frame to clean. It must contain ``Pros`` and ``Cons`` columns.

    Returns:
        The same ``df`` object; the cleaning is applied in place.
    """
    # Exact-match placeholders for "no value".
    for placeholder in ('#NAME?', ''):
        df.replace(placeholder, pd.NA, inplace=True)

    # Ordered (pattern, replacement) pairs, all applied as regular expressions.
    # "Line break + marker" runs before bullets are removed, otherwise the
    # bullet variant of that rule could never match.
    substitutions = [
        ('\u2019', "'"),
        ('(?:\r\n|\n\r|\n|\r)[-*\u2022]', ' '),
        ('\u2022', ''),
        ('[\n\r\t]', ' '),
    ]
    for pattern, replacement in substitutions:
        df.replace(pattern, replacement, regex=True, inplace=True)

    # A single lstrip over the full marker set also handles mixed prefixes
    # such as "* - text".
    for col in ('Pros', 'Cons'):
        df[col] = df[col].map(lambda x: x.lstrip('-* ') if isinstance(x, str) else x)
    return df

In [54]:
from cleantext import clean
import emoji
import re

def convert_emoji_to_text(emoji_text):
    """Replace emoji in a string with their textual alias.

    Each emoji is rewritten by ``emoji.demojize`` using an empty opening
    delimiter and ``"_emoji "`` as the closing delimiter.

    Args:
        emoji_text: Text to convert. Non-string values (for example missing
            values) are returned unchanged.

    Returns:
        The converted string, or ``emoji_text`` itself when it is not a string
        or the conversion fails.
    """
    if not isinstance(emoji_text, str):
        return emoji_text
    try:
        return emoji.demojize(emoji_text, delimiters=("", "_emoji "))
    except Exception:
        # Best effort: keep the original text rather than abort the cleaning
        # run. Unlike a bare ``except``, KeyboardInterrupt still propagates.
        return emoji_text

def is_ascii(s):
    """Tell whether the string ``s`` consists solely of ASCII characters.

    Args:
        s: The string to inspect.

    Returns:
        True when every character of ``s`` is ASCII (the empty string
        included), False otherwise.
    """
    try:
        # Encoding straight to ASCII also rejects lone surrogates, which made
        # the previous UTF-8 round trip raise an uncaught UnicodeEncodeError.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

def get_non_ascii_strings(df, col):
    """Return the rows of ``df`` whose ``col`` value contains non-ASCII text.

    Every value is converted with ``str`` before being checked by ``is_ascii``.

    Args:
        df: Frame to filter.
        col: Name of the column to inspect.

    Returns:
        The subset of ``df`` where the check found non-ASCII characters.
    """
    def _has_non_ascii(value):
        return not is_ascii(str(value))

    return df[df[col].apply(_has_non_ascii)]

def clean_text(df, col):
    """Return a copy of ``df`` with the text in column ``col`` normalised.

    For each value, in order:

    * strings go through ``clean`` with ``no_emoji=False, no_punct=True``;
    * the value goes through ``convert_emoji_to_text``;
    * strings are stripped of surrounding whitespace and runs of two or more
      spaces are collapsed to one.

    Non-string values skip the string-only steps.

    Args:
        df: Source frame; it is not modified.
        col: Name of the column to normalise.

    Returns:
        A new frame with ``col`` replaced by the normalised values.
    """
    def _normalise(value):
        if isinstance(value, str):
            value = clean(value, no_emoji=False, no_punct=True)
        value = convert_emoji_to_text(value)
        if isinstance(value, str):
            value = re.sub(r' {2,}', ' ', value.strip())
        return value

    cleaned = df.copy()
    cleaned[col] = cleaned[col].apply(_normalise)
    return cleaned

# Free-text columns to normalise and then audit for leftover non-ASCII characters.
text_columns = ['Review Title', 'Job Title', 'Job Details', 'Pros', 'Cons']
# clean_yj_style edits the frame it receives, so pass a copy: `df` stays as it was
# before text cleaning and re-running this cell starts from the same input.
df_clean = clean_yj_style(df.copy())
for column_to_check in text_columns:
    df_clean = clean_text(df_clean, column_to_check)
    result = get_non_ascii_strings(df_clean, column_to_check)
    print(f"Non-ASCII strings in '{column_to_check}':")
    print(len(result))
    # Only render the offending rows when there are any.
    if not result.empty:
        display(result)


Non-ASCII strings in 'Review Title':
0
Non-ASCII strings in 'Job Title':
0
Non-ASCII strings in 'Job Details':
0
Non-ASCII strings in 'Pros':
0
Non-ASCII strings in 'Cons':
0


In [55]:
# Check the column dtypes after cleaning.
df_clean.dtypes

Company Name              object
Overall Rating             int64
Review Date       datetime64[ns]
Review Title              object
Job Title                 object
Job Details               object
Location                  object
Pros                      object
Cons                      object
dtype: object

In [56]:
# Spot-check one row: print its review title from `df`, then from `df_clean`.
print(df.iloc[4088]['Review Title'])
print("==========================")
print("After cleaning")
print("==========================")
print(df_clean.iloc[4088]['Review Title'])

💩Jaipur sitapura uper management
After cleaning
pile_of_poo_emoji jaipur sitapura uper management


In [57]:
# Count the missing values per column after cleaning.
df_clean.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title        66
Job Title            5
Job Details          0
Location          2763
Pros               409
Cons               311
dtype: int64

In [59]:
# Write the cleaned frame to CSV, leaving out the index.
df_clean.to_csv("sg_companies_reviews_clean.csv", index = False)

## Indexing classes

In [1]:
from indexers import LeaderIndexer
from encoders import TermFrequencyEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Load the cleaned reviews.
# NOTE(review): the cleaning step writes "sg_companies_reviews_clean.csv" without the
# "data/" prefix — confirm the file is expected to be moved into data/ beforehand.
df = pd.read_csv("data/sg_companies_reviews_clean.csv")

In [24]:
from retrievers import BaseRetriever
from enums import *
# Pick the encoder and indexer implementations by name through the project enums.
encoder_type = EncoderType("TermFrequencyEncoder")
indexer_type = IndexerType("LeaderIndexer")

# Drop rows without a review title and renumber the index from 0.
df = df.dropna(subset="Review Title").reset_index(drop=True)
# NOTE(review): this second dropna() looks redundant after the line above.
corpus = df['Review Title'].dropna().to_numpy()
query = "bad abuse terrible horrible trash"
# NOTE(review): n_clusters=1 presumably places the whole corpus in one cluster — confirm against LeaderIndexer.
retriever = BaseRetriever(encoder_type, indexer_type, corpus, encoder_kwargs={}, indexer_kwargs={"n_clusters": 1})
results = retriever.retrieve_results(query)

In [25]:
# Show the retrieved reviews; presumably `results` holds positional indices into `corpus` — TODO confirm.
df.iloc[results]

Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
7782,Lumen,1,2024-02-19,trash,sales support associate,current employee,,not a single pro here,they canned everyone in my position
7358,Agoda,1,2024-01-16,horrible,i rather not say,current employee,,some people are nice thailand is a beautiful c...,toxic environment bad leadership people who sh...
5186,Expedia Group,1,2023-11-19,horrible place,product manager,current employee more than 3 years,"Seattle, WA",work life balance is good,politics cross division fights
3987,Tech Mahindra,1,2023-12-21,terrible,associate software engineer,former employee more than 1 year,"Dallas, TX",none place was not good whatsoever,almost no training bad management
8915,Tripadvisor,1,2022-10-03,terrible,business development representative bdr,current employee,"Columbia, SC",there are none terrible job dont do it,low pay tons of calls product is useless makes...
...,...,...,...,...,...,...,...,...,...
6580,Synapxe,5,2023-07-12,great work satisfaction,senior systems analyst,former employee more than 5 years,"Serangoon,",having to take care of a national healthcare s...,career stagnant as a new hire is often the choice
6579,Synapxe,4,2023-07-16,good company to work with provided in the righ...,lead engineer,current employee more than 3 years,"Serangoon,",focus on more management rather than technical...,doesnt have a clear technical career track
6578,Synapxe,3,2023-06-28,not a place to stay for long,application analyst,current employee,"Serangoon,",,
6577,Synapxe,4,2023-07-24,great opportunity for fresh grads but toxic en...,systems analyst,former employee more than 1 year,Singapore,good stepping stone for fresh grads good learn...,toxic environment skillsets under utilized
