In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [50]:
# Load the raw company-reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/new_sg_companies_reviews.csv")

In [51]:
# Count missing values per column in the raw frame.
df.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title       109
Job Title            8
Job Details          0
Location          5772
Pros                 0
Cons                 0
dtype: int64

In [52]:
# (rows, columns) of the raw frame.
df.shape


(20035, 9)

In [53]:
# Display the raw frame for a visual sanity check.
# NOTE(review): "Review Date" shows up in at least two text formats in the rendered
# rows ("12-Feb-24" and "May 26, 2023"), which is why the date clean-up is multi-pass.
df

Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
0,Amazon,5.0,12-Feb-24,"Good pay, hard work",Area Manager II,"Former Employee, more than 3 years","Romulus, MI","Pay, autonomy, plenty of benefits",It was hard work in a warehouse. Many days of ...
1,Amazon,5.0,10-Jan-16,You Get What You Put In,Anonymous Employee,Current Employee,"Seattle, WA","Really smart people, a lot of opportunity for ...",You have to be self motivated. NO ONE will hol...
2,Amazon,3.0,20-Feb-16,"Exciting Work, Abusive Culture",Senior Engineering Manager,"Current Employee, more than 8 years","Seattle, WA","Jeff Bezos and his ""S-Team"" are brilliant and ...","The management process is abusive, and I'm cur..."
3,Amazon,4.0,17-Dec-13,"Can be amazing for some people, horrible for o...",Software Development Manager,"Current Employee, more than 3 years","Seattle, WA",Amazon is doing lot's of cool stuff...but lots...,- You're responsible for your own career progr...
4,Amazon,5.0,23-Feb-18,An Amazing Place to Work,Software Development Manager,"Current Employee, less than 1 year","Phoenix, AZ","I've been at Amazon for a month now, and I've ...","No cons, so far - seriously. Like I said, I'm ..."
...,...,...,...,...,...,...,...,...,...
20030,Informatica,2.0,"May 26, 2023",Not exactly,Data Scientist,"Former Employee, less than 1 year","Austin, TX","Great Office, location, and some really good p...","Micro managing , convenient memory loss , mana..."
20031,Informatica,1.0,"Feb 25, 2022",Micro managers R US (and we've got spreadsheet...,Developer,"Former Employee, more than 3 years",,"It's a funny company, Informatica. They like t...",The biggest con? They do not keep money promis...
20032,Informatica,3.0,"May 15, 2023",easy job but alot of changes,Business Development Manager,"Current Employee, more than 1 year","Austin, TX",great people good pay free trips,way to many business changes it is hard to kee...
20033,Informatica,5.0,"Sep 30, 2022",Great place to work,Director,"Current Employee, less than 1 year",,The company has a great cuture and the nice th...,Big portfolio of products so there is a lot to...


In [54]:
def format_date(x, input_format = "%d-%b-%y", ignore_errors = False):
    """Parse a date string with ``datetime.strptime``.

    Values that are already ``datetime`` instances are returned unchanged, so the
    function can be applied to the same column once per candidate format without
    failing on the values an earlier pass has already converted.

    Args:
        x: Value to parse; normally a string such as ``"12-Feb-24"``.
        input_format: ``strptime`` format the string is expected to follow.
        ignore_errors: When True, a value that cannot be parsed is returned
            unchanged instead of raising.

    Returns:
        The parsed ``datetime``; or ``x`` itself when it was already a
        ``datetime`` or when parsing failed and ``ignore_errors`` is True.

    Raises:
        Exception: Whatever ``strptime`` raised (``ValueError`` for a format
            mismatch, ``TypeError`` for a non-string) when ``ignore_errors``
            is False.
    """
    if isinstance(x, datetime):
        return x
    try:
        return datetime.strptime(x, input_format)
    except Exception:
        if ignore_errors:
            return x
        # Bare raise keeps the original traceback intact.
        raise
    
def format_non_conventional_dates(x):
    """Convert a numeric day count into a date, leaving real datetimes alone.

    The number is interpreted as days elapsed since 1899-12-30 (presumably
    spreadsheet-style serial dates; confirm against the data source).

    Args:
        x: A ``datetime`` (returned as is), a float, or anything ``float()``
            can convert (for example the string ``"45000"``).

    Returns:
        ``x`` when it is already a ``datetime``; the computed date when ``x``
        is numeric; otherwise ``x`` unchanged after printing a notice.
    """
    if isinstance(x, datetime):
        return x
    if not isinstance(x, float):
        try:
            x = float(x)
        # Only conversion failures are expected here; a bare except would also
        # swallow KeyboardInterrupt / SystemExit during a long apply().
        except (TypeError, ValueError, OverflowError):
            print(f"{x} unknown date type")
            return x
    return pd.Timedelta(x, unit='d') + datetime(1899, 12, 30)

def clean_df_dates(df, date_col, **args):
    """Normalise a mixed-format date column, one parsing strategy per pass.

    The column is rewritten on ``df`` itself, and ``df`` is also returned.

    Args:
        df: Frame holding the column to convert.
        date_col: Name of the date column.
        **args: Forwarded to every ``format_date`` call. ``format_date`` raises
            on a string it cannot parse unless ``ignore_errors=True`` is
            forwarded, so callers with more than one date format need that flag.

    Returns:
        The same ``df`` object with ``date_col`` converted.
    """
    known_formats = ("%d-%b-%y", "%b %d, %Y")
    for fmt in known_formats:
        # fmt is bound as a default argument so each pass uses its own format.
        df[date_col] = df[date_col].apply(
            lambda raw, fmt=fmt: format_date(raw, input_format = fmt, **args)
        )
    # Whatever is still unparsed is handed to the numeric day-count fallback.
    df[date_col] = df[date_col].apply(format_non_conventional_dates)
    return df

# ignore_errors=True is required here: the column mixes date formats, and format_date
# raises on the first value that does not match the format being tried otherwise.
df = clean_df_dates(df, "Review Date", ignore_errors = True)

In [55]:
def clean_yj_style(df):
    """Normalise sentinel values, bullets and line breaks in a reviews frame.

    The frame is modified in place and also returned. Steps, in order:

    1. Cells that are exactly ``'#NAME?'`` or ``''`` become ``pd.NA``.
    2. A line break followed directly by a list marker (``-``, ``*`` or the
       bullet U+2022) becomes a single space.
    3. Remaining bullets are removed and the right single quote U+2019 becomes
       a plain apostrophe.
    4. Every remaining ``\\n``, ``\\r`` or ``\\t`` becomes a space.
    5. Leading ``-``, ``*`` and spaces are stripped from ``Pros`` and ``Cons``.

    Args:
        df: Frame to clean; must contain ``Pros`` and ``Cons`` columns.

    Returns:
        The same ``df`` object, cleaned.

    Raises:
        KeyError: If ``Pros`` or ``Cons`` is missing.
    """
    # Whole-cell sentinels (no regex): formula-error text and empty strings.
    df.replace('#NAME?', pd.NA, inplace=True)
    df.replace('', pd.NA, inplace=True)

    # Line break + list marker -> one space. This has to run before bullets are
    # removed, otherwise the bullet variant can never match. Using a regex for
    # every variant also covers '\r-' and '\r*', which only matched whole cells before.
    df.replace(r'(?:\r\n|\n\r|\n|\r)[-*\u2022]', ' ', regex=True, inplace=True)

    df.replace('\u2022', '', regex=True, inplace=True)   # stray bullets
    df.replace('\u2019', "'", regex=True, inplace=True)  # curly apostrophe -> '
    df.replace(r'[\n\r\t]', ' ', regex=True, inplace=True)  # leftover control whitespace

    # Strip list markers left at the very start of the free-text fields; doing
    # both markers in one pass also handles mixed prefixes such as "* - text".
    for col in ('Pros', 'Cons'):
        df[col] = df[col].map(lambda x: x.lstrip('-* ') if isinstance(x, str) else x)
    return df

In [56]:
from cleantext import clean
import emoji
import re

def convert_emoji_to_text(emoji_text):
    """Replace emoji in a string with their text names, best effort.

    ``emoji.demojize`` is called with an empty prefix and ``"_emoji "`` as the
    suffix delimiter, so each emoji becomes ``<name>_emoji `` in the result.

    Args:
        emoji_text: Text to convert. Non-string values (for example missing
            values) are returned unchanged.

    Returns:
        The converted string, or the input unchanged when it is not a string
        or the conversion fails.
    """
    if not isinstance(emoji_text, str):
        return emoji_text
    try:
        return emoji.demojize(emoji_text, delimiters=("", "_emoji "))
    # Still best effort, but no longer swallows KeyboardInterrupt / SystemExit.
    except Exception:
        return emoji_text

def is_ascii(s):
    """Return True when every character of the string ``s`` is 7-bit ASCII.

    Encoding straight to ASCII also covers lone surrogate code points, which
    previously raised an uncaught ``UnicodeEncodeError`` from the UTF-8 step.

    Args:
        s: String to check.

    Returns:
        True if ``s`` contains only ASCII characters (the empty string
        included), False otherwise.
    """
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

def get_non_ascii_strings(df, col):
    """Return the rows of ``df`` whose ``col`` value, as text, is not pure ASCII.

    Args:
        df: Frame to inspect.
        col: Name of the column to check.

    Returns:
        The subset of ``df`` rows flagged as containing non-ASCII text.
    """
    def _has_non_ascii(value):
        # Stringify first so non-string cells can be checked as well.
        return not is_ascii(str(value))

    flagged_rows = df[col].apply(_has_non_ascii)
    return df[flagged_rows]

def clean_text(df, col):
    """Return a copy of ``df`` with the text in ``col`` normalised.

    The column goes through four passes, in this order: cleantext's ``clean``
    (called with ``no_emoji=False, no_punct=True``), emoji-to-text conversion,
    whitespace trimming, and collapsing runs of spaces into one. Non-string
    cells skip the string-only passes.

    Args:
        df: Source frame; it is not modified.
        col: Name of the text column to normalise.

    Returns:
        A new frame with ``col`` rewritten.
    """
    multi_space = re.compile(r' {2,}')
    passes = (
        lambda v: clean(v, no_emoji=False, no_punct=True) if isinstance(v, str) else v,
        convert_emoji_to_text,
        lambda v: v.strip() if isinstance(v, str) else v,
        lambda v: multi_space.sub(' ', v) if isinstance(v, str) else v,
    )

    cleaned = df.copy()
    for step in passes:
        cleaned[col] = cleaned[col].apply(step)
    return cleaned

text_columns = ['Review Title', 'Job Title', 'Job Details', 'Pros', 'Cons']

# clean_yj_style edits the frame it is given, so pass a copy: `df` then keeps the
# pre-cleaning text for before/after comparisons and this cell can be re-run safely.
df_clean = clean_yj_style(df.copy())

# Normalise each text column, then report any rows that still hold non-ASCII text.
for column_to_check in text_columns:
    df_clean = clean_text(df_clean, column_to_check)
    result = get_non_ascii_strings(df_clean, column_to_check)
    print(f"Non-ASCII strings in '{column_to_check}':")
    print(len(result))
    if len(result) > 0:
        display(result)


Non-ASCII strings in 'Review Title':
0
Non-ASCII strings in 'Job Title':
0
Non-ASCII strings in 'Job Details':
0
Non-ASCII strings in 'Pros':
0
Non-ASCII strings in 'Cons':
0


In [58]:
# Show the first review title before and after cleaning, one item per line.
print(
    df.iloc[0]['Review Title'],
    "==========================",
    "After cleaning",
    "==========================",
    df_clean.iloc[0]['Review Title'],
    sep="\n",
)

Good pay, hard work
After cleaning
good pay hard work


In [57]:
# Confirm column types after cleaning; "Review Date" should now be a datetime column.
df_clean.dtypes

Company Name              object
Overall Rating           float64
Review Date       datetime64[ns]
Review Title              object
Job Title                 object
Job Details               object
Location                  object
Pros                      object
Cons                      object
dtype: object

In [61]:
# Remove rows that are exact duplicates across all columns and renumber the
# remaining rows from 0, then show the resulting (rows, columns).
df_clean = df_clean.drop_duplicates(ignore_index=True)
df_clean.shape

(18969, 9)

In [65]:
# Count missing values per column after cleaning and de-duplication.
# NOTE(review): "Pros" and "Cons" had no missing values in the raw frame but do here;
# clean_yj_style turning '' and '#NAME?' cells into NA is one visible source. Confirm
# whether the text-cleaning passes also contribute.
df_clean.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title       101
Job Title            7
Job Details          0
Location          5532
Pros               362
Cons               281
dtype: int64

In [64]:
# Persist the cleaned reviews; index=False leaves the row index out of the file.
# Dates are written as text, so a later read_csv needs parse_dates to get datetimes back.
df_clean.to_csv("data/new_sg_companies_reviews_clean.csv", index = False)

## Indexing classes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time

In [2]:
# Load the cleaned reviews used to build the retrieval index.
# NOTE(review): this reads "final_sg_companies_reviews_clean.csv", while the cleaning
# section writes "new_sg_companies_reviews_clean.csv"; confirm this is the intended file.
df = pd.read_csv("data/final_sg_companies_reviews_clean.csv")

In [25]:
from retrievers import BaseRetriever
from enums import EncoderType, IndexerType
# encoder_type = EncoderType("TermFrequencyEncoder")
encoder_type = EncoderType("SentenceTransformerEncoder")
indexer_type = IndexerType("LeaderIndexer")
# One corpus entry per row of `df`, in row order.
# NOTE(review): missing titles would reach the encoder as NaN; confirm it copes.
corpus = df['Review Title'].to_numpy()

# perf_counter is monotonic and high resolution, so the measured duration cannot be
# skewed by system clock adjustments the way time.time() differences can.
start_indexing = time.perf_counter()
retriever = BaseRetriever(encoder_type, indexer_type, corpus, encoder_kwargs={}, indexer_kwargs={"n_clusters": 1, 'use_pca': False, 'min_explained_var': 0.7})
end_indexing = time.perf_counter()
print(f"time taken for indexing: {end_indexing - start_indexing}s")

time taken for indexing: 42.09081482887268s


In [30]:
# Sample queries; only q3 is run below. Swap the argument to try the others.
q1 = "I want to see good MONEY, CULTURE AND WORK LIFE BALANCE"
q2 = "career progression???"
q3 = "the worst companies to work at"

# perf_counter is the monotonic clock intended for timing short operations like this one.
start_retrieving = time.perf_counter()
results = retriever.retrieve_results(q3)
end_retrieving = time.perf_counter()
print(f"time taken for retrieval: {end_retrieving - start_retrieving}s")

time taken for retrieval: 0.04564237594604492s


In [31]:
# Look up the retrieved reviews by position.
# NOTE(review): assumes `results` holds row positions into `df`, in ranked order; confirm against the retriever.
df.iloc[results]

Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
14577,FUJIFILM Business Innovation,1.0,2022-10-16,worst company to work in,account manager,former employee,,nothing petrol card is convenient other than t...,management has tons of reports that are actual...
9575,Pactera,1.0,2017-08-24,one of the worst companies to work for,anonymous employee,former employee,,nothing i can think of,one of the worst companies to work for spent m...
16348,OpenText,1.0,2024-01-25,worst company to work for,anonymous,current employee more than 8 years,,there are no positive things about working her...,cheapest company ive ever worked for health be...
16802,Coforge,1.0,2023-02-08,worst company to work for,talent acquisition,current employee more than 5 years,,it used to be a employee friendly company but ...,management look for numbers rather then employ...
16425,Hexaware Technologies,1.0,2023-03-29,worst company to work,sr software engineer,current employee less than 1 year,"Dallas, TX",there are no pros especially if you are in us ...,gender discrimination and there is no one in f...
...,...,...,...,...,...,...,...,...,...
3824,FDM Group,4.0,2023-12-20,great support,consultant,current employee more than 3 years,"Austin, TX",there are several paths you can take once you ...,there are a lot of candidates that are trying ...
16114,BMC Software,5.0,2023-11-14,great support,sre engineer,current employee less than 1 year,Tel Aviv-Yafo,as a new employee i can say that bmc is very s...,so far none that i can see
3418,Seagate Technology,5.0,2024-01-15,pleased,manager,current employee more than 10 years,"Oklahoma City, OK",company is transparent and cares about employees,i do not have any cons
15468,Orange Business,5.0,2024-02-14,good compony,customer service representative,current employee,"Cairo, Cairo Governorate",the good environment and helpful people,the community the good environment and helpful...
