In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [50]:
# Load the company-reviews CSV that the following cells inspect and clean.
df = pd.read_csv("data/new_sg_companies_reviews.csv")

In [51]:
# Number of missing values in each column of the freshly loaded frame.
df.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title       109
Job Title            8
Job Details          0
Location          5772
Pros                 0
Cons                 0
dtype: int64

In [52]:
# (rows, columns) of the loaded frame.
df.shape


(20035, 9)

In [53]:
# Show the frame; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
0,Amazon,5.0,12-Feb-24,"Good pay, hard work",Area Manager II,"Former Employee, more than 3 years","Romulus, MI","Pay, autonomy, plenty of benefits",It was hard work in a warehouse. Many days of ...
1,Amazon,5.0,10-Jan-16,You Get What You Put In,Anonymous Employee,Current Employee,"Seattle, WA","Really smart people, a lot of opportunity for ...",You have to be self motivated. NO ONE will hol...
2,Amazon,3.0,20-Feb-16,"Exciting Work, Abusive Culture",Senior Engineering Manager,"Current Employee, more than 8 years","Seattle, WA","Jeff Bezos and his ""S-Team"" are brilliant and ...","The management process is abusive, and I'm cur..."
3,Amazon,4.0,17-Dec-13,"Can be amazing for some people, horrible for o...",Software Development Manager,"Current Employee, more than 3 years","Seattle, WA",Amazon is doing lot's of cool stuff...but lots...,- You're responsible for your own career progr...
4,Amazon,5.0,23-Feb-18,An Amazing Place to Work,Software Development Manager,"Current Employee, less than 1 year","Phoenix, AZ","I've been at Amazon for a month now, and I've ...","No cons, so far - seriously. Like I said, I'm ..."
...,...,...,...,...,...,...,...,...,...
20030,Informatica,2.0,"May 26, 2023",Not exactly,Data Scientist,"Former Employee, less than 1 year","Austin, TX","Great Office, location, and some really good p...","Micro managing , convenient memory loss , mana..."
20031,Informatica,1.0,"Feb 25, 2022",Micro managers R US (and we've got spreadsheet...,Developer,"Former Employee, more than 3 years",,"It's a funny company, Informatica. They like t...",The biggest con? They do not keep money promis...
20032,Informatica,3.0,"May 15, 2023",easy job but alot of changes,Business Development Manager,"Current Employee, more than 1 year","Austin, TX",great people good pay free trips,way to many business changes it is hard to kee...
20033,Informatica,5.0,"Sep 30, 2022",Great place to work,Director,"Current Employee, less than 1 year",,The company has a great cuture and the nice th...,Big portfolio of products so there is a lot to...


In [54]:
def format_date(x, input_format = "%d-%b-%y", ignore_errors = False):
    """Parse ``x`` into a ``datetime`` with ``datetime.strptime``.

    Parameters
    ----------
    x
        Value to parse. Anything ``strptime`` rejects (wrong layout, or a
        non-string such as a float or an already-parsed datetime) counts as a
        failure.
    input_format : str
        ``strptime`` format string describing the expected layout.
    ignore_errors : bool
        When True, a value that cannot be parsed is handed back unchanged.
        When False, the parsing exception propagates to the caller.

    Returns
    -------
    datetime on success, or the original ``x`` when parsing failed and
    ``ignore_errors`` is True.
    """
    try:
        parsed = datetime.strptime(x, input_format)
    except Exception:
        if not ignore_errors:
            raise
        return x
    return parsed
    
def format_non_conventional_dates(x):
    """Turn a numeric day count into a timestamp; pass datetimes through.

    Values that are already ``datetime`` instances are returned untouched.
    Anything else is coerced with ``float()`` and interpreted as a number of
    days after 1899-12-30 (the day-zero used by spreadsheet serial dates --
    presumably where such values in the data come from; confirm against the
    source file).

    Parameters
    ----------
    x
        A datetime, a float, or something ``float()`` can convert.

    Returns
    -------
    The datetime unchanged, or ``pd.Timedelta(days) + datetime(1899, 12, 30)``
    for numeric input. If ``x`` cannot be converted to a float, a message is
    printed and ``x`` is returned unchanged.
    """
    if isinstance(x, datetime):
        return x
    if not isinstance(x, float):
        try:
            x = float(x)
        # Only conversion failures are treated as "unknown date"; a bare
        # except here would also hide KeyboardInterrupt / SystemExit.
        except (TypeError, ValueError, OverflowError):
            print(f"{x} unknown date type")
            return x
    return pd.Timedelta(x, unit='d') + datetime(1899, 12, 30)

def clean_df_dates(df, date_col, **args):
    """Parse the mixed-format date column ``date_col`` of ``df`` in place.

    Each value is tried against the "%d-%b-%y" layout and then the
    "%b %d, %Y" layout; whatever is still unparsed afterwards is handed to
    ``format_non_conventional_dates``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten on this object.
    date_col : str
        Name of the column holding the dates.
    **args
        Forwarded to ``format_date`` (in practice ``ignore_errors``). With
        ``ignore_errors=True`` unparsable values fall through to the numeric
        fallback; without it, a value that matches neither layout raises the
        exception produced by the last layout attempted.

    Returns
    -------
    The same ``df`` object, for call chaining.
    """
    *early_formats, final_format = ("%d-%b-%y", "%b %d, %Y")

    def _parse(value):
        # Already-parsed values must be skipped explicitly: feeding a datetime
        # back into strptime raises TypeError, which previously made the
        # strict (ignore_errors=False) mode fail on every non-empty column.
        if isinstance(value, datetime):
            return value
        for fmt in early_formats:
            parsed = format_date(value, input_format = fmt, ignore_errors = True)
            if isinstance(parsed, datetime):
                return parsed
        # Only the last attempt honours the caller's error policy, so strict
        # mode raises only when no layout matched.
        return format_date(value, input_format = final_format, **args)

    df[date_col] = df[date_col].apply(_parse)
    df[date_col] = df[date_col].apply(format_non_conventional_dates)
    return df

# Parse "Review Date". ignore_errors=True makes format_date return values it
# cannot parse unchanged, so they reach the next parsing step instead of raising.
df = clean_df_dates(df, "Review Date", ignore_errors = True)

In [55]:
def clean_yj_style(df):
    """Normalise placeholder cells, list markers and control whitespace in ``df``.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. Cells that are exactly '#NAME?' or '' become ``pd.NA``.
      2. A line break (\\r\\n, \\n\\r, \\n or \\r) immediately followed by a list
         marker ('-', '*' or a bullet) is replaced by a single space.
      3. Remaining bullet characters are removed and right single quotation
         marks (U+2019) become plain apostrophes.
      4. Every remaining newline, carriage return or tab becomes a space.
      5. Leading '-' / ' ' characters, then leading '*' / ' ' characters, are
         stripped from the string values of the 'Pros' and 'Cons' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Pros' and 'Cons' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Whole-cell placeholders -> missing values (exact match, not regex).
    for placeholder in ('#NAME?', ''):
        df.replace(placeholder, pd.NA, inplace=True)

    # All substring rules are regexes written as raw strings, which avoids the
    # invalid-escape warnings of '\-' / '\*' in normal literals. The
    # line-break + marker rule runs before bare bullets are stripped so that
    # its bullet alternative can actually match.
    substitutions = (
        (r'(?:\r\n|\n\r|\n|\r)[-*\u2022]', ' '),
        (r'\u2022', ''),
        (r'\u2019', "'"),
        (r'[\n\r\t]', ' '),
    )
    for pattern, replacement in substitutions:
        df.replace(pattern, replacement, regex=True, inplace=True)

    # Non-string cells (e.g. missing values) are left untouched.
    for col in ('Pros', 'Cons'):
        df[col] = df[col].map(
            lambda x: x.lstrip('- ').lstrip('* ') if isinstance(x, str) else x
        )
    return df

In [56]:
from cleantext import clean
import emoji
import re

def convert_emoji_to_text(emoji_text):
    """Replace emoji in a string with their textual aliases.

    Each emoji is rewritten by ``emoji.demojize`` using an empty opening
    delimiter and "_emoji " as the closing delimiter.

    Parameters
    ----------
    emoji_text
        Text to convert. Non-string values (for example missing values in a
        DataFrame column) are returned unchanged.

    Returns
    -------
    The converted string, or the input unchanged when it is not a string or
    the conversion fails.
    """
    # Explicit guard instead of relying on an exception for non-text cells.
    if not isinstance(emoji_text, str):
        return emoji_text
    try:
        return emoji.demojize(emoji_text, delimiters=("", "_emoji "))
    # Stay best-effort, but no longer swallow KeyboardInterrupt / SystemExit.
    except Exception:
        return emoji_text

def is_ascii(s):
    """Return True when every character of ``s`` is an ASCII character.

    The empty string counts as ASCII. ``str.isascii`` is used instead of an
    encode/decode round trip, so strings containing lone surrogates return
    False rather than raising ``UnicodeEncodeError``.
    """
    return s.isascii()

def get_non_ascii_strings(df, col):
    """Return the rows of ``df`` whose ``col`` value contains non-ASCII text.

    Every value is converted with ``str()`` before the check, so non-string
    cells are judged by their string representation.
    """
    def _has_non_ascii(value):
        return not is_ascii(str(value))

    # Boolean Series aligned with df's index: True marks a row to keep.
    rows_to_keep = df[col].apply(_has_non_ascii)
    return df[rows_to_keep]

def clean_text(df, col):
    """Return a copy of ``df`` with the text in column ``col`` normalised.

    For each value, in order:
      1. strings are passed through ``clean`` (``no_emoji=False``,
         ``no_punct=True``);
      2. the result goes through ``convert_emoji_to_text``;
      3. strings are stripped of surrounding whitespace;
      4. runs of two or more spaces in strings collapse to a single space.

    Non-string values skip the string-only steps. The input frame is not
    modified.
    """
    def _normalise(value):
        if isinstance(value, str):
            value = clean(value, no_emoji=False, no_punct=True)
        value = convert_emoji_to_text(value)
        if isinstance(value, str):
            value = re.sub(r' {2,}', ' ', value.strip())
        return value

    cleaned = df.copy()
    cleaned[col] = cleaned[col].apply(_normalise)
    return cleaned

text_columns = ['Review Title', 'Job Title', 'Job Details', 'Pros', 'Cons']
# clean_yj_style edits the frame it receives and returns that same object, so
# give it a copy: `df` then keeps its pre-cleaning text for the before/after
# comparison further down, and re-running this cell starts from the same input.
df_clean = clean_yj_style(df.copy())
for column_to_check in text_columns:
    df_clean = clean_text(df_clean, column_to_check)
    # Sanity check: report any rows that still hold non-ASCII text in this column.
    result = get_non_ascii_strings(df_clean, column_to_check)
    print(f"Non-ASCII strings in '{column_to_check}':")
    print(len(result))
    if len(result) > 0:
        display(result)


Non-ASCII strings in 'Review Title':
0
Non-ASCII strings in 'Job Title':
0
Non-ASCII strings in 'Job Details':
0
Non-ASCII strings in 'Pros':
0
Non-ASCII strings in 'Cons':
0


In [58]:
# Compare the first review title as held in `df` with its counterpart in `df_clean`.
first_row = 0
print(df.iloc[first_row]['Review Title'])
for line in ("==========================", "After cleaning", "=========================="):
    print(line)
print(df_clean.iloc[first_row]['Review Title'])

Good pay, hard work
After cleaning
good pay hard work


In [57]:
# Column dtypes of the cleaned frame.
df_clean.dtypes

Company Name              object
Overall Rating           float64
Review Date       datetime64[ns]
Review Title              object
Job Title                 object
Job Details               object
Location                  object
Pros                      object
Cons                      object
dtype: object

In [61]:
# Remove rows that are identical across every column and renumber the index
# from 0; the last line displays the resulting (rows, columns).
df_clean = df_clean.drop_duplicates().reset_index(drop = True)
df_clean.shape

(18969, 9)

In [65]:
# Number of missing values in each column of the cleaned frame.
df_clean.isna().sum()

Company Name         0
Overall Rating       0
Review Date          0
Review Title       101
Job Title            7
Job Details          0
Location          5532
Pros               362
Cons               281
dtype: int64

In [64]:
# Write the cleaned frame to disk; index = False leaves the row index out of the file.
df_clean.to_csv("data/new_sg_companies_reviews_clean.csv", index = False)

## Indexing classes

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time

In [8]:
# Load the cleaned reviews used for indexing.
# NOTE(review): the cleaning section writes "data/new_sg_companies_reviews_clean.csv",
# while this reads "data/final_sg_companies_reviews_clean.csv" -- confirm how the
# "final" file is produced.
df = pd.read_csv("data/final_sg_companies_reviews_clean.csv")

In [45]:
from retrievers import BaseRetriever
from enums import *
# Alternative encoder kept for quick switching:
# encoder_type = EncoderType("TermFrequencyEncoder")
encoder_type = EncoderType("SentenceTransformerEncoder")
indexer_type = IndexerType("LeaderIndexer")
# NOTE(review): the "Review Title" column had missing values in the cleaning
# section's null counts; confirm the encoder tolerates missing entries in the corpus.
corpus = df['Review Title'].to_numpy()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_indexing = time.perf_counter()
retriever = BaseRetriever(encoder_type, indexer_type, corpus, encoder_kwargs={}, indexer_kwargs={"n_clusters": 1})
end_indexing = time.perf_counter()
print(f"time taken for indexing: {end_indexing - start_indexing}s")

time taken for indexing: 40.112764835357666s


In [48]:
# Two sample queries; only `query_two` is sent to the retriever in this cell.
query = "I want to see good MONEY, CULTURE AND WORK LIFE BALANCE"
query_two = "career progression???"

# perf_counter() is the clock meant for timing short intervals such as this
# lookup; it is monotonic and has the highest available resolution.
start_retrieving = time.perf_counter()
results = retriever.retrieve_results(query_two)
end_retrieving = time.perf_counter()
print(f"time taken for retrieval: {end_retrieving - start_retrieving}s")

time taken for retrieval: 0.08025193214416504s


In [53]:
# Show the reviews selected by the retriever. `results` is passed to .iloc, so
# it is treated as positional row numbers into df (assumed to be ordered by
# relevance -- TODO confirm against BaseRetriever).
df.iloc[results]

Unnamed: 0,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons
11461,eBay,4.0,2024-02-27,career growth,software engineer,former employee,"Austin, TX",benefits and pay great chance to work with tal...,nothing to share about cons
4286,Expedia Group,3.0,2023-06-07,career growth,data specialist,former employee more than 10 years,"Dallas, TX",worked with great people in my formal departme...,no communication or status updates from the hr...
8341,NielsenIQ,1.0,2024-02-02,career growth,data processing analyst,former employee more than 1 year,Chennai,work culture office environment some area,no skill development no benefits no hike perks
9523,CrimsonLogic,3.0,2023-05-19,low career progression,commercial manager,former employee more than 1 year,Singapore,comfortable working environment good colleagues,a lot of products are not working can sell can...
1552,NCS,5.0,2021-11-14,career and growth,infra solution sales specialist,current employee more than 5 years,,ncs has a good career and growth path we will ...,ncs has many employees takes time to understan...
...,...,...,...,...,...,...,...,...,...
8179,Xilinx,5.0,2015-03-04,xilinx solid place to work,anonymous employee,current employee,,xilinx is a cutting edge industry leading orga...,as with any position and company the workload ...
9111,Shopify,2.0,2023-05-25,stultifying yet high pressure,software engineer,former employee less than 1 year,,very strong pay rate still not enough to negat...,the least interesting work ive ever had combin...
8925,Binance,2.0,2023-12-11,custumer support remote,custumer support,former employee more than 1 year,"São Paulo, São Paulo",remote job paid monthly in crypto or use,no team work no trainings toxic work enviroment
7944,Sabre,3.0,2023-12-31,if they like you its great if not it is unplea...,product marketing manager,former employee,"Southlake, TX",great place to learn many kind coworkers 6 wee...,constant reorganizations constant role changes...
