## Indexing classes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time

In [2]:
# Load both review tables in one pass.
#   df_raw - used further down to show the original review fields next to
#            the retrieval scores (joined on 'id')
#   df     - the frame that gets indexed; judging by the file name this is
#            the cleaned version of the same reviews (TODO confirm)
df_raw, df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/new_sg_companies_reviews_UID.csv",
        "data/final_sg_companies_reviews_clean_UID.csv",
    )
)

In [4]:
from retrievers import BaseRetriever
from enums import EncoderType, IndexerType

# Encoder / indexer selection. The commented-out lines are the alternative
# settings that were tried; uncomment one to switch.
# encoder_type = EncoderType("TermFrequencyEncoder")
encoder_type = EncoderType("SentenceTransformerEncoder")
# indexer_type = IndexerType("LeaderIndexer")
indexer_type = IndexerType("SimpleIndexer")

# Extra keyword arguments handed to BaseRetriever for the encoder / indexer.
encoder_kwargs = {}
# indexer_kwargs={"n_clusters": 1, 'use_pca': False, 'min_explained_var': 0.7}
indexer_kwargs = {}

# Text fields that are concatenated (in this order, space-separated) into the
# single 'overall' column that gets indexed.
overall_columns = ['Company Name', 'Job Title', 'Review Title', 'Pros', 'Cons']

# Work on a copy so the loaded `df` stays untouched when this cell is re-run.
df_concat = df.copy()
df_concat['Job Title'] = df_concat['Job Title'].fillna("")
# Vectorised concatenation instead of a row-wise apply/lambda. na_rep="" makes
# a missing value in ANY of the fields contribute an empty string (the
# separator is still inserted), whereas " ".join would raise a TypeError on NaN.
df_concat['overall'] = df_concat[overall_columns[0]].str.cat(
    df_concat[overall_columns[1:]], sep=" ", na_rep=""
)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during this long-running step.
start_indexing = time.perf_counter()
# review_retriever = BaseRetriever(encoder_type, indexer_type, df, 'Review Title', encoder_kwargs=encoder_kwargs, indexer_kwargs=indexer_kwargs)
all_retriever = BaseRetriever(
    encoder_type,
    indexer_type,
    df_concat,
    'overall',
    encoder_kwargs=encoder_kwargs,
    indexer_kwargs=indexer_kwargs,
)
end_indexing = time.perf_counter()
print(f"time taken for indexing: {end_indexing - start_indexing}s")


time taken for indexing: 617.1850764751434s


#### Retrieval results

In [8]:
# Score the reviews against one free-text query, then attach the scores to
# the raw review rows (joined on 'id') and show them best match first.
test_query = "I want to see good MONEY, CULTURE AND WORK LIFE BALANCE"
results = all_retriever.retrieve_results(test_query)

# `results` is loaded as two columns: review id and its score for the query.
result_df = pd.DataFrame(results, columns=['id', 'score'])

ranked_reviews = (
    df_raw
    .merge(result_df, on='id')
    .sort_values('score', ascending=False)
)
display(ranked_reviews)

Unnamed: 0,id,Company Name,Overall Rating,Review Date,Review Title,Job Title,Job Details,Location,Pros,Cons,score
2271,2024-02-15 00:00:00Good place to work2795,PayPal,5.0,2024-02-15,Good place to work,Product Lead,Current Employee,,Good culture and good people,Several org changes in recent years,0.595791
12734,2024-02-07 00:00:00Good work life balance14356,World Wide Technology,4.0,2024-02-07,Good work life balance,Senior Consultant,Current Employee,"New York, NY","Good work life balance. Great culture, great team",Lack of pay range transparency,0.585597
2254,2024-02-13 00:00:00Good2765,DXC Technology,3.0,2024-02-13,Good,Associate,Current Employee,Bengaluru,"Work Life balance, good culture","Limited pay, less holidays, work pressure",0.580260
772,2024-02-13 00:00:00Great company to work for957,Infosys,5.0,2024-02-13,Great company to work for,Front End Developer,"Former Employee, less than 1 year",Pune,Work life balance is amazing. Good pay,Culture should be more appreciative.,0.579926
4996,"2024-01-26 00:00:00Good company, with great cu...",Thoughtworks,4.0,2024-01-26,"Good company, with great culture and work life...",Senior Consultant,"Current Employee, more than 1 year",Pune,"Culture , work life balance","Payscale , service based company",0.578006
...,...,...,...,...,...,...,...,...,...,...,...
10610,2023-03-17 00:00:00Phenomenal12215,Razer,3.0,2023-03-17,Phenomenal,Tech Support Specialist,"Former Employee, more than 3 years","San Diego, CA",Gaming culture through and through,Rumor has it someone was chocked out by an exe...,-0.028070
3065,2023-08-04 00:00:00Got fired for clocking out ...,ByteDance,1.0,2023-08-04,Got fired for clocking out on my app when I left,Food Runner,"Current Employee, less than 1 year","Mountain View, CA",It was a nice feeling there. Lay back. I reall...,I got fired because I work for an app I clock ...,-0.028434
11783,2023-05-15 00:00:00They told me not to comment...,Unity,3.0,2023-05-15,They told me not to comment on glassdoor,Doesnt Matter,Former Employee,,I want you to know that my separation papers i...,See pros this is really not good.,-0.040518
10359,2024-02-20 00:00:00V4 engine car is a must11961,Uber,4.0,2024-02-20,V4 engine car is a must,Delivery Driver,"Current Employee, more than 1 year","Seattle, WA",Full control of your schedule. Some customers ...,Some trips take longer than expected. Some add...,-0.051256


In [9]:
# Sample queries used to spot-check retrieval quality and latency.
q1 = "I want to see good MONEY, CULTURE AND WORK LIFE BALANCE"
q2 = "career progression???"
q3 = "the worst companies to work at"
q4 = "tiktok engineer work-life balance"
q5 = "low salary and terrible management, with toxic culture"
queries = [q1, q2, q3, q4, q5]

# One entry per query: the highest-scoring review as a row of df_raw plus
# its 'score'.
res = []
for q in queries:
    # Time only the retrieval call. perf_counter is monotonic and
    # high-resolution, which matters for sub-second measurements like these;
    # time.time() can jump with system clock adjustments.
    start_retrieving = time.perf_counter()
    results = all_retriever.retrieve_results(q)
    end_retrieving = time.perf_counter()
    print(f"time taken for retrieval: {round(end_retrieving - start_retrieving, 5)}s")

    result_df = pd.DataFrame(results, columns=['id', 'score'])
    # Attach scores to the raw review fields and keep the best match only.
    top_match = (
        df_raw
        .merge(result_df, on='id')
        .sort_values('score', ascending=False)
        .iloc[0]
    )
    res.append(top_match)

time taken for retrieval: 0.25138s
time taken for retrieval: 0.20181s
time taken for retrieval: 0.25703s
time taken for retrieval: 0.27838s
time taken for retrieval: 0.29939s


In [10]:
# Print the top match of every query as "<field>: <value>" paragraphs
# separated by blank lines, followed by a divider line.
shown_fields = ['Company Name', 'Review Title', 'Job Title', 'Pros', 'Cons']
for top_review in res:
    paragraphs = [f"{field}: {top_review[field]}" for field in shown_fields]
    print("\n\n".join(paragraphs) + "\n=========================")

Company Name: PayPal

Review Title: Good place to work

Job Title: Product Lead

Pros: Good culture and good people

Cons: Several org changes in recent years
Company Name: ServiceNow

Review Title: Good work life balance

Job Title: Software Engineer

Pros: good work life balance, decent compensation

Cons: out dated tech stack, slow career progression
Company Name: Mphasis

Review Title: Worse Company Ever to Work For

Job Title: Customer Account Specialist

Pros: There are no words to describe any decency at this company

Cons: There are no words to describe how horrible this company is to work for. Treat their employees like trash and speak to them demeaningly.
Company Name: TikTok

Review Title: No work life balance

Job Title: Software Development Engineer In Test (SDET)

Pros: Compensation is fair and is as per the market standards.

Cons: There is no work life balance. If feels as if your entire day is to work for the company only, with regular offshore calls.
Company Name: Syn