In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import random
import torch

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

### Knowledge Base: Data Pre-processing

In [2]:
# Load the raw HR FAQ knowledge base from CSV into a DataFrame
df = pd.read_csv('HR FAQ Knowledge Base Sample.csv')

In [3]:
# Preview the first rows of the loaded knowledge base
df.head()

Unnamed: 0,question,answer
0,How many days of medical leave do I have?,All staff are entitled to 14 days of medical l...
1,How many days of medical leave can I apply?,All staff are entitled to 14 days of medical l...
2,Is there a limit to the number of medical leav...,All staff are entitled to 14 days of medical l...
3,Can I claim medical expenses from a TCM doctor?,Please note that Traditional Chinese Medicine ...
4,I visited TCM doctor recently. Can medical fee...,Please note that Traditional Chinese Medicine ...


In [4]:
# Normalise the questions: lower-case, then replace every character that is not
# an ASCII letter, a digit or whitespace with a single space.
# The vectorised `.str` accessor is used instead of a row-wise `apply`, so a
# missing question stays NaN instead of raising a TypeError inside `re.sub`.
# The pattern is kept identical to the one applied to incoming requests so that
# knowledge-base questions and queries are normalised the same way.
df['question2'] = df['question'].str.lower().str.replace(r'[^A-Za-z0-9\s]', ' ', regex=True)

In [5]:
# Preview again to check the new normalised `question2` column
df.head()

Unnamed: 0,question,answer,question2
0,How many days of medical leave do I have?,All staff are entitled to 14 days of medical l...,how many days of medical leave do i have
1,How many days of medical leave can I apply?,All staff are entitled to 14 days of medical l...,how many days of medical leave can i apply
2,Is there a limit to the number of medical leav...,All staff are entitled to 14 days of medical l...,is there a limit to the number of medical leav...
3,Can I claim medical expenses from a TCM doctor?,Please note that Traditional Chinese Medicine ...,can i claim medical expenses from a tcm doctor
4,I visited TCM doctor recently. Can medical fee...,Please note that Traditional Chinese Medicine ...,i visited tcm doctor recently can medical fee...


In [6]:
# Export the processed knowledge base to 'cleaned_df.csv'; index=False keeps the
# row index out of the file so it is not read back as an extra column
df.to_csv('cleaned_df.csv', index=False)

### Model Development: DistilRoBERTa

In [7]:
# Load the pretrained 'stsb-distilroberta-base-v2' sentence-embedding model
model = SentenceTransformer('stsb-distilroberta-base-v2')

You try to use a model that was created with version 1.1.0, however, your version is 0.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [8]:
# Encode the normalised questions to get their embeddings;
# convert_to_tensor=True asks the model to return them as a tensor
qn_embeddings = model.encode(df['question2'].to_list(), convert_to_tensor=True)

In [9]:
# Persist the question (sentence) embeddings to a pickle file for later reuse.
# The `with` block guarantees the file is flushed and closed as soon as the
# dump finishes, instead of relying on garbage collection of a bare `open()`.
with open('distilroberta-embedding.pkl', 'wb') as embedding_file:
    pickle.dump(qn_embeddings, embedding_file)

In [12]:
# Name of the pretrained sentence-embedding model used to encode requests.
# It is the same model name used to encode the knowledge-base questions.
_HR_MODEL_NAME = 'stsb-distilroberta-base-v2'

# Loaded models keyed by name, so repeated hr_response() calls reuse one
# instance instead of re-loading the model on every request.
_hr_model_cache = {}


def _get_hr_model(name=_HR_MODEL_NAME):
    """Return the SentenceTransformer called `name`, loading it on first use only."""
    if name not in _hr_model_cache:
        _hr_model_cache[name] = SentenceTransformer(name)
    return _hr_model_cache[name]


def hr_response(req):
    """Answer an HR question with the answer of the most similar FAQ entry.

    The request is normalised, embedded, and compared by cosine similarity
    against the question embeddings stored in 'distilroberta-embedding.pkl'.
    The answer is then looked up by row position in 'cleaned_df.csv'.

    Parameters
    ----------
    req : str
        The user's question in free text.

    Returns
    -------
    The value of the `answer` column for the knowledge-base row whose
    question embedding has the highest cosine similarity to the request
    (the first such row if several tie).
    """
    # Pre-processing: lower-case, then replace anything that is not a letter,
    # digit or whitespace with a space
    req = re.sub(r'[^A-Za-z0-9\s]', ' ', req.lower())

    # Encode the text to get embeddings, shaped as a single-row matrix
    req_embeddings = _get_hr_model().encode(req).reshape(1, -1)

    # Load the stored question embeddings, closing the file deterministically.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open('distilroberta-embedding.pkl', 'rb') as embedding_file:
        qn_embeddings = pickle.load(embedding_file)

    # Tensor-like embeddings are converted to a CPU numpy array first, since a
    # tensor living on a GPU cannot be turned into a numpy array implicitly.
    if hasattr(qn_embeddings, 'detach'):
        qn_embeddings = qn_embeddings.detach().cpu().numpy()

    # Compute similarity: one score per stored question, shape (n_questions, 1)
    cosine_sim = cosine_similarity(qn_embeddings, req_embeddings)

    # Position of the most similar question; argmax returns the first maximum,
    # which matches a stable descending sort without sorting every score
    qn_indice = int(np.argmax(cosine_sim))

    # Return response of the top most similar question
    knowledge_base = pd.read_csv('cleaned_df.csv')
    return knowledge_base['answer'].iloc[qn_indice]

In [13]:
# Example query: this wording matches one of the knowledge-base questions verbatim
hr_response('Can I claim medical expenses from a TCM doctor?')

You try to use a model that was created with version 1.1.0, however, your version is 0.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





'Please note that Traditional Chinese Medicine (TCM) treatments are NOT claimable. '