In [None]:
#module installation
!pip install sentence_transformers

#import libraries
import csv
from datetime import datetime, timedelta
from sentence_transformers import SentenceTransformer, util



In [None]:
#data extraction
#
#mocked talent pool with the following structure:
#
#   [[UID], [First Name], [Last Name], [Location], [Available from],
#    [Role], [Years of professional experience], [Skills], [Social links],
#    [Short biography], [3x personality questions], [Tools], [Languages],
#    [Time zone], [Working hours], [Work experience (Title; Company; Years; Industry)],
#    [Projects], [Hourly rate($)]]
#
talent_pool = []

#positions of the columns that hold several values in one cell:
#skills(7), tools(11) and languages(12) are comma-separated,
#work experience(15) is semicolon-separated
TALENT_COMMA_COLUMNS = {7, 11, 12}
TALENT_SEMICOLON_COLUMNS = {15}

#newline='' is what the csv module expects, so that line breaks inside
#quoted cells are parsed correctly
with open('applicant.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)

    #skip the first row which is header (the default avoids a
    #StopIteration when the file is completely empty)
    next(csv_reader, None)

    for row in csv_reader:
      #blank lines are yielded as empty rows; they describe no talent
      if not row:
        continue

      record = []

      #multi-valued columns are recognised by their position, not by comparing
      #cell texts, so a column that merely holds the same text is never split
      for position, col in enumerate(row):
        if position in TALENT_COMMA_COLUMNS and ',' in col:
          record.append([item.strip() for item in col.split(',')])
        elif position in TALENT_SEMICOLON_COLUMNS and ';' in col:
          record.append([item.strip() for item in col.split(';')])
        else:
          #every other cell is wrapped in a one-element list
          record.append([col])
      talent_pool.append(record)
################################################################################
#mocked roles pool with the following structure:
#
#   [[Project name], [Project ID], [Role name], [Role description], [Hourly rate],
#    [Required skills], [Preferred skills], [Tools], [Available from], [Time zone],
#    [Working hours], [Minimum available hours per week], [Location], [Industry],
#    [Years of experience], [Language]]
role_pool = []

#positions of the columns that hold several values in one cell:
#required skills(5), preferred skills(6) and tools(7) are comma-separated,
#location(12) is semicolon-separated
ROLE_COMMA_COLUMNS = {5, 6, 7}
ROLE_SEMICOLON_COLUMNS = {12}

#newline='' is what the csv module expects, so that line breaks inside
#quoted cells are parsed correctly
with open('role.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)

    #skip the first row which is header (the default avoids a
    #StopIteration when the file is completely empty)
    next(csv_reader, None)

    for row in csv_reader:
      #blank lines are yielded as empty rows; they describe no role
      if not row:
        continue

      record = []

      #multi-valued columns are recognised by their position, not by comparing
      #cell texts, so a column that merely holds the same text is never split
      for position, col in enumerate(row):
        if position in ROLE_COMMA_COLUMNS and ',' in col:
          record.append([item.strip() for item in col.split(',')])
        elif position in ROLE_SEMICOLON_COLUMNS and ';' in col:
          record.append([item.strip() for item in col.split(';')])
        else:
          #every other cell is wrapped in a one-element list
          record.append([col])
      role_pool.append(record)

#preview the parsed data: one talent record (shown wrapped in a list)
#followed by the first role record
sample_talent = talent_pool[812]
print([sample_talent])
first_role = role_pool[0]
print(first_role)

[[['d561da9d-f78b-4dde-a107-acebfae279f8'], ['Nicole'], ['Rivera'], ['Northern Territory, Australia'], ['2023-09-01'], ['Risk & Compliance Consultant'], ['10'], ['Fundraising', 'Brand Strategy', 'Android', 'Backlog Prioritization', '3D Engineering', 'API Design'], ['https://www.turner.info/'], ['I am a IT professional with expertise in Backlog Prioritization, Android. My previous role was as a Front-End Engineer at Cortez, Shaw and Murphy in the IT industry where I gained 10 years of experience. Skills: API Design, Fundraising, Brand Strategy'], ['What would you like to be known/remembered for?, What’s your proudest professional achievement?, What’s your proudest professional achievement?'], ['Gatsby', 'Emscripten', 'Flinto'], ['Afrikaans', 'Assamese'], ['GMT+10'], ['09:30 - 17:30'], ['Social worker', 'Rivera, Rodriguez and Lee', '5', 'IT'], ['Group'], ['24']]]
[['Ltd'], ['66367'], ['Technical Project Manager'], ["This role at PLC involves Technical Project Manager-related responsibili

In [None]:
#hard filtering algorithm for a specific role
#it removes a talent from the talent pool if:
#   - the talent does not possess any required skills
#   - the talent is not in the same location as required by the role
#   - the talent cannot speak any required languages
#
#input parameters: talent pool of the role, the information of the role
#return: the hard filtered talent pool
def hardFiltering(talent_pool, role):
  """Keep only the talents that meet every mandatory requirement of a role.

  A talent passes when it shares at least one entry with the role for each
  mandatory attribute: required skills, location and language.

  input parameters: talent pool of the role, the information of the role
  return: new list with the talents that passed, in their original order
  """
  #(talent column, role column) pairs that must have a common entry;
  #they are checked in this order and checking stops at the first miss
  mandatory_pairs = (
      (7, 5),    #talent skills vs required skills
      (3, 12),   #talent location vs role location
      (12, 15),  #talent languages vs role language
  )

  return [
      talent
      for talent in talent_pool
      if all(set(talent[talent_col]) & set(role[role_col])
             for talent_col, role_col in mandatory_pairs)
  ]

In [None]:
#soft filtering algorithm for a specific role
#it evaluates the remaining talents from the hard filtered talent pool
#and calculates a final score for each talent based on the weight of attributes.
#
#the weight of each attribute is shown below:
#   - required skill : 1
#   - availability(available from): 1
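#   - location: 1 (enforced by the hard filtering, not scored here)
#   - minimum available hours per week: 1 (not scored yet)
#   - overlap hours: 1
#   - language: 1 (enforced by the hard filtering, not scored here)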
#   - hourly rate: 0.75
#   - years of experience: 0.75
#   - preferred skill: 0.5
#   - tool: 0.5
#   - industry: 0.5
#   - text information(i.e., "About Me", "Role Description"): 0.25
#
#input parameters: hard filtered talent pool of the role, the information of the role
#return: the UID of the best talent from the talent pool and its final score
def softFiltering(talent_pool, role, model=None):
  """Score every talent of a hard-filtered pool against a role and pick the best.

  The final score of a talent is the weighted sum of: required skills (1),
  availability (1), overlap hours (1), hourly rate (0.75), years of
  experience (0.75), preferred skills (0.5), tools (0.5), industry (0.5) and
  text similarity between biography and role description (0.25). Location
  and language are not scored because the hard filtering already enforces
  them; the minimum available hours per week are not scored either.

  input parameters: hard filtered talent pool of the role, the information
                    of the role, optionally an already loaded NLP model
                    (loaded here when omitted)
  return: tuple (UID of the best talent, final score of that talent)
  raises: ValueError when the talent pool is empty
  """
  #fail with a clear message (and before the model is loaded) instead of
  #the opaque error max() raises on an empty dictionary
  if not talent_pool:
    raise ValueError("softFiltering needs at least one talent in the talent pool")

  #NLP model used for semantic similarity comparison between texts. Passing
  #a loaded model avoids loading it again for every role
  if model is None:
    model = SentenceTransformer('paraphrase-distilroberta-base-v1')

  talent_score_dict = {}

  #the role's values are the same for every talent, so read them only once
  required_skills = set(role[5])
  preferred_skills = set(role[6])
  required_tools = set(role[7])
  role_rate = float(role[4][0])
  required_years = float(role[14][0])
  working_hours = periodToHours(role[10][0])

  for talent in talent_pool:
    score = 0
    talent_skills = set(talent[7])

    #calculate score for required skill attribute and add to the final score,
    #the more required skills applicants have, the higher score they get
    score += _matchRatio(talent_skills, required_skills) * 1.0

    #check availability. If the applicant's available date is before or
    #the same as the role's, then the applicant will get the score.
    #NOTE(review): the dates are compared as strings, which presumably relies
    #on both being written as YYYY-MM-DD - confirm against the data
    if talent[4][0] <= role[8][0]:
      score += 1.0

    #calculate score for overlap hours between the applicant and the role.
    #More overlapping hours will result in higher score. A role period
    #without a positive length cannot be overlapped and is skipped, which
    #also avoids a division by zero
    overlap_hours = overlapHours(talent[13][0], talent[14][0], role[9][0], role[10][0])
    if working_hours > 0:
      score += (overlap_hours / working_hours) * 1.0

    #calculate score for hourly rate($) attribute. If the applicant's
    #hourly rate is cheaper than or equal to the role's, then the
    #cheaper the applicant's hourly rate, the higher score they get.
    #A role rate of 0 leaves nothing to save and would divide by zero
    talent_rate = float(talent[17][0])
    if role_rate > 0 and talent_rate <= role_rate:
      score += ((role_rate - talent_rate) / role_rate) * 0.75

    #calculate score for years of experience attribute. If the applicant's
    #years of experience is higher than or equal to the role's requirement,
    #then the higher the applicant's years of experience, the higher the score.
    #An applicant with 0 years has no surplus and would divide by zero
    talent_years = float(talent[6][0])
    if talent_years > 0 and talent_years >= required_years:
      score += ((talent_years - required_years) / talent_years) * 0.75

    #calculate score for preferred skills attribute. The more preferred skills
    #the applicant has, the higher the score
    score += _matchRatio(talent_skills, preferred_skills) * 0.5

    #calculate score for tool attribute. The more required tools the applicant
    #has, the higher the score
    score += _matchRatio(set(talent[11]), required_tools) * 0.5

    #calculate score for industry attribute. If the applicant has worked
    #in the same industry as the role's before, they get the score. The
    #industry is the 4th part of the work experience; applicants whose work
    #experience has fewer parts simply do not get this score
    work_experience = talent[15]
    if len(work_experience) > 3 and work_experience[3] == role[13][0]:
      score += 0.5

    #calculate the score for text information(i.e., "About Me", "Role Description").
    #The higher the similarity score between the applicant's "Short Biography" and
    #the role's "Role Description", the higher score they will get
    score += refinedFinalSimilarity(talent[9][0], role[3][0], model) * 0.25

    #add the talent UID and the final score into the dictionary
    talent_score_dict[talent[0][0]] = score

  #get the talent with the highest score (the first one scored wins a tie)
  best_talent = max(talent_score_dict, key=talent_score_dict.get)
  return best_talent, talent_score_dict.get(best_talent)

#helper of softFiltering: share of the wanted items that are also owned,
#0.0 when nothing is wanted (instead of dividing by zero)
def _matchRatio(owned, wanted):
  if not wanted:
    return 0.0
  return len(owned & wanted) / len(wanted)

#method to calculate the overlap hours between a and b
#input parameters: timezone of a, working hours of a,
#          timezone of b, working hours of b
#return: overlap working hours between a and b
def overlapHours(a_timezone, a_workhours, b_timezone, b_workhours):
  """Calculate how many working hours per day a and b have in common.

  Both working periods are assumed to repeat every day, so a period of b that
  falls into the previous or next day once it is expressed in a's local time
  still counts where it meets a's period. A working period whose end is
  earlier than its start contributes no overlap.

  input parameters: timezone of a (e.g. 'GMT+10', 'GMT-5', 'GMT+5:30', 'GMT'),
                    working hours of a ('HH:MM - HH:MM'),
                    timezone of b, working hours of b
  return: overlap working hours between a and b (float)
  """
  minutes_per_day = 24 * 60

  #calculate the difference(in minutes) between these two time zones
  time_diff = _utcOffsetMinutes(b_timezone) - _utcOffsetMinutes(a_timezone)

  #convert working hours to minutes from midnight
  a_start, a_end = map(timeToMinutes, a_workhours.split(' - '))
  b_start, b_end = map(timeToMinutes, b_workhours.split(' - '))

  #adjust b's working hours to a's timezone, folding the start back into
  #a single day while keeping the length of the period
  b_length = b_end - b_start
  b_start = (b_start - time_diff) % minutes_per_day
  b_end = b_start + b_length

  #b's period repeats daily: compare a's period with yesterday's, today's
  #and tomorrow's occurrence of b's period and add up what they share
  overlap_time = 0
  for day_shift in (-minutes_per_day, 0, minutes_per_day):
    overlap_start = max(a_start, b_start + day_shift)
    overlap_end = min(a_end, b_end + day_shift)
    #avoid negative overlap time(i.e., min. overlap time should be 0)
    if overlap_end > overlap_start:
      overlap_time += overlap_end - overlap_start

  #convert overlap time back to hours
  return overlap_time / 60.0

#helper of overlapHours: convert the offset that follows the 3-letter prefix
#of a time zone label ('+10', '-5', '+5:30' or nothing) to signed minutes
def _utcOffsetMinutes(timezone):
  offset = timezone[3:].strip()
  #a bare 'GMT' has no offset at all
  if not offset:
    return 0
  sign = -1 if offset[0] == '-' else 1
  hours_text, _, minutes_text = offset.lstrip('+-').partition(':')
  return sign * (int(hours_text) * 60 + int(minutes_text or 0))

#method to convert a HH:MM style time string to minutes
#input parameter: time string in HH:MM style
#return: converted time in minutes from midnight
def timeToMinutes(time_str):
  """Convert a HH:MM style time string to minutes from midnight.

  input parameter: time string in HH:MM style
  return: converted time in minutes from midnight (int)
  """
  hour_text, minute_text = time_str.split(':')
  return int(hour_text) * 60 + int(minute_text)

#method to calculate the number of working hours for a
#HH:MM - HH:MM style time period string
#input parameter: time period string
#return: number of hours in the period
def periodToHours(time_period):
  """Calculate the number of working hours of a HH:MM - HH:MM style period.

  A period whose end is earlier than its start is read as crossing midnight
  (e.g. '22:00 - 06:00' lasts 8 hours) instead of yielding a negative length.

  input parameter: time period string
  return: number of hours in the period (float)
  """
  #split the string into start and end time
  start_time, end_time = time_period.split(' - ')

  #convert start and end time to minutes
  start_minutes = timeToMinutes(start_time)
  end_minutes = timeToMinutes(end_time)

  #calculate the difference; an end before the start belongs to the next day
  diff_minutes = end_minutes - start_minutes
  if diff_minutes < 0:
    diff_minutes += 24 * 60

  #convert the difference back to hours
  return diff_minutes / 60.0

#method to compute the cosine similarity between two texts
#input parameters: string of text1, string of text2, NLP model
#return: the cosine similarity between text1 and text2
def computeSimilarity(text1, text2, model):
  """Compute the cosine similarity between the embeddings of two texts.

  input parameters: string of text1, string of text2, NLP model
  return: the cosine similarity between text1 and text2
  """
  #embed text1 first and text2 second, each one as a tensor
  first_embedding, second_embedding = (
      model.encode(text, convert_to_tensor=True) for text in (text1, text2)
  )

  #compare the two embeddings and unwrap the single resulting value
  similarity = util.pytorch_cos_sim(first_embedding, second_embedding)
  return similarity.item()

#method to compute the semantic similarity between two texts. This
#method breaks the two texts down into sentences and compares their
#similarities, then aggregates these scores to produce a final similarity
#score between the two texts
#
#input parameters: string of text1, string of text2, NLP model
#return: final similarity score between text1 and text2
def computeFinalSimilarity(text1, text2, model):
  """Compute how well the sentences of text1 are matched by text2.

  Each non-empty sentence of text1 (texts are split on '.') is compared with
  every non-empty sentence of text2 and keeps its best similarity; the result
  is the average of these best similarities over the sentences of text1.

  input parameters: string of text1, string of text2, NLP model
  return: final similarity score between text1 and text2, or 0.0 when one
          of the texts contains no sentence at all
  """
  #decompose/separate each text into sentences and only consider the
  #non-empty ones after the split
  text1_sentences = [sent.strip() for sent in text1.split('.') if sent.strip()]
  text2_sentences = [sent.strip() for sent in text2.split('.') if sent.strip()]

  #a text without any sentence has nothing to compare: report no similarity
  #instead of failing on max() of nothing or on a division by zero
  if not text1_sentences or not text2_sentences:
    return 0.0

  final_similarity = 0

  #compare each text1 sentence to all text2 sentences and sum the maximum similarity
  for text1_sent in text1_sentences:
    final_similarity += max(
        computeSimilarity(text1_sent, text2_sent, model)
        for text2_sent in text2_sentences
    )

  #return the final similarity score. (normalizing by the number of sentences in text1)
  return final_similarity / len(text1_sentences)

#refine the final score to make it consistent between (text1, text2) and (text2, text1)
#input parameters: string of text1, string of text2, NLP model
#return: refined final similarity score between text1 and text2
def refinedFinalSimilarity(text1, text2, model):
  """Average the final similarity of both comparison directions.

  The score of (text1, text2) and the score of (text2, text1) are computed
  in that order and averaged, so the result does not depend on which text
  is given first.

  input parameters: string of text1, string of text2, NLP model
  return: refined final similarity score between text1 and text2
  """
  directed_scores = [
      computeFinalSimilarity(source, target, model)
      for source, target in ((text1, text2), (text2, text1))
  ]
  return (directed_scores[0] + directed_scores[1]) / 2

In [None]:
#test
#the full talent_pool is left untouched (the filtered pool gets its own name),
#so this cell can be re-run, or run for another role, without starting from
#an already filtered pool
test_role = role_pool[0]
hf_talent_pool = hardFiltering(talent_pool, test_role)
best_talent, score = softFiltering(hf_talent_pool, test_role)
print("The most suitable talent(UID) for this role is:", best_talent, " . The score is:", score)

Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

The most suitable talent(UID) for this role is: d561da9d-f78b-4dde-a107-acebfae279f8  . The score is: 4.685992125169912
