# Rule-based scoring using the NER model trained with spaCy

In [1]:
import numpy as np
import pandas as pd
import os
import spacy
from tika import parser
import re

In [5]:
# Load the trained NER pipeline saved under the parent directory's Training_NER folder.
ner_model_path = os.path.join(os.path.dirname(os.getcwd()), "Training_NER/saved-NER.model")
ner_model = spacy.load(ner_model_path)

## Loading a test resume and the Borneo job description (JD) to try out the scoring

In [6]:
# Extract both documents with tika; the resume sits under the parent
# directory's Training_NER test dataset.
resume_path = os.path.join(
    os.path.dirname(os.getcwd()),
    "Training_NER/dataset/test/chenna kesava.docx",
)
parser_jd = parser.from_file("borneo-JD.txt")
parser_resume = parser.from_file(resume_path)

# Only the "content" entry of each parse result is used from here on.
jd, resume = parser_jd["content"], parser_resume["content"]

# Resume Scoring routine

In [7]:
def preprocess(doc):
    """Flatten extracted document text into one clean line.

    Newlines and tabs are turned into spaces, the "•" and "–" characters
    are dropped, and leading/trailing whitespace is trimmed.
    """
    substitutions = (("\n", " "), ("•", ""), ("–", ""), ("\t", " "))
    for old, new in substitutions:
        doc = doc.replace(old, new)
    return doc.strip()

# Clean both documents the same way before any scoring is done.
jd, resume = preprocess(jd), preprocess(resume)

In [8]:
# Education score: 3 = doctorate / research role, 2 = master's,
# 1 = bachelor's, 0 = no recognised degree keyword in the resume.
deg_score = 0

# "Research Associate" contains a space, so it can never equal one of the
# single space-delimited tokens checked below; look for it in the full text.
if "Research Associate" in resume:
    deg_score = 3

for word in resume.split(" "):
    token = word.strip()
    if token in ("PhD", "PHD"):
        deg_score = 3
    elif token in ("MS", "MT", "M.Tech", "Masters"):
        # Never downgrade a higher tier that was already found.
        deg_score = max(deg_score, 2)
    elif token in ("BS", "BE", "B.S", "B.E", "B.Tech", "Bachelors"):
        deg_score = max(deg_score, 1)

# print(deg_score)        

In [9]:
# Designation score: the highest tier among the title keywords that appear
# as space-delimited tokens in the resume (0 when none appear).
designation_tiers = {
    "Sr.": 3, "Senior": 3,
    "Associate": 2, "Scientist": 2, "Engineer": 2,
    "Analyst": 1, "Junior": 1,
}

des_score = 0
for token in resume.split(" "):
    tier = designation_tiers.get(token.strip(), 0)
    if tier > des_score:
        des_score = tier

# print(des_score)

In [10]:
# Experience score taken from the largest "<N> year" figure in the resume:
# 3 for 4 or more years, 2 for 2-3 years, 1 for exactly 1 year, 0 otherwise.
exp_score = 0
a = re.findall(r'[0-9]+\+*[ ]?[Yy]ear', resume)

if a:
    # Compare the figures as integers: ordering the matched strings would
    # rank "9 year" above "10 year". Reading only the leading digits also
    # copes with any number of trailing "+" signs.
    exp = max(int(re.match(r'[0-9]+', found).group()) for found in a)

    if exp >= 4:
        exp_score = 3
    elif exp >= 2:
        exp_score = 2
    elif exp == 1:
        exp_score = 1

## Cosine Document Similarity for comparing resume and skills with the JD

In [11]:
import math
import string
import sys
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally before it is read.
nltk.download('stopwords')


# Module-level table for str.translate: every ASCII punctuation character
# becomes a space and every ASCII upper-case letter becomes its lower-case
# counterpart, so text can be normalised before it is split into words.
translation_table = str.maketrans({
    **dict.fromkeys(string.punctuation, " "),
    **dict(zip(string.ascii_uppercase, string.ascii_lowercase)),
})
	
# returns a list of the words
# in the file
def get_words_from_line_list(text):
    """Split text into normalised words with English stop words removed.

    The text is first passed through the module-level translation_table,
    then split on whitespace.

    Returns:
        list of the remaining words, in their original order.
    """
    text = text.translate(translation_table)

    # Build the stop-word set once per call rather than once per word.
    stop_words = set(stopwords.words('english'))

    return [word for word in text.split() if word not in stop_words]


# counts frequency of each word
# returns a dictionary which maps
# the words to their frequency.
def count_frequency(word_list):
    """Return a dict mapping each word in word_list to its occurrence count."""
    frequencies = {}
    for word in word_list:
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies

# returns dictionary of (word, frequency)
# pairs from the previous dictionary.
def word_frequencies_for_text(text):
    """Return the word -> frequency mapping for a piece of text.

    The text is tokenised with get_words_from_line_list and the resulting
    words are tallied with count_frequency.
    """
    return count_frequency(get_words_from_line_list(text))


# returns the dot product of two documents
def dotProduct(D1, D2):
    """Return the dot product of two word -> frequency mappings as a float.

    Only keys present in both mappings contribute to the result.
    """
    total = 0.0
    for word, count in D1.items():
        if word not in D2:
            continue
        total += count * D2[word]
    return total

# returns the angle in radians
# between document vectors
def vector_angle(D1, D2):
    """Return the angle in radians between two word-frequency vectors.

    Args:
        D1, D2: word -> frequency mappings.

    Returns:
        acos of the cosine similarity of the two vectors. When either
        vector has zero length (for example an empty mapping) the vectors
        are treated as orthogonal and pi/2 is returned.
    """
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1) * dotProduct(D2, D2))

    # A zero-length vector shares nothing with the other one; report a right
    # angle instead of dividing by zero.
    if denominator == 0:
        return math.pi / 2

    # Rounding can push the ratio marginally outside [-1, 1], which
    # math.acos rejects with a domain error.
    cosine = max(-1.0, min(1.0, numerator / denominator))
    return math.acos(cosine)


def documentSimilarity(text_1, text_2):
    """Return the angle, in degrees, between the word-frequency vectors of two texts."""
    frequencies_1 = word_frequencies_for_text(text_1)
    frequencies_2 = word_frequencies_for_text(text_2)
    return math.degrees(vector_angle(frequencies_1, frequencies_2))
	
# Driver code
# documentSimilarity(jd, resume)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhinaykumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Run the NER model over the resume and keep the text of every token whose
# entity type is "Skills".
doc = ner_model(resume)
skill_list = [token.text for token in doc if token.ent_type_ == "Skills"]
skill_text = " ".join(skill_list)

# With no skills found the score stays at 0; otherwise the angle between the
# JD and the skill text is turned into a match value (90 minus the angle in
# degrees) and capped at 10.
skill_score = 0
if skill_list:
    skill_match = 90.0 - documentSimilarity(jd, skill_text)
    skill_score = skill_match if skill_match < 10 else 10


## Overall Resume Match

In [13]:
# Whole-resume match: 90 minus the JD/resume angle in degrees, capped at 20.
resume_match = 90 - documentSimilarity(jd, resume)
resume_score = resume_match if resume_match < 20 else 20

# Resume Score (on a scale of 0 to 10)

In [14]:
# Weighted blend of the component scores (20% each for degree, designation
# and experience, 30% skills, 10% overall resume match), rescaled by 10/7
# and rounded to one decimal place.
weighted_sum = (
    deg_score * 0.20
    + des_score * 0.20
    + exp_score * 0.20
    + skill_score * 0.30
    + resume_score * 0.10
)
score = round(10 / 7 * weighted_sum, 1)
print(score)

2.8
