# SPACY

https://spacy.io/usage

Install:

`conda install -c conda-forge spacy`

`python -m spacy download en_core_web_lg`


In [None]:
import spacy

# Large English pipeline. NOTE(review): the similarity calls below presumably rely
# on the word vectors shipped with this package — confirm before swapping it for
# a smaller model.
nlp = spacy.load('en_core_web_lg')

In [14]:
# Look both words up in the pipeline's vocabulary; w1 and w2 end up bound to the
# objects returned by nlp.vocab[...], not to the plain strings.
w1, w2 = (nlp.vocab[word] for word in ('rainy', 'sunny'))

In [15]:
# Similarity estimate between the two vocabulary entries looked up above.
w1.similarity(w2)

0.7039623260498047

In [16]:
# s1 and s2 are both about religious belief; s3 is about an unrelated topic.
# NOTE(review): "belive" in s1 is misspelled. Correcting it will change the
# similarity scores recorded in the cells below.
s1 = nlp("I belive in the god and the bible")
s2 = nlp("I trust in a higher power of Christianity")
s3 = nlp("This weekend John will drink a beer")

In [17]:
# Related pair: both sentences are about religious belief.
s1.similarity(s2)

0.8592989444732666

In [18]:
# Unrelated pair: compare this score with the s1/s2 score above.
s1.similarity(s3)

0.70660001039505

## Using verb or adjective

In [19]:
# Three sentences that differ in verb, object and adjective. This rebinds the
# s1, s2 and s3 names used in the cells above.
s1, s2, s3 = map(nlp, (
    'I play football in this awful arena.',
    'I play the piano in this red room.',
    'I repair the piano in this ugly room.',
))

In [24]:
def lemmas_by_pos(doc, pos):
    """Return the lemmas of the tokens in ``doc`` whose ``pos_`` equals ``pos``.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``pos_``.
        pos: Part-of-speech tag to keep, e.g. ``"VERB"``.

    Returns:
        The matching lemmas joined by single spaces, in document order;
        an empty string when no token matches.
    """
    return " ".join(token.lemma_ for token in doc if token.pos_ == pos)


# One helper call per (sentence, part of speech) instead of nine copy-pasted
# comprehensions; the resulting variables are the same as before.
s1_verbs, s1_nouns, s1_adjs = (lemmas_by_pos(s1, pos) for pos in ("VERB", "NOUN", "ADJ"))
s2_verbs, s2_nouns, s2_adjs = (lemmas_by_pos(s2, pos) for pos in ("VERB", "NOUN", "ADJ"))
s3_verbs, s3_nouns, s3_adjs = (lemmas_by_pos(s3, pos) for pos in ("VERB", "NOUN", "ADJ"))

In [26]:
# Compare the sentences by their verb lemmas only. s1's verbs are parsed once and
# reused for both comparisons. The separator now has a space before "and"; it was
# missing, which glued the first sentence's final period to the word.
s1_verbs_doc = nlp(s1_verbs)
print(f'{s1} and {s2} VERBS: {s1_verbs_doc.similarity(nlp(s2_verbs))}')
print(f'{s1} and {s3} VERBS: {s1_verbs_doc.similarity(nlp(s3_verbs))}')

I play football in this awful arena.and I play the piano in this red room. VERBS: 1.0
I play football in this awful arena.and I repair the piano in this ugly room. VERBS: 0.16859392821788788


## Match whole texts (for example, skills)

In [52]:
import os 
import json
from collections import defaultdict
import PyPDF2
#import pandas as pd 
#import numpy as np
import matplotlib.pyplot as plt
import spacy 
from spacy.tokens import DocBin
from tqdm import tqdm
import re
from tika import parser
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which
# presumably includes spaCy's warnings about similarity computed on empty
# vectors — consider narrowing the filter. TODO confirm.
warnings.filterwarnings("ignore")

### Resume

In [59]:
def pdf_to_text(pdffile):
    """Read a PDF file and return the text of all its pages as one string.

    Args:
        pdffile: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The ``extract_text()`` output of every page, concatenated in page
        order with no separator added between pages.
    """
    reader = PyPDF2.PdfReader(pdffile)
    return "".join(page.extract_text() for page in reader.pages)

def preprocess(text):
    """Remove empty lines from ``text`` and return the result.

    A line is dropped only when nothing is left after stripping carriage-return
    and newline characters from it. Lines that contain only spaces or tabs are
    kept, and every kept line retains its original line ending.
    """
    kept_lines = []
    for line in text.splitlines(True):
        if line.strip("\r\n") != "":
            kept_lines.append(line)
    return "".join(kept_lines)

sub_directory_for_resume = 'CV'
# List the directory through the variable instead of repeating the literal.
# os.listdir() returns names in arbitrary order, so sort them: positional picks
# such as resume_file_paths[0] then select the same file on every run.
files_list_resume = sorted(os.listdir(sub_directory_for_resume))


### Job Description

In [60]:
def read_jd_file(filename, encoding=None):
    """Read a whole text file and return its content as a string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None`` keeps
            the previous behaviour (the platform's default encoding); pass
            e.g. ``"utf-8"`` to make the result independent of the machine.

    Returns:
        The full content of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, "r", encoding=encoding) as file:
        return file.read()

sub_directory_for_jd = 'JD'
# List the directory through the variable instead of repeating the literal.
# os.listdir() returns names in arbitrary order, so sort them: positional picks
# such as jd_file_paths[1] then select the same file on every run.
files_list_jd = sorted(os.listdir(sub_directory_for_jd))

In [63]:
# Resume: build the paths, extract the text of the first PDF, then clean it.
resume_file_paths = [os.path.join(sub_directory_for_resume, file) for file in files_list_resume]
resume_raw = pdf_to_text(resume_file_paths[0])
# Clean the text that was just extracted. This used to call preprocess(text),
# but no variable named `text` is defined in the notebook.
resume = preprocess(resume_raw)

# Job description: same steps for the second file in the JD directory.
jd_file_paths = [os.path.join(sub_directory_for_jd, file) for file in files_list_jd]
jd_raw = read_jd_file(jd_file_paths[1])
jd_text = preprocess(jd_raw)


### Load Model for NER

This model has to be fine-tuned again (GPU training) on a resume / job-description dataset.


In [62]:
# NOTE(review): this rebinds `nlp`, replacing the en_core_web_lg pipeline loaded
# at the top of the notebook. get_score() further down calls .similarity()
# through this same global, and every score it prints is 0.0 — presumably
# because this NER model carries no word vectors. TODO confirm; a separate name
# for the NER pipeline would avoid the clash.
nlp = spacy.load('./model')

### Extract NER labels from the models

In [64]:
def extract_ner(text, nlp):
    """Run ``nlp`` over ``text`` and return its entities as ``(text, label)`` pairs.

    Args:
        text: Content to label (a resume or a job description).
        nlp: Pipeline object; it is called through ``nlp.pipe`` with the
            "tagger" and "parser" components disabled.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order.
        In each entity text, every run of characters other than ASCII letters
        and digits is replaced by one space, and the result is stripped.
    """
    docs = nlp.pipe([text], disable=["tagger", "parser"])
    return [
        (re.sub('[^A-Za-z0-9]+', ' ', entity.text).strip(), entity.label_)
        for doc in docs
        for entity in doc.ents
    ]

# Label the resume and the job description with the same pipeline.
ner_labels_resume = extract_ner(text=resume, nlp=nlp)
ner_labels_jd = extract_ner(text=jd_text, nlp=nlp)

In [66]:
def to_json(ner_labels):
    """Group ``(item, label)`` pairs by label and serialise the groups as JSON.

    Args:
        ner_labels: Iterable of ``(item, label)`` pairs.

    Returns:
        A JSON string, indented by two spaces, that maps each label to the
        list of its items. Labels appear in order of first occurrence and
        items keep their input order.
    """
    items_by_label = {}
    for entity_text, entity_label in ner_labels:
        items_by_label.setdefault(entity_label, []).append(entity_text)
    return json.dumps(items_by_label, indent=2)

# Entities found in the resume, grouped by label. print() is used so the JSON
# string is shown with real line breaks.
resume_json = to_json(ner_labels_resume)
print(resume_json)

{
  "EXPERIENCE": [
    "6 years of experience as Oracle Pl Sql D eveloper and have experience in creating complex database objects like Stored Procedures Functions Packages and Triggers using SQL and"
  ],
  "TOOL": [
    "PL SQL",
    "Associative arrays",
    "Nested tables",
    "Varrays",
    "Cursors",
    "SQL",
    "PL SQL",
    "DBMS JOB",
    "UTL FILE",
    "SQL",
    "PL SQL",
    "HINTS",
    "Oracle",
    "Export",
    "EXPDP",
    "OLTP",
    "OLAP",
    "SQL",
    "PL SQL",
    "T SQL",
    "UNIX Shell Script",
    "HTML",
    "PHP Java",
    "Oracle 11g 12C",
    "MS SQL SERVER",
    "TOAD",
    "SQL Developer",
    "ADDM",
    "AWR",
    "Github",
    "Windows XP 10",
    "Linux",
    "Oracle PL SQL Developer Project Smartdata",
    "PL SQL",
    "sql",
    "SQL Loader",
    "UNIX scripts",
    "Oracle database tables",
    "Oracle Data Pump",
    "Oracle database",
    "PL SQL",
    "BitBucket",
    "JIRA",
    "Confluence",
    "SourceTree",
    "Bamboo",
    "PL SQ

In [67]:
# Entities found in the job description, grouped by label.
jd_json = to_json(ner_labels_jd)
print(jd_json)

{
  "TOOL": [
    "Oracle DBA",
    "Oracle EBS",
    "AP FA GL and Coupa",
    "Oracle database",
    "EBS R12 2 x objects",
    "Oracle team",
    "Oracle SRS",
    "SQL",
    "Pl sql",
    "Putty",
    "WinSCP",
    "Joms"
  ],
  "EXPERIENCE": [
    "6 months",
    "Previous banking knowledge"
  ]
}


In [68]:
def extract_skills(ner_results, label):
    """Collect the text of every NER result that carries ``label``.

    Args:
        ner_results: Iterable of entries whose element 0 is the entity text
            and whose element 1 is its label.
        label: Label to keep, e.g. SKILL, EXPERIENCE or TOOL.

    Returns:
        The matching entity texts in their original order; an empty list when
        nothing matches.
    """
    selected = []
    for entry in ner_results:
        if entry[1] == label:
            selected.append(entry[0])
    return selected

# Extract from jd
skills_jd = extract_skills(ner_labels_jd, 'SKILL')
experience_jd = extract_skills(ner_labels_jd, 'EXPERIENCE')
tool_jd = extract_skills(ner_labels_jd, 'TOOL')

# Extract from resume
skills_cv = extract_skills(ner_labels_resume, 'SKILL')
experience_cv = extract_skills(ner_labels_resume, 'EXPERIENCE')
tool_cv = extract_skills(ner_labels_resume, 'TOOL')


In [71]:
def get_score(cv_subset, jd_subset, model=None):
    """Return the similarity between a resume subset and a job-description subset.

    Args:
        cv_subset: Either a sequence of strings (joined with single spaces) or
            one string that is used as is.
        jd_subset: Same as ``cv_subset``, for the job description.
        model: Callable that turns a text into an object with a
            ``similarity(other)`` method. Defaults to the notebook-level
            ``nlp`` pipeline, which is what this function always used before.

    Returns:
        The value of ``similarity`` between the two texts, or ``0.0`` when
        either text is empty or contains only whitespace.

    NOTE(review): the scores are only meaningful if the pipeline can produce
    vectors for the texts; presumably that needs a model with word vectors
    — confirm which model ``nlp`` is bound to when this is called.
    """
    pipeline = model if model is not None else nlp

    def _as_text(subset):
        # ' '.join() on a bare string would insert a space between every
        # character, so a string is passed through unchanged.
        return subset if isinstance(subset, str) else ' '.join(subset)

    resume_text = _as_text(cv_subset)
    jd_text = _as_text(jd_subset)

    # Nothing to compare: answer directly instead of parsing empty text.
    if not resume_text.strip() or not jd_text.strip():
        return 0.0

    doc_resume = pipeline(resume_text)
    doc_jd = pipeline(jd_text)
    return doc_resume.similarity(doc_jd)

# One score per entity label, each comparing the resume subset with the JD subset.
skill_score = get_score(skills_cv, skills_jd)
tool_score = get_score(tool_cv, tool_jd)
exp_score = get_score(experience_cv, experience_jd)

# The line break belongs after the heading. It used to sit before the colon,
# which printed a stray ": " at the start of the second line.
print(f"Similarity score:\nSkill: {skill_score} \t Tool: {tool_score} \t Experience: {exp_score}")

Similarity score 
: Skill: 0.0 	 Tool: 0.0 	 Experience: 0.0


In [74]:
test_skills_1 = """Business Requirements Analysis, Systems Design, Prototype Development, Impact Analysis, Design Specifications Creation"""
test_skills = """Possess the ability to assess business requirements, perform impact analysis against the existing system and provide analysis on integrating business change.
Demonstrated ability to create Design Specifications based on System Use Cases, User Stories, Business Use Cases, and/or Requirements Documents.
Translate business requirements into systems design and technical specifications.
Work with developers to model and produce functional prototype and an operational system including all forms, manuals programs, data files and procedures.
Provide expert consultation in production system analysis, performance, scalability security and maintenance.
Define input/output sources including a detailed plan for technical design phase.
Participates in sprint and cycle planning with the agile team.
Validates proposed solution for alignment to business needs, requirements, and impacts to operations, processes, technology, vendors, partners and clients."""

# get_score() takes sequences of strings and joins them with ' '.join().
# Wrapping each text in a one-element list keeps it from being broken up into
# single characters.
print(get_score([test_skills_1], [test_skills]))


0.0
