In [8]:
import re
import string as st
import warnings
warnings.filterwarnings('ignore')

import spacy
import nltk
from spacy.matcher import PhraseMatcher

from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB

nlp = spacy.load("en_core_web_sm")

from nltk.stem.snowball import SnowballStemmer
from nltk import PorterStemmer, WordNetLemmatizer

from nltk.corpus import stopwords

In [9]:
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


## Text cleaning and processing steps-

- Remove punctuations
- Convert text to tokens
- Remove tokens of length less than or equal to 3
- Remove stopwords using NLTK corpus stopwords list to match
- Apply stemming
- Apply lemmatization
- Convert words to feature vectors

In [10]:
# extract skills from job_description
jd = """ Must have at least a bachelor's degree in Computer Science or similar. 
Min. 2 years of relevant work experience. Proficient in Basic Machine Learning concepts: algorithms, evaluation procedures, etc, and 
dealing with Common failure modes. Experienced in at least one area of application (E.g. CV, NLP, etc). 
Has a sound knowledge of mathematical concepts like Linear Algebra, Probability and Statistics, Calculus. 
Proficient in framework & libraries such as Numpy, Pandas, Matplotlib, Scikit-learn and a good grasp of at least one of Tensorflow or Pytorch.
Familiar with Flask, FastAPI or Django, and some domain-specific tools (e.g: opencv, spacy, etc). 
Good Grasp on programming language and concepts such as Python + OOP + SOLID, Data Structures and Algorithms, RESTful APIs, 
and familiar with Architecture Design. Good Grasp of software tools and platforms such as git, conda, pip, jupyter, Docker, 
and at least one cloud platform like AWS/GCP. Good grasp of a database such as SQL/NoSQL. Has a good grasp of 
agile processes like Sprint and Kanban. Good Team Management, Communication, and Problem-Solving Skills
"""

In [16]:
def remove_punct(text):
     # replace multiple white spaces with single white space
    return ("".join([ch for ch in text if ch not in st.punctuation]))
    return text
print(remove_punct(jd))

 Must have at least a bachelors degree in Computer Science or similar 
Min 2 years of relevant work experience Proficient in Basic Machine Learning concepts algorithms evaluation procedures etc and 
dealing with Common failure modes Experienced in at least one area of application Eg CV NLP etc 
Has a sound knowledge of mathematical concepts like Linear Algebra Probability and Statistics Calculus 
Proficient in framework  libraries such as Numpy Pandas Matplotlib Scikitlearn and a good grasp of at least one of Tensorflow or Pytorch
Familiar with Flask FastAPI or Django and some domainspecific tools eg opencv spacy etc 
Good Grasp on programming language and concepts such as Python  OOP  SOLID Data Structures and Algorithms RESTful APIs 
and familiar with Architecture Design Good Grasp of software tools and platforms such as git conda pip jupyter Docker 
and at least one cloud platform like AWSGCP Good grasp of a database such as SQLNoSQL Has a good grasp of 
agile processes like Sprint 

In [22]:
def remove_punct(text):
     # replace multiple white spaces with single white space
    text = re.sub("\n|\\\\n|●|•|○|,|\/|-|–|\(|\)||\\\\", " ", text)
    text = re.sub(" +", " ", text)
    text = re.sub(r'[^\w\s]', '', text)
#     return ("".join([ch for ch in text if ch not in st.punctuation]))
    return text
print(remove_punct(jd))

 Must have at least a bachelors degree in Computer Science or similar Min 2 years of relevant work experience Proficient in Basic Machine Learning concepts algorithms evaluation procedures etc and dealing with Common failure modes Experienced in at least one area of application Eg CV NLP etc  Has a sound knowledge of mathematical concepts like Linear Algebra Probability and Statistics Calculus Proficient in framework  libraries such as Numpy Pandas Matplotlib Scikit learn and a good grasp of at least one of Tensorflow or Pytorch Familiar with Flask FastAPI or Django and some domain specific tools eg opencv spacy etc  Good Grasp on programming language and concepts such as Python  OOP  SOLID Data Structures and Algorithms RESTful APIs and familiar with Architecture Design Good Grasp of software tools and platforms such as git conda pip jupyter Docker and at least one cloud platform like AWS GCP Good grasp of a database such as SQL NoSQL Has a good grasp of agile processes like Sprint an

In [4]:
# Remove all punctuations from the text

def remove_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation]))

def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

# Remove tokens of length less than 3
def remove_small_words(text):
    return [x for x in text if len(x) > 3 ]

def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]

# Apply stemming to get root words 
def stemming(text):
    sbs = SnowballStemmer(language='english')
    return [sbs.stem(word) for word in text]

# Apply lemmatization on tokens
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

# Create sentences to get clean text as input for vectors
def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [5]:
def preprocessing():
    punct_cleaner = remove_punct(jd)
    token_word = tokenize(punct_cleaner)
    remove_word = remove_small_words(token_word)
    stopwords_cleaner = remove_stopwords(remove_word)
#     stemmed_word = stemming(stopwords_cleaner)
    lemma_word = lemmatize(stopwords_cleaner)
    clean_sent = return_sentences(lemma_word)
    return clean_sent

In [6]:
clean_jd = preprocessing()

In [7]:
clean_jd

'must least bachelor degree computer science similar year relevant work experience proficient basic machine learning concept algorithm evaluation procedure dealing common failure mode experienced least area application sound knowledge mathematical concept like linear algebra probability statistic calculus proficient framework library numpy panda matplotlib scikitlearn good grasp least tensorflow pytorch familiar flask fastapi django domainspecific tool opencv spacy good grasp programming language concept python solid data structure algorithm restful apis familiar architecture design good grasp software tool platform conda jupyter docker least cloud platform like awsgcp good grasp database sqlnosql good grasp agile process like sprint kanban good team management communication problemsolving skill'

In [233]:
annotations = skill_extractor.annotate(clean_jd)

In [234]:
# inspect annotations
skill_extractor.describe(annotations)

In [230]:
annotations

{'text': 'must have at least a bachelor s degree in computer science or similar min 2 years of relevant work experience proficient in basic machine learning concepts algorithms evaluation procedures etc and dealing with common failure modes experienced in at least one area of application e g cv nlp etc has a sound knowledge of mathematical concepts like linear algebra probability and statistics calculus proficient in framework & libraries such as numpy pandas matplotlib scikit learn and a good grasp of at least one of tensorflow or pytorch familiar with flask fastapi or django and some domain specific tools e g opencv spacy etc good grasp on programming language and concepts such as python + oop + solid data structures and algorithms restful apis and familiar with architecture design good grasp of software tools and platforms such as git conda pip jupyter docker and at least one cloud platform like aws gcp good grasp of a database such as sql nosql has a good grasp of agile processes l

In [231]:
def extract_hard_soft_skills(job_description):
    ''' 
       Extract hard skills, soft skills and certification from given job description 
       using pre-trained model SkillNER
    '''
    annotations = skill_extractor.annotate(job_description)
    list_words = annotations["text"].split(" ")
    hard_soft_skills = {'Hard Skill' : [],
                   'Soft Skill' : [],
                   'Certification': []}
    for type_matching, arr_skills in annotations["results"].items():
        for skill in arr_skills:
            # index word start and end
            start = skill["doc_node_id"][0]
            end = skill["doc_node_id"][-1]

             # meta data
            skill_id = skill["skill_id"]
#             skill_name = SKILL_DB[skill["skill_id"]]["skill_name"]
            skill_name = skill["doc_node_value"]
            skill_type = SKILL_DB[skill["skill_id"]]["skill_type"]
            hard_soft_skills[skill_type].append(skill_name)
            
    # extract unique hard and soft skills
    hard_soft_skills['Hard Skill'], hard_soft_skills['Soft Skill'] = list(set(hard_soft_skills['Hard Skill'])), list(set(hard_soft_skills['Soft Skill']))
    return hard_soft_skills

In [232]:
extract_hard_soft_skills(clean_jd)

{'Hard Skill': ['python',
  'tool platform',
  'computer science',
  'jupyter',
  'calculus',
  'agile process',
  'programming language',
  'restful apis',
  'linear algebra',
  'opencv',
  'data structure',
  'library',
  'tool',
  'machine learning',
  'probability statistic',
  'docker',
  'matplotlib',
  'flask',
  'django',
  'numpy',
  'architecture design',
  'tensorflow',
  'pytorch',
  'cloud platform',
  'spacy'],
 'Soft Skill': ['team management'],
 'Certification': []}