## Parse resumes and vacancies

In [6]:
import itertools
import numpy as np
from os import listdir

import metrics
import resumes.job_spider_resume_parser as sp
from resumes.job_spider_resume_parser import JobSpiderHTMLResumeParser
from resumes.resume import Resume
from resumes.resume_fetcher import JobSpiderResumeFetcher
from vacancies.job_spider_vacancy_parser import JobSpiderHTMLVacancyParser
from vacancies.vacancy import Vacancy
from vacancies.vacancy_fetcher import JobSpiderVacancyFetcher

from scipy.spatial.distance import cosine


def parse_resumes(n):
    resumes = []
    observed_resumes = listdir('../resumes')[:n]
    for fname in observed_resumes:
        parser = JobSpiderHTMLResumeParser()
        print(fname)
        f = open('../resumes/' + fname)
        parser.feed(f.read())

        resume_field_map = parser.get_resume()
        resume_field_map['id'] = fname
        resumes.append(Resume(resume_field_map))

    return resumes

def parse_vacancies(n):
    vacancies = []
    observed_vacancies = listdir('../vacancies')[:n]
    for fname in observed_vacancies:
        parser = JobSpiderHTMLVacancyParser()
        f = open('../vacancies/' + fname)
        parser.feed(f.read())
        print(fname)

        fmap, body = parser.get_vacancy()
        vacancies.append(Vacancy(fmap, body, fname))
    return vacancies


In [2]:
resumes = parse_resumes(10)
vacancies = parse_vacancies(10)

view-resume-66068.html
view-resume-70761.html
view-resume-53808.html
view-resume-56956.html
view-resume-74436.html
view-resume-57598.html
view-resume-78042.html
view-resume-54520.html
view-resume-78281.html
view-resume-50275.html
view-job-8956466.html
view-job-9108701.html
view-job-9101821.html
view-job-8942049.html
view-job-9071950.html
view-job-9018520.html
view-job-9036906.html
view-job-9101890.html
view-job-8932615.html
view-job-8916486.html


In [8]:
# for r in resumes:
#     print(r.experience)
#     print()    
#     print()

for v in vacancies:
    print(v.body)
    print()    
    print()

['SpiderID: 8956466', 'Number Of Openings: 1', 'Job Description:', 'Technical Hiring Criteria (Must Haves)', 'Mulesoft', 'Java', 'Enterprise Application Integration', 'API Management', 'Minimum years of experience*: 5+', 'Top 3 responsibilities you would expect the Subcon to shoulder and execute*:', 'Provide primary support for Oracles E1 JDE basic HR functions and Finance modules including AP, AR, FA Contract Billing, Advanced Pricing, Invoicing, statements and Regulatory processes. Also would know integrations with Kronos, Archivist, Batesville Online Ordering, Corporate functions, Regulatory, Pricing, SOESOP, Daptiv, Tidal and will have hands on experience', 'Provide primary support for JDE development, SQL .NET development', 'Should provide support in Audit processes', 'Interview Process (Is face to face required?) No', 'Job Criteria:', 'Start Date:', 'Position Type: Contractor', 'Years of Experience Required:', 'Education Required:', 'Overnight Travel:', 'Vacation Time:', 'Contact

## BERT Model

In [9]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM


cls = "[CLS]"
sep = "[SEP]"

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [44]:
class BertEmbeddingsService:
    def __init__(self):
        self.model = bert_model
        self.tokenizer = bert_tokenizer
    
    def create_embeddings(self, text):
        marked_text = cls + " " + text + " " + sep
        tokenized_text = self.tokenizer.tokenize(marked_text)
        #print(tokenized_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)

        tokens_vecs = encoded_layers[11][0]
        sentence_embedding = torch.mean(tokens_vecs, dim=0)
        return sentence_embedding
    
    def embed_sentences(self, sentences):
        embeddings = [self.create_embeddings(sentence).tolist() for sentence in sentences]
        embeddings = np.array(embeddings)
        return embeddings.mean(0)

    @staticmethod
    def mark(text):
        marked_text = cls
        for sent in text:
            marked_text = marked_text + " " + sent + " " + sep
        return marked_text

In [65]:
model = BertEmbeddingsService()

resume_embeddings_dict = {}
vacancy_embeddings_dict = {}

for resume in resumes:
    embedding = model.embed_sentences(resume.experience)
    resume_embeddings_dict[resume] = embedding

for vacancy in vacancies:
    embedding = model.embed_sentences(vacancy.body)
    vacancy_embeddings_dict[vacancy] = embedding

  ret = ret.dtype.type(ret / rcount)


In [66]:
similarity = {}
for resume, vacancy in itertools.product(resumes, vacancies):
    similarity[(resume, vacancy)] = cosine(resume_embeddings_dict[resume], vacancy_embeddings_dict[vacancy])

In [68]:
for k in similarity:
    print(k[0].id, end=' ')
    print(k[1].id)
    print(similarity[k])

view-resume-66068.html view-job-8956466.html
0.07029148547730368
view-resume-66068.html view-job-9108701.html
0.06366976381558287
view-resume-66068.html view-job-9101821.html
0.09201055326090213
view-resume-66068.html view-job-8942049.html
0.08976254003837547
view-resume-66068.html view-job-9071950.html
0.1574746810526504
view-resume-66068.html view-job-9018520.html
0.10208784682583294
view-resume-66068.html view-job-9036906.html
0.1520569296761516
view-resume-66068.html view-job-9101890.html
0.06558070538832217
view-resume-66068.html view-job-8932615.html
0.07391798949817363
view-resume-66068.html view-job-8916486.html
0.0796641651932054
view-resume-70761.html view-job-8956466.html
0.21984121828522452
view-resume-70761.html view-job-9108701.html
0.1656439557642344
view-resume-70761.html view-job-9101821.html
0.14714860206105396
view-resume-70761.html view-job-8942049.html
0.11956837887411276
view-resume-70761.html view-job-9071950.html
0.22572245296004578
view-resume-70761.html view-j

In [77]:
sim_list = [(k, similarity[k]) for k in similarity]
sim_list.sort(key=lambda x: x[1])

In [79]:
for x in sim_list:
    print(x[0][0].id, end=' ')    
    print(x[0][1].id)
    print(x[1])
    print()

view-resume-53808.html view-job-8916486.html
0.058536989014550844

view-resume-53808.html view-job-8942049.html
0.05879943623924433

view-resume-66068.html view-job-9108701.html
0.06366976381558287

view-resume-66068.html view-job-9101890.html
0.06558070538832217

view-resume-53808.html view-job-8932615.html
0.06916868840923329

view-resume-66068.html view-job-8956466.html
0.07029148547730368

view-resume-53808.html view-job-9101890.html
0.07246285759651139

view-resume-53808.html view-job-9101821.html
0.0735648179752929

view-resume-66068.html view-job-8932615.html
0.07391798949817363

view-resume-53808.html view-job-9108701.html
0.07758214107372774

view-resume-66068.html view-job-8916486.html
0.0796641651932054

view-resume-56956.html view-job-8956466.html
0.08008400621858047

view-resume-56956.html view-job-9101890.html
0.08021489058277731

view-resume-56956.html view-job-9108701.html
0.08486158788422438

view-resume-56956.html view-job-8932615.html
0.08847875129030802

view-resume

## SBERT Model

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [01:02<00:00, 6.51MB/s] 


In [56]:
sentences = ['I have been studying Java for 8 years',
             'We need Java developers']
sentence_embeddings = model.encode(sentences)

In [9]:
sentence_embeddings

[array([ 3.86858940e-01,  3.83189917e-01,  1.57071292e+00, -4.41906720e-01,
         2.67183632e-01,  1.07994890e+00,  3.11838269e-01, -8.90774250e-01,
        -4.30528432e-01, -4.67814714e-01, -9.90933836e-01,  1.12252533e-02,
         2.08902657e-01,  4.34965998e-01,  2.03780219e-01,  2.48768076e-01,
        -4.43256140e-01, -2.60198325e-01,  5.35881341e-01, -5.41760027e-01,
        -7.40462065e-01, -9.25864205e-02, -1.01769125e+00, -5.84856391e-01,
         1.65369287e-02,  1.05871701e+00,  7.12531567e-01,  5.57906330e-01,
        -1.15791023e+00,  4.40011412e-04,  4.19917017e-01, -1.32324362e+00,
         4.67390507e-01,  3.97481412e-01,  2.33892798e-02,  2.03694720e-02,
        -4.29937929e-01, -2.70753890e-01,  1.65901288e-01, -3.07864815e-01,
         2.42891267e-01,  8.08022976e-01,  5.15642226e-01,  7.98140466e-01,
        -6.59407556e-01, -1.55604884e-01, -5.93861878e-01,  7.85587311e-01,
         3.93876761e-01, -1.26261604e+00, -9.16928276e-02, -8.30599725e-01,
        -1.6

In [57]:
1 - cosine(sentence_embeddings[0], sentence_embeddings[1])

0.276216596364975

In [28]:
tensors = [torch.tensor([x]) for x in [[1, 2], [3, 4]]]
tentensors = torch.cat(tensors)
tentensors

tensor([[1, 2],
        [3, 4]])