## Fetch resumes from JobSpider searches
We query several keywords, each corresponding to a resume label (an industry domain),
and fetch the matching resumes into `../resumes/<label_name>`.

In [11]:
import itertools
import numpy as np
import os

from resumes.resume import Resume
from resumes.resume_fetcher import JobSpiderResumeFetcher

# Resume labels (industry domains): each one is passed to JobSpiderResumeFetcher
# and names a subdirectory of ../resumes/ that later cells read from.
labels = ['software', 'sales', 'nurse']

def fetch():
    """Fetch resumes for every entry of the module-level ``labels`` list.

    For each label a ``JobSpiderResumeFetcher`` is built with the label and
    ``'..'`` as arguments; ``fetch_resumes_page(1)`` is called on it, followed
    by ``fetch_resume_files()``.
    """
    for domain in labels:
        spider = JobSpiderResumeFetcher(domain, '..')
        spider.fetch_resumes_page(1)
        spider.fetch_resume_files()

In [6]:
# Run the download for every entry in `labels`.
fetch()

downloaded resume page: 1
resume urls retrieved
connecting with http://www.jobspider.com/job/view-resume-82320.html
html http://www.jobspider.com/job/view-resume-82320.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82318.html
html http://www.jobspider.com/job/view-resume-82318.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82317.html
html http://www.jobspider.com/job/view-resume-82317.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82316.html
html http://www.jobspider.com/job/view-resume-82316.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82305.html
html http://www.jobspider.com/job/view-resume-82305.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82302.html
html http://www.jobspider.com/job/view-resume-82302.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82300.html
html http://www.jobspider.com/job/view-resum

html http://www.jobspider.com/job/view-resume-82305.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82304.html
html http://www.jobspider.com/job/view-resume-82304.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82303.html
html http://www.jobspider.com/job/view-resume-82303.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82293.html
html http://www.jobspider.com/job/view-resume-82293.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82289.html
html http://www.jobspider.com/job/view-resume-82289.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82288.html
html http://www.jobspider.com/job/view-resume-82288.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82286.html
html http://www.jobspider.com/job/view-resume-82286.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-82285.html
html http://www.jobs

html http://www.jobspider.com/job/view-resume-81654.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81647.html
html http://www.jobspider.com/job/view-resume-81647.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81626.html
html http://www.jobspider.com/job/view-resume-81626.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81483.html
html http://www.jobspider.com/job/view-resume-81483.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81443.html
html http://www.jobspider.com/job/view-resume-81443.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81427.html
html http://www.jobspider.com/job/view-resume-81427.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81384.html
html http://www.jobspider.com/job/view-resume-81384.html downloaded, OK
connecting with http://www.jobspider.com/job/view-resume-81357.html
html http://www.jobs

### Ensure the resumes of different labels form disjoint sets (duplicates are deleted)

In [20]:
# File names already seen under a previously visited label.
resume_set = set()

for label in labels:
    label_dir = '../resumes/' + label
    for filename in os.listdir(label_dir):
        if filename not in resume_set:
            # First occurrence: remember it and keep the file.
            resume_set.add(filename)
            continue
        # Same file name already belongs to an earlier label: report it and
        # delete this copy so each resume carries exactly one label.
        print("ERROR: " + filename + " already labeled")
        os.remove(label_dir + '/' + filename)

## Parse

In [25]:
from resumes.job_spider_resume_parser import JobSpiderHTMLResumeParser

def parse_resumes(label):
    """Parse every resume file stored under ``../resumes/<label>``.

    Args:
        label: Label name; also the subdirectory of ``../resumes`` whose
            files are parsed.

    Returns:
        A list with one ``Resume`` per file found in the directory. Each is
        built from the parser's field map (with ``'id'`` set to the file
        name) and tagged with ``label`` through ``set_label``. The list is
        empty when the directory contains no files.
    """
    label_dir = os.path.join('..', 'resumes', label)
    resumes = []
    # Use os.listdir explicitly: the notebook imports the os module only, so a
    # bare listdir() raises NameError on a fresh kernel.
    for fname in os.listdir(label_dir):
        # One parser instance per file.
        parser = JobSpiderHTMLResumeParser()
        print(fname)
        # The context manager closes the file handle even if parsing raises.
        with open(os.path.join(label_dir, fname)) as f:
            parser.feed(f.read())

        resume_field_map = parser.get_resume()
        # The file name doubles as the resume identifier.
        resume_field_map['id'] = fname
        resume = Resume(resume_field_map)
        resume.set_label(label)
        resumes.append(resume)

    return resumes

In [26]:
# Gather the parsed resumes of all labels into one flat list.
resumes = []
for label in labels:
    resumes += parse_resumes(label)

view-resume-82047.html
view-resume-82194.html
view-resume-82239.html
view-resume-82227.html
view-resume-82278.html
view-resume-82288.html
view-resume-82316.html
view-resume-82126.html
view-resume-82090.html
view-resume-82131.html
view-resume-82067.html
view-resume-82302.html
view-resume-82212.html
view-resume-82068.html
view-resume-82161.html
view-resume-82250.html
view-resume-82072.html
view-resume-82240.html
view-resume-82065.html
view-resume-82071.html
view-resume-82116.html
view-resume-82320.html
view-resume-82291.html
view-resume-82246.html
view-resume-82237.html
view-resume-82145.html
view-resume-82121.html
view-resume-82069.html
view-resume-82305.html
view-resume-82175.html
view-resume-82099.html
view-resume-82317.html
view-resume-82066.html
view-resume-82318.html
view-resume-82300.html
view-resume-82117.html
view-resume-82289.html
view-resume-82271.html
view-resume-82078.html
view-resume-82149.html
view-resume-82244.html
view-resume-82238.html
view-resume-82269.html
view-resume

## Get embeddings

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-transformers model identified by 'bert-base-nli-mean-tokens'.
# NOTE(review): presumably the weights are downloaded on first use — confirm.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
for resume in resumes:
    