## Fetch resumes from JobSpider search results
We query a different keyword for each resume label (industry domain)
and save the fetched resumes into `./resumes/<label_name>`.

In [70]:
import itertools
import numpy as np
import os

from resumes.resume import Resume
from resumes.resume_fetcher import JobSpiderResumeFetcher

# Resume labels. Each one is passed to the fetcher in fetch() and is also the
# name of the sub-directory under ../resumes/ that holds its resumes.
labels = ['software', 'sales', 'nurse']

# Label name -> integer index (the label's position in `labels`); used as the
# column index when counting cluster/label co-occurrences.
label_dict = {name: index for index, name in enumerate(labels)}

def fetch():
    """Fetch result pages 1 through 9 for every entry in ``labels``.

    A banner naming the label is printed first. Then, for each page number, a
    new ``JobSpiderResumeFetcher(label, '..')`` is created and asked to fetch
    that results page followed by the resume files.

    NOTE(review): the meaning of the ``'..'`` argument is not visible here --
    presumably the base directory for downloads; confirm in
    ``resumes.resume_fetcher``.
    """
    banner = "======================"
    for label in labels:
        # Same four output lines as separate print() calls: banner, label,
        # banner, empty line.
        print(banner, "LABEL: " + label, banner, "", sep="\n")
        for page in range(1, 10):
            # A fresh fetcher per page, so no fetcher state carries over
            # between pages.
            page_fetcher = JobSpiderResumeFetcher(label, '..')
            page_fetcher.fetch_resumes_page(page)
            page_fetcher.fetch_resume_files()

In [None]:
# Run the fetch for every label. fetch() has no return statement, so there is
# nothing to capture here.
fetch()

### Ensure that the resume sets of different labels are disjoint

In [32]:
# Make the per-label directories disjoint: the first label directory in which a
# file name is seen keeps the file; any later directory containing the same
# file name has its copy reported and deleted.
resume_set = set()

for label in labels:
    label_dir = '../resumes/' + label
    for filename in os.listdir(label_dir):
        if filename not in resume_set:
            resume_set.add(filename)
            continue
        # Duplicate of a file already seen under an earlier label.
        print("ERROR: " + filename + " already labeled")
        os.remove(label_dir + '/' + filename)

ERROR: view-resume-81265.html already labeled
ERROR: view-resume-81848.html already labeled
ERROR: view-resume-82194.html already labeled
ERROR: view-resume-82227.html already labeled
ERROR: view-resume-80828.html already labeled
ERROR: view-resume-80855.html already labeled
ERROR: view-resume-82288.html already labeled
ERROR: view-resume-81853.html already labeled
ERROR: view-resume-80869.html already labeled
ERROR: view-resume-82316.html already labeled
ERROR: view-resume-80790.html already labeled
ERROR: view-resume-81056.html already labeled
ERROR: view-resume-81400.html already labeled
ERROR: view-resume-80850.html already labeled
ERROR: view-resume-80867.html already labeled
ERROR: view-resume-82067.html already labeled
ERROR: view-resume-81481.html already labeled
ERROR: view-resume-80971.html already labeled
ERROR: view-resume-81047.html already labeled
ERROR: view-resume-80793.html already labeled
ERROR: view-resume-81107.html already labeled
ERROR: view-resume-81919.html alre

## Parse

In [32]:
from resumes.job_spider_resume_parser import JobSpiderHTMLResumeParser

def parse_resumes(label, n):
    """Parse up to ``n`` resume files stored under ``../resumes/<label>``.

    Args:
        label: Name of the label sub-directory to read. It is also set as the
            label of every resume that is returned.
        n: Maximum number of files to parse. Files are taken from the front of
            the ``os.listdir`` result, whose order is arbitrary.

    Returns:
        A list of ``Resume`` objects, one per parsed file, each with its
        ``id`` field set to the file name.
    """
    label_dir = '../resumes/' + label
    resumes = []
    for fname in os.listdir(label_dir)[:n]:
        parser = JobSpiderHTMLResumeParser()
        print(fname)
        # The context manager closes the file even when reading fails; the
        # previous bare open() leaked one handle per resume.
        with open(label_dir + '/' + fname) as f:
            html = f.read()
        parser.feed(html)

        resume_field_map = parser.get_resume()
        resume_field_map['id'] = fname
        resume = Resume(resume_field_map)
        resume.set_label(label)
        resumes.append(resume)

    return resumes

In [None]:
resumes = []
for label in labels:
    resumes.extend(parse_resumes(label, 1000))

## Get embeddings

In [34]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name; `model.encode` is used further
# down to embed each resume's experience.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [100]:
def get_safe(vec, i):
    """Return ``vec[i]``, or ``0`` when ``i`` is at or past the end of ``vec``.

    Only the upper bound is checked: a negative ``i`` is passed straight to
    ``vec[i]`` and therefore indexes from the end (or raises ``IndexError``
    when it is too negative).
    """
    return vec[i] if i < len(vec) else 0

def mean(vectors):
    """Return the component-wise mean of ``vectors`` as a list.

    The result has as many components as the FIRST vector. Vectors shorter
    than that are treated as zero-padded, and components beyond that length in
    longer vectors are ignored. Every component is divided by the total number
    of vectors, including the padded ones.

    Args:
        vectors: Non-empty sequence of indexable, sized vectors.

    Returns:
        A list with ``len(vectors[0])`` averaged components.

    Raises:
        IndexError: If ``vectors`` is empty (there is no first vector).
    """
    dim = len(vectors[0])
    count = len(vectors)
    # zip_longest walks the vectors column by column and pads short vectors
    # with 0; islice stops after the first vector's length.
    columns = itertools.islice(
        itertools.zip_longest(*vectors, fillvalue=0), dim
    )
    return [sum(column) / count for column in columns]

In [20]:
# Sanity check of mean() on two 2-element vectors.
mean([[1, 4], [6, 7]])

[3.5, 5.5]

In [None]:
# Map resume.id -> embedding. A resume with no experience entries gets the
# integer 0 as a "no embedding" marker; every other resume gets the mean() of
# the vectors that model.encode returns for its experience.
embeddings_map = {}

for i, resume in enumerate(resumes, start=1):
    experience = resume.experience
    # Progress output: running counter, wrapped after every 30 resumes.
    print(i, end=' ')
    if i % 30 == 0:
        print()
    if len(experience) == 0:
        embeddings_map[resume.id] = 0
        continue
    embeddings_map[resume.id] = mean(model.encode(experience))

In [None]:
# Keep only resumes that received a real embedding. Resumes without experience
# are marked in embeddings_map with the integer 0, while real embeddings are
# lists, so a type check tells them apart. The previous `is not 0` compared
# object identity with a literal, which only works because CPython caches small
# ints, and raises a SyntaxWarning on Python 3.8+.
resumes = [r for r in resumes if not isinstance(embeddings_map[r.id], int)]
len(resumes)

## Clustering with k-means

In [104]:
from sklearn.cluster import KMeans

# Toy 2-D points forming three separate groups.
# NOTE(review): X is not referenced by any other visible cell -- it looks like
# a leftover from an example; confirm before removing.
X = [[1, 1], [0, 1], [1, 0],
     [10, 9], [9, 9], [9, 10],
     [0, 8], [1, 9], [1, 8]]

# One embedding per resume, in the same order as `resumes`.
embeddings_matrix = [embeddings_map[r.id] for r in resumes]

In [105]:
# Fit k-means with 3 clusters (the same number as entries in `labels`) on the
# resume embeddings. random_state=0 presumably pins the initialisation so that
# reruns give the same clusters.
kmeans = KMeans(n_clusters=3, random_state=0).fit(embeddings_matrix)
# Cell output: the fitted cluster centers.
kmeans.cluster_centers_

array([[-0.08374952,  0.36573888,  1.15462919, ..., -0.10997868,
        -0.08396376,  0.16283009],
       [-0.01544761,  0.08804066,  1.47601754, ..., -0.15006935,
        -0.24182051,  0.0100128 ],
       [-0.20199351,  0.38016101,  1.11594   , ..., -0.38860819,
        -0.40620174, -0.0450025 ]])

In [107]:
def cluster(resume):
    """Return ``(label index, predicted cluster)`` for one resume.

    The label index is ``label_dict[resume.label]``; the cluster is the
    fitted ``kmeans`` model's prediction for the resume's stored embedding.
    One empty line is printed per call.
    """
    embedding = embeddings_map[resume.id]
    # predict() is given a one-element batch; take its single answer back out.
    predicted_cluster = kmeans.predict([embedding])[0]
    print()
    true_label = label_dict[resume.label]
    return true_label, predicted_cluster


In [None]:
# Count how often each (cluster, label) pair occurs: rows are predicted
# clusters, columns are label indices from label_dict. Despite its name,
# `fscore` holds raw counts, not an F-score; the name is kept because a later
# cell prints it.
fscore = [[0] * 3 for _ in range(3)]
for resume in resumes:
    expected, actual = cluster(resume)
    # Predicted cluster first, then the label index, each on its own line.
    print(actual, expected, sep='\n')
    fscore[actual][expected] += 1

### Understanding the output
Here we have a "confusion" matrix. Each row of the matrix represents a cluster produced by the k-means algorithm.
Each column represents a true resume label (see `label_dict`). Cell `[i, j]` contains the number of resumes with label `j` that were assigned to cluster `i`. Intuitively, we **expect** each cluster to overlap mostly with a single label.

In [109]:
# Print the count matrix, one predicted cluster (row) per line.
for row in fscore:
    print(row)

[10, 11, 176]
[36, 43, 184]
[395, 265, 42]


As we can see, only the `nurse`-labeled resumes are clustered separately from the other labels. The cluster that contains most of the `software`-labeled resumes also captures most of the `sales`-labeled ones.