Inspired by https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [119]:
import sys
import os

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path+"/scripts")

In [120]:
import pandas as pd
import pickle

with open("../data/all_reviews.pkl","rb") as f:
    reviews = pickle.load(f)
    

In [121]:
with open("../data/job_titles.txt","r") as f:
    common_tech_titles = [ l.replace("\n","").strip() for l in f ]

import string
import re
from DataPrep import cleanTitle

common_tech_titles = list(map(cleanTitle,common_tech_titles))

print(common_tech_titles[:5])

['cloud architect', 'cloud consultant', 'cloud product project manager', 'cloud services developer', 'cloud software network engineer']


## N-Gram with Top N Cosine

In [145]:
# clean_job_titles = reviews.clean_job_title.tolist()
clean_job_titles_filtered = reviews.clean_job_title.value_counts()
clean_job_titles_filtered = clean_job_titles_filtered.loc[clean_job_titles_filtered > 1].index.tolist()

clean_job_titles_filtered.extend(common_tech_titles)

print(len(clean_job_titles_filtered))

107280


In [146]:
import re

def ngrams(string, n=3):
    #string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in {}:'.format(clean_job_titles_filtered[0]))
print( ngrams(clean_job_titles_filtered[0]) )

All 3-grams in sales associate:
['sal', 'ale', 'les', 'es ', 's a', ' as', 'ass', 'sso', 'soc', 'oci', 'cia', 'iat', 'ate']


In [181]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(clean_job_titles_filtered)

print(len(vectorizer.get_feature_names()))

9122


In [182]:
print(clean_job_titles_filtered[0])
print( clean_jobs_tf_idf_matrix[0] )

sales associate
  (0, 284)	0.33873727514737445
  (0, 281)	0.33873727514737445
  (0, 267)	0.2498248993008131
  (0, 210)	0.33873727514737445
  (0, 138)	0.33873727514737445
  (0, 110)	0.33873727514737445
  (0, 61)	0.19573351532839325
  (0, 44)	0.33873727514737445
  (0, 41)	0.30996019042233774
  (0, 3)	0.33873727514737445


In [151]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [183]:
# import time
# t1 = time.time()
# matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.7)
# t = time.time()-t1
# print("SELFTIMED:", t)

SELFTIMED: 90.5202579498291


In [200]:
from sparse_dot_topn import awesome_cossim_topn
import time

t1 = time.time()

matches = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.7, use_threads=True, n_jobs=11)

t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 34.36682605743408


In [185]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [201]:
matches_df = get_matches_df(matches, clean_job_titles_filtered, top=False)
# matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similairity
378802,computers mobile sales lead,mobile sales lead,0.726617
404347,dsl technical support specialist,technical support specialist,0.786849
188238,quality operations manager,quality operator,0.765791
538750,manager aml compliance,compliance aml analyst,0.770264
78230,teller associate,bank teller associate,0.730815
352465,programmer analyst supervisor,programmer analyst,0.827338
221605,special services desk supervisor,supervisor special services,0.739409
86958,summer camp counselor,summer camp counselor,1.0
335574,starbucks barista shift supervisor,starbucks shift supervisor,0.841543
341456,international clerk,international,0.784819


In [187]:
matches_df.sort_values(['similairity'], ascending=True).head(10)

Unnamed: 0,left_side,right_side,similairity
71935,certified medical coder,lead certified medical assistant,0.7
497538,nursery assistant,nurses assistant,0.700001
104188,nurses assistant,nursery assistant,0.700001
595444,team leader collector,collections team lead,0.700001
486203,specialties customer service,special services customer service specialist,0.700001
248480,practice administrator managing partner,practice administrator,0.700002
37129,practice administrator,practice administrator managing partner,0.700002
479340,commission sales associate shoe department,sales shoe department,0.700002
370968,marketing merchandise team leader,manager merchandise marketing,0.700003
71392,support manger,overnight support manger,0.700004


In [215]:
filtered_jobs = matches_df.loc[matches_df.left_side.isin(common_tech_titles),["left_side","right_side"]]
print(filtered_jobs.shape)

pd.set_option("display.max_colwidth",500)
display(filtered_jobs.groupby("left_side")["right_side"].apply(list).reset_index())
pd.reset_option("display.max_colwidth")

jobs = filtered_jobs.groupby("left_side")["right_side"].apply(list).reset_index().right_side.tolist()


(1361, 2)


Unnamed: 0,left_side,right_side
0,application developer,"[application developer, application developer, application developer sap, application developer lead, application developer associate, application developer analyst, application developer specialist, net application developer, lead application developer, rational application developer, application developer, application developer, application developer sap, application developer lead, application developer associate, application developer analyst, application developer specialist, net applic..."
1,application support analyst,"[application support analyst, application support analyst, applications support analyst, lead application support analyst, technical application support analyst, clinical application support analyst, application support, business application support analyst, application production support analyst, production application support analyst, application support analyst, application support analyst, applications support analyst, lead application support analyst, technical application support analy..."
2,applications engineer,"[applications engineer, applications engineer, applications engineer, applications engineer sales engineering, applications engineer intern, application operations engineer, application sales engineer, application engineer, technical applications engineer, applications engineering intern, applications engineer, applications engineer, applications engineer, applications engineer sales engineering, applications engineer intern, application operations engineer, application sales engineer, appli..."
3,associate developer,"[associate developer, associate developer, developer associate, developer developer, developer, developer, software associate developer, associate development manager, store developer, associate software developer, associate developer, associate developer, developer associate, developer developer, developer, developer, software associate developer, associate development manager, store developer, associate software developer]"
4,chief information officer,"[chief information officer, chief information officer, chief information office, chief information officer cio, chief information security officer, chief information systems, chief information officer, chief information officer, chief information office, chief information officer cio, chief information security officer, chief information systems]"
...,...,...
67,technology systems administrator,"[technology systems administrator, technology administrator, systems administrator, systems administrator, tier systems administrator, lead systems administrator, information technology system administrator, associate systems administrator, quality systems administrator, assistant systems administrator]"
68,telecommunications specialist,"[telecommunications specialist, telecommunications specialist, telecommunication specialist, telecommunications operations specialist, communications specialist, telecommunications, telecommunications sales, telecommunications specialist project manager, telecommunications manager, sales communications specialist, telecommunications specialist, telecommunications specialist, telecommunication specialist, telecommunications operations specialist, communications specialist, telecommunications,..."
69,web administrator,"[web administrator, web administrator, web administrator, lab administrator, web content administrator, web administrator, web administrator, web administrator, lab administrator, web content administrator, web administrator, web administrator, web administrator, lab administrator, web content administrator]"
70,web developer,"[web developer, web developer, web developer, web developer developer, web developer analyst, web developer intern, web application developer web developer, lead web developer, net web developer, associate web developer, web developer, web developer, web developer, web developer developer, web developer analyst, web developer intern, web application developer web developer, lead web developer, net web developer, associate web developer, web developer, web developer, web developer, web develo..."


In [156]:
sum(reviews.clean_job_title.isin(filtered_jobs.right_side.tolist()))

36059

## TF-IDF with K-Means

In [157]:

clean_job_titles = reviews.clean_job_title.unique().tolist()
print(len(clean_job_titles))

512896


In [158]:
import re

def ngrams(string, n=3):
    #string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in {}:'.format(common_tech_titles[0]))
print( ngrams(common_tech_titles[0]) )

All 3-grams in cloud architect:
['clo', 'lou', 'oud', 'ud ', 'd a', ' ar', 'arc', 'rch', 'chi', 'hit', 'ite', 'tec', 'ect']


In [159]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(common_tech_titles)

print(len(vectorizer.get_feature_names()))

340


In [160]:
from sklearn.neighbors import NearestNeighbors
import time

nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tf_idf_matrix)

In [161]:
def getNearestN(query):
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices

t1 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(clean_job_titles)
t = time.time()-t1
print("COMPLETED IN:", t)

getting nearest n...
COMPLETED IN: 7.6167519092559814


In [203]:
print('finding matches...')
kmeans_matches = []
for i,j in enumerate(indices):
    temp = [round(distances[i][0],2), common_tech_titles[j[0]],clean_job_titles[i]]
    kmeans_matches.append(temp)
print('Building data frame...')  
kmeans_matches = pd.DataFrame(kmeans_matches, columns=['Match confidence (lower is better)','left_side','right_side'])
print('Done')

finding matches...
Building data frame...
Done


In [214]:
idx = kmeans_matches["Match confidence (lower is better)"] < .4
kmeans_filtered_jobs = kmeans_matches.loc[idx,["left_side","right_side"]]
print(sum(idx))

pd.set_option("display.max_colwidth",700)
display( kmeans_filtered_jobs.groupby("left_side").right_side.apply(list).reset_index() )
pd.reset_option("display.max_colwidth")

1526


Unnamed: 0,left_side,right_side
0,application developer,"[application developer, sap abap application developer, mobile application developer, principal application developer, oracle application developer, application developer latam, application develope, application developer lead, lead application developer, application developer head, android application developer, ios application developer, mean stack application developer, application developer sql, android mobile application developer, ios mobile application developer, application developer mobile ios, application developer j2ee, tririga application developer, application developer sap, application developer ios, application developer qlikview cognos, application developer oracle apps, ..."
1,application support analyst,"[application support analyst, business application support analyst, hcm application support analyst, peachtree accounting application support analyst, unix application support analyst, mainframe application support analyst, application support analyst oracle apps, application support analyst team lead, application support business analyst, level application support analyst, application support analysis qnxt, tax support analyst cash applications, linux application support analyst, lead application support analyst, business analyst application support, analyst application support, application support analyst offshore lead, level banking application support analyst, system application supp..."
2,applications engineer,"[applications engineer, delcam applications engineer, staff field applications engineer, field applications engineer, applications engineer hvcb, applications engineer cad cam, applications tailoring engineer, foundry applications engineer, principal applications engineer, principle applications engineer, lighting applications engineer, oracle applications engineer, sas applications engineer, applications engineer cvp, sales applications engineer, lead applications engineer]"
3,associate developer,"[associate developer, associate developer dot net, associate oracle sql developer, associate etl developer, associate sql etl developer, associate sql developer, principal associate developer, associate full stack developer, associate developer full time]"
4,chief information officer,"[chief information officer, interim chief information officer, chief information office, chief information officer principal, chief information officer cio, chief information office cio, chief information officer west region, regional chief information officer]"
...,...,...
59,technology support specialist,"[technology support specialist, technology support sales specialist, course technology support specialist, catlab technology support specialist, technology support specialist tier2, technology support specialist cashier, technology support billing specialist, android technology support specialist, billing technology support specialist, lms technology support specialist, verse technology support specialist, infra technology support specialist, technology support specialist level, technology support specialist sme, technology support specialist tss, kindle technology support specialist where, sprint technology support specialist, field technology support specialist, remote technology suppo..."
60,telecommunications specialist,"[telecommunications specialist, lead telecommunications specialist, telecommunications repair specialist, telecommunications account specialist, military telecommunications specialist, analyst telecommunications specialist, telecommunications systems specialist, telecommunications specialist csr]"
61,web administrator,"[web administrator, marketing web administrator, training advisor•web administrator, web hosting administrator]"
62,web developer,"[web developer, lead web developer, full stack web developer, web developer lead web developer, php web developer, ctl web developer, web developer oracle, web developer html5 developer, drupal web developer, angular web developer, mobile web developer, web developer fullstack, role web developer, global web developer, web developer sme, oracle sql web developer, j2ee web developer, staff web developer ebt, eftps web developer, sfdc web developer, web developer developer]"


In [206]:
sum(reviews.clean_job_title.isin(kmeans_filtered_jobs.right_side.tolist()))

36406

Merge both datsets

In [225]:

finalized_job_list = pd.concat([filtered_jobs, kmeans_filtered_jobs]).drop_duplicates()

finalized_job_list.shape

(1942, 2)

In [227]:
sum(reviews.clean_job_title.isin(finalized_job_list.right_side.tolist()))

43645

In [230]:
finalized_job_list.head()

finalized_job_list.to_csv("../data/finalized_job_list.csv", index=False)

Remove Jobs...

In [233]:
filter_titles = finalized_job_list.right_side.tolist()

exclude_keywords = [
    "camp"
    ,"tutor"
    ,"professor"
    ,"entry"
    ,"entries"
    ,"teacher"
    ,"audio"
    ,"civil"
    ,"facility"
    ,"facilities"
    ,"industrial"
    ,"metals"  
    ,"lab"
]

def excludeJobTItles(job):
    tokens = str(job).split(" ")
    return len( [ word for word in tokens if word in exclude_keywords ] ) == 0

filter_titles_2 = list( filter(excludeJobTItles, filter_titles) )

In [234]:
sum(reviews.clean_job_title.isin(filter_titles_2))

43632