Inspired by https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [101]:
import pandas as pd
import pickle

with open("../data/all_reviews.pkl","rb") as f:
    reviews = pickle.load(f)
    

In [102]:
with open("../data/job_titles.txt","r") as f:
    common_tech_titles = [ l.replace("\n","").strip() for l in f ]

import string
import re

replacement_words = {
    "it":"technology"
    ,"sr":"senior"
    ,"qa":"quality"
    ,"sr": "Senior"
    ,"jr": "Junior"
}

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
def cleanTitle(title):
    title = str(title).translate(translator)
    title = re.sub(' +', ' ',title)
    title = title.lower()
    title_split = title.split(" ")
    for key, value in replacement_words.items():
        title_split = [ value if key == word else word for word in title_split ]
    return " ".join(title_split)    

common_tech_titles = list(map(cleanTitle,common_tech_titles))

print(common_tech_titles[:5])

['cloud architect', 'cloud consultant', 'cloud product and project manager', 'cloud services developer', 'cloud software and network engineer']


In [127]:
clean_job_titles = reviews.clean_job_title.value_counts()
clean_job_titles = clean_job_titles.loc[clean_job_titles > 2].index.tolist()

clean_job_titles.extend(common_tech_titles)

print(len(clean_job_titles))

55246


In [128]:
import re

def ngrams(string, n=3):
    #string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in {}:'.format(clean_job_titles[0]))
print( ngrams(clean_job_titles[0]) )

All 3-grams in sales associate:
['sal', 'ale', 'les', 'es ', 's a', ' as', 'ass', 'sso', 'soc', 'oci', 'cia', 'iat', 'ate']


In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(clean_job_titles)
pred = vectorizer.fit(common_tech_titles)

print(len(vectorizer.get_feature_names()))

350


In [130]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [143]:
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.5)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 18.85239601135254


In [144]:
non_zeros = matches.nonzero()

sparserows = non_zeros[0]
sparsecols = non_zeros[1]
print(non_zeros[0].size)

517015


In [145]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [146]:
matches_df = get_matches_df(matches, clean_job_titles, top=False)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similairity
152736,cloud specialist,cloud administrator,0.580774
388580,tech support i,tech support associate,0.787945
332319,cargo agent,cargo handler,0.559046
121165,interim supervisor,material supervisor,0.533573
347550,night stockperson,stockperson,0.83193
128195,apparel cashier,cashier apparel associate,0.738204
155288,field service specialist,field specialist,0.861084
355253,mutual fund administrator,mutual fund representative,0.661279
102708,inductor,conductor,0.704738
365898,delivery driver part time,driver part time,0.799097


In [147]:
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
189167,senior game advisor keyholder,senior game advisor keyholder,0.99598
442503,senior game advisor keyholder,senior game advisor keyholder,0.99598
371709,software engineer java developer,software engineer java developer,0.99581
424145,software engineer java developer,software engineer java developer,0.99581
114295,técnico de infraestrutura junior,técnico de infraestrutura junior,0.995725
317790,técnico de infraestrutura junior,técnico de infraestrutura junior,0.995725
356053,customer care representative call center,customer care representative call center,0.995032
129939,customer care representative call center,customer care representative call center,0.995032
211572,executive team leader assistant store manager,executive team leader assistant store manager,0.994969
454397,executive team leader assistant store manager,executive team leader assistant store manager,0.994969


In [152]:
filtered_jobs = matches_df.loc[matches_df.left_side.isin(common_tech_titles),:]
print(filtered_jobs.shape)

pd.set_option("display.max_colwidth",500)
display(filtered_jobs.groupby("left_side")["right_side"].apply(list).reset_index())
pd.reset_option("display.max_colwidth")

(1248, 3)


Unnamed: 0,left_side,right_side
0,net developer,"[net developer, c net developer, lead net developer, senior net developer, junior net developer, dot net developer, vb net developer, asp net developer, net developer, c net developer, lead net developer, senior net developer, junior net developer, dot net developer, vb net developer, asp net developer]"
1,application developer,"[application developer ii, application developer iv, application developer lead, application developer associate, application developer analyst, senior application developer, application developer specialist, application developer intern, application developer ii, application developer iv, application developer lead, application developer associate, application developer analyst, senior application developer, application developer specialist, application developer intern]"
2,application support analyst,"[application support analyst ii, applications support analyst, senior application support analyst, application support senior analyst, application support, clinical application support analyst, application production support analyst, application support associate, application support analyst ii, applications support analyst, senior application support analyst, application support senior analyst, application support, clinical application support analyst, application production support analyst..."
3,applications engineer,"[senior applications engineer, senior applications engineer, application engineer, applications systems engineer, field applications engineer, software applications engineer, applications support engineer, applications resident engineer, senior applications engineer, senior applications engineer, application engineer, applications systems engineer, field applications engineer, software applications engineer, applications support engineer, applications resident engineer]"
4,associate developer,"[senior associate developer, developer associate, developer, developer, associate software developer, associate development manager, store developer, associate product developer, senior associate developer, developer associate, developer, developer, associate software developer, associate development manager, store developer, associate product developer]"
...,...,...
80,technology support specialist,"[technology support specialist ii, senior technology support specialist, technology support specialist intern, student technology support specialist, technology technical support specialist, technology support, technology specialist, technology support tech, technology support specialist ii, senior technology support specialist, technology support specialist intern, student technology support specialist, technology technical support specialist, technology support, technology specialist, tech..."
81,technology systems administrator,"[technology system administrator, system administrator technology , technology administrator, systems administrator, systems administrator, technology systems analyst, technology systems manager, senior systems administrator, technology system administrator, system administrator technology , technology administrator, systems administrator, systems administrator, technology systems analyst, technology systems manager, senior systems administrator]"
82,telecommunications specialist,"[telecommunication specialist, telecommunications operations specialist, communications specialist, telecommunications, communications specialist ii, telecommunications manager, telecommunications assistant, telecommunications agent, telecommunication specialist, telecommunications operations specialist, communications specialist, telecommunications, communications specialist ii, telecommunications manager, telecommunications assistant, telecommunications agent]"
83,web administrator,"[senior web administrator, lab administrator, web content administrator, administrator, administrator i, sales administrator, hr administrator, store administrator, senior web administrator, lab administrator, web content administrator, administrator, administrator i, sales administrator, hr administrator, store administrator]"
