In [1]:
import pandas as pd

from gensim.utils import simple_preprocess
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Load the job-postings CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('data/job_ofer.csv')

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(36109, 8)

In [4]:
# Preview the first rows to see which columns are available (title, company_name, description, ...).
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


In [5]:
# Tokenise every job title with gensim's simple_preprocess; the result is a
# Series of token lists that keeps the same index as `df`.
title_corpus = df['title'].apply(simple_preprocess)

In [18]:
# Fit a phrase (bigram) detector on the tokenised titles and wrap it in a
# Phraser for applying it to sentences. min_count and threshold are both set
# to 1 here.
title_bigram = Phraser(
    Phrases(sentences=title_corpus, min_count=1, threshold=1)
)

In [32]:
def prepare_corpus(corpus, bigram):
    """Lazily yield each sentence's phrased tokens followed by its original tokens.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised sentences.
    bigram : object supporting ``bigram[sentence]``
        Phrase transformer; indexing it with a token list must return a list
        (the result is concatenated with the original tokens using ``+``).

    Yields
    ------
    list of str
        ``bigram[sentence] + sentence`` for every sentence, in input order.
        Tokens that the transformer leaves unchanged therefore appear twice.
    """
    for tokens in corpus:
        phrased = bigram[tokens]
        yield phrased + tokens

In [33]:
# Materialise the generator so the phrase-augmented titles can be iterated more than once.
title_phrase = [doc for doc in prepare_corpus(title_corpus, title_bigram)]

In [34]:
# Preview only the first 20 tokenised titles as (position, tokens) pairs;
# rendering every row of the corpus would flood the notebook output.
list(enumerate(title_corpus.head(20)))

[(0, ['machine', 'learning', 'engineer']),
 (1, ['deep', 'learning', 'applied', 'researcher', 'chicago']),
 (2, ['machine', 'learning', 'engineer']),
 (3, ['machine', 'learning', 'data', 'scientist']),
 (4, ['cloud', 'architect']),
 (5, ['data', 'scientist']),
 (6, ['store', 'room', 'clerk']),
 (7, ['director', 'of', 'product', 'us']),
 (8, ['recruiting', 'manager', 'ad', 'census', 'ext', 'gb']),
 (9, ['bilingual', 'engineer', 'german', 'germany', 'or', 'switzerland']),
 (10, ['sommelier']),
 (11, ['entry', 'level', 'project', 'manager', 'shelton', 'ct', 'based']),
 (12, ['finance', 'manager', 'firestone', 'industrial', 'products']),
 (13, ['us', 'lcra', 'cardiovascular', 'remote', 'anywhere', 'in']),
 (14, ['gallagher', 'bassett', 'corporate', 'intern', 'legal']),
 (15, ['us', 'head', 'of', 'marketing']),
 (16, ['visual', 'merchandiser']),
 (17, ['community', 'intern', 'tampa', 'bay', 'fl']),
 (18, ['sr', 'innovation', 'manager']),
 (19, ['fashion', 'beauty', 'content', 'editor']),
 (

In [35]:
# Tag each plain tokenised title with its position in the corpus.
# TaggedDocument fields are (words, tags), passed positionally here.
title_tagged = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(title_corpus)
]

In [36]:
# Tag each phrase-augmented title with its position in the corpus; these
# integer tags are what the model's document vectors are keyed by.
title_phrase_tagged = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(title_phrase)
]

In [37]:
# Show only the first few tagged documents; displaying the whole list would
# dump one entry per row of the data set into the notebook output.
title_phrase_tagged[:10]

[TaggedDocument(words=['machine_learning', 'engineer', 'machine', 'learning', 'engineer'], tags=[0]),
 TaggedDocument(words=['deep_learning', 'applied', 'researcher', 'chicago', 'deep', 'learning', 'applied', 'researcher', 'chicago'], tags=[1]),
 TaggedDocument(words=['machine_learning', 'engineer', 'machine', 'learning', 'engineer'], tags=[2]),
 TaggedDocument(words=['machine_learning', 'data_scientist', 'machine', 'learning', 'data', 'scientist'], tags=[3]),
 TaggedDocument(words=['cloud_architect', 'cloud', 'architect'], tags=[4]),
 TaggedDocument(words=['data_scientist', 'data', 'scientist'], tags=[5]),
 TaggedDocument(words=['store', 'room_clerk', 'store', 'room', 'clerk'], tags=[6]),
 TaggedDocument(words=['director_of', 'product', 'us', 'director', 'of', 'product', 'us'], tags=[7]),
 TaggedDocument(words=['recruiting', 'manager_ad', 'census_ext', 'gb', 'recruiting', 'manager', 'ad', 'census', 'ext', 'gb'], tags=[8]),
 TaggedDocument(words=['bilingual', 'engineer', 'german_german

In [38]:
# Doc2Vec's context-window keyword is `window`. `window_size` is not a Doc2Vec
# parameter: older gensim releases silently ignore it and newer ones reject it
# with a TypeError.
title_model = Doc2Vec(vector_size=300, window=5, min_count=1)
# The vocabulary must be built from the tagged documents before training.
title_model.build_vocab(title_phrase_tagged)

In [39]:
%%time
# Train the model on the phrase-augmented titles for 10 epochs; total_examples
# is taken from the document count stored on the model (title_model.corpus_count).
title_model.train(title_phrase_tagged, total_examples=title_model.corpus_count, epochs=10)

CPU times: user 33.4 s, sys: 6.56 s, total: 39.9 s
Wall time: 23.4 s


In [40]:
# Nearest document vectors to title 0, as (tag, similarity) pairs.
title_model.docvecs.most_similar(positive=[0], topn=10)

[(34278, 0.8869357705116272),
 (30054, 0.8818906545639038),
 (26393, 0.8800269365310669),
 (8748, 0.8647370338439941),
 (9395, 0.8528172373771667),
 (12406, 0.8276047706604004),
 (28223, 0.8259368538856506),
 (24281, 0.8247314095497131),
 (14356, 0.8174866437911987),
 (12266, 0.8142593502998352)]

In [41]:
# Keep only the document tags (first element of each (tag, similarity) pair).
similar_ids = [pair[0] for pair in title_model.docvecs.most_similar(0)]
similar_ids

[34278, 30054, 26393, 8748, 9395, 12406, 28223, 24281, 14356, 12266]

In [42]:
# Title of the query document (index label 0), returned as a one-element Series.
df.loc[df.index == 0, 'title']

0    Machine Learning Engineer
Name: title, dtype: object

In [43]:
# Print the original title for every similar document tag.
for i in similar_ids:
    matching_titles = df.loc[df.index == i, 'title'].values
    print("{} -> {}".format(i, matching_titles))

34278 -> ['Senior Machine Learning Engineer, Energy Platform - San Francisco, CA']
30054 -> ['Mechanical Engineer, Industrial and Mining (Senior)']
26393 -> ['Data Science Software Engineer, Self Driving']
8748 -> ['Senior Software Engineer - Analytics & Data Mining']
9395 -> ['Data/Machine Learning Software Engineer']
12406 -> ['Senior Engineer - San Francisco']
28223 -> ['Senior Deep Learning Engineer - San Francisco']
24281 -> ['Senior Data Engineer - San Francisco']
14356 -> ['Senior Javascript Engineer']
12266 -> ['Senior Computer Vision R&D Engineer']
