# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import utils
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
!pip install gensim==3.8.3
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from tqdm import tqdm
import os
from collections import Counter
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare final data set

In [2]:
fol_dir='/content/drive/MyDrive/NLP/'

In [3]:
df_names=os.listdir(fol_dir+'data-data_sets') #get file names
df_final=pd.DataFrame() #create an empty data frame
###join data sets##
for df_name in df_names:
    df=pd.read_csv(fol_dir+'data-data_sets/'+df_name)
    df_final=pd.concat([df_final, df], axis=0,ignore_index=True)
###################

In [4]:
df_final.head()

Unnamed: 0,Job Title,Description,Responsibilities,Basic Qualifications,Preffered Qualifications,Preferred Qualifications
0,Senior Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Lead and deliver data science solutions levera...,"Bachelor’s degree in statistics, mathematics, ...",Preferred Qualifications\n Master’s degree in ...,
1,Senior Principal Data Scientist - Telecommute ...,UnitedHealthcare is a company that's on the ri...,Work with vast amounts of data from multiple s...,10+ years of experience working on data scienc...,Master’s Degree or higher in a highly quantita...,
2,Data Scientist - Telecommute,"""Combine two of the fastest-growing fields on ...","Use appropriate data sampling, data preparatio...",2+ years of hands-on professional experience i...,"BS in science, applied mathematics, (bio) stat...",
3,Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Validates data integrity and consistency acros...,"BS degree in computer science, applied mathema...",Experience with health care claims data\nExper...,
4,"Data Scientist, Optum360 - Telecommute",Combine two of the fastest-growing fields on t...,Collaborate with stakeholders to understand bu...,Undergraduate degree in any of the quantitativ...,Master’s degree\nHealth care industry experien...,


In [5]:
df_final.columns

Index(['Job Title', 'Description', 'Responsibilities', 'Basic Qualifications',
       'Preffered Qualifications', 'Preferred Qualifications'],
      dtype='object')

'Preffered Qualifications' and 'Preferred Qualifications' are two duplicated columns.So, these two columns should be combined


In [6]:
df_final["Preffered Qualifications"].fillna("", inplace = True) #replace NULL values with spaces
df_final['Preferred Qualifications'].fillna("", inplace = True) #replace NULL values with spaces
df_final['Preferred Qualifications']=df_final['Preffered Qualifications']+df_final['Preferred Qualifications'] #combine two columns
df_final=df_final.drop(['Preffered Qualifications'], axis = 1) #drop one column

In [7]:
df_final.head()

Unnamed: 0,Job Title,Description,Responsibilities,Basic Qualifications,Preferred Qualifications
0,Senior Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Lead and deliver data science solutions levera...,"Bachelor’s degree in statistics, mathematics, ...",Preferred Qualifications\n Master’s degree in ...
1,Senior Principal Data Scientist - Telecommute ...,UnitedHealthcare is a company that's on the ri...,Work with vast amounts of data from multiple s...,10+ years of experience working on data scienc...,Master’s Degree or higher in a highly quantita...
2,Data Scientist - Telecommute,"""Combine two of the fastest-growing fields on ...","Use appropriate data sampling, data preparatio...",2+ years of hands-on professional experience i...,"BS in science, applied mathematics, (bio) stat..."
3,Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Validates data integrity and consistency acros...,"BS degree in computer science, applied mathema...",Experience with health care claims data\nExper...
4,"Data Scientist, Optum360 - Telecommute",Combine two of the fastest-growing fields on t...,Collaborate with stakeholders to understand bu...,Undergraduate degree in any of the quantitativ...,Master’s degree\nHealth care industry experien...


In [8]:
df_final['ID']=[i+1 for i in range(len(df_final))] #create a ID column. ID is unique for each job
df_final=df_final[['ID','Job Title', 'Description', 'Responsibilities', 'Basic Qualifications',
       'Preferred Qualifications']] #

In [9]:
df_final.head()

Unnamed: 0,ID,Job Title,Description,Responsibilities,Basic Qualifications,Preferred Qualifications
0,1,Senior Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Lead and deliver data science solutions levera...,"Bachelor’s degree in statistics, mathematics, ...",Preferred Qualifications\n Master’s degree in ...
1,2,Senior Principal Data Scientist - Telecommute ...,UnitedHealthcare is a company that's on the ri...,Work with vast amounts of data from multiple s...,10+ years of experience working on data scienc...,Master’s Degree or higher in a highly quantita...
2,3,Data Scientist - Telecommute,"""Combine two of the fastest-growing fields on ...","Use appropriate data sampling, data preparatio...",2+ years of hands-on professional experience i...,"BS in science, applied mathematics, (bio) stat..."
3,4,Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Validates data integrity and consistency acros...,"BS degree in computer science, applied mathema...",Experience with health care claims data\nExper...
4,5,"Data Scientist, Optum360 - Telecommute",Combine two of the fastest-growing fields on t...,Collaborate with stakeholders to understand bu...,Undergraduate degree in any of the quantitativ...,Master’s degree\nHealth care industry experien...


In [10]:
df_final.to_csv(fol_dir+'data-total_job_posting.csv',index=False) #save joined file

# Data Preprocessing

In [11]:
df=pd.read_csv(fol_dir+'data-total_job_posting.csv') #load the combined data set

In [12]:
df.head()

Unnamed: 0,ID,Job Title,Description,Responsibilities,Basic Qualifications,Preferred Qualifications
0,1,Senior Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Lead and deliver data science solutions levera...,"Bachelor’s degree in statistics, mathematics, ...",Preferred Qualifications\n Master’s degree in ...
1,2,Senior Principal Data Scientist - Telecommute ...,UnitedHealthcare is a company that's on the ri...,Work with vast amounts of data from multiple s...,10+ years of experience working on data scienc...,Master’s Degree or higher in a highly quantita...
2,3,Data Scientist - Telecommute,"""Combine two of the fastest-growing fields on ...","Use appropriate data sampling, data preparatio...",2+ years of hands-on professional experience i...,"BS in science, applied mathematics, (bio) stat..."
3,4,Data Scientist - Telecommute,Combine two of the fastest-growing fields on t...,Validates data integrity and consistency acros...,"BS degree in computer science, applied mathema...",Experience with health care claims data\nExper...
4,5,"Data Scientist, Optum360 - Telecommute",Combine two of the fastest-growing fields on t...,Collaborate with stakeholders to understand bu...,Undergraduate degree in any of the quantitativ...,Master’s degree\nHealth care industry experien...


In [13]:
print(df['Description'][0])
print('*************************')
print(df['Responsibilities'][0])
print('*************************')
print(df['Basic Qualifications'][0])
print('*************************')
print(df['Preferred Qualifications'][0])

Combine two of the fastest-growing fields on the planet with a culture of performance, collaboration and opportunity and this is what you get. Leading edge technology in an industry that's improving the lives of millions. Here, innovation isn't about another gadget, it's about making health care data available wherever and whenever people need it, safely and reliably. There's no room for error. Join us and start doing your life's best work.(sm)
 
 The Consumer Pricing, Analytics & Reporting Team (CPAR) within OptumRx uses sophisticated algorithms and techniques to understand, quantify and inform our product lines of business, operations and leaders with practical insights for improving their services and operations in support of a world-class OptumRx Consumer Experience. The team uses a blend of scientific, problem solving, and quantitative skills to develop and deliver groundbreaking methods addressing critical problems in our digital environment. The CPAR team collaborates across pro

In [14]:
###replace NULL values with empty spaces###
df["Description"].fillna(" ", inplace = True)
df["Responsibilities"].fillna(" ", inplace = True)
df['Basic Qualifications'].fillna(" ", inplace = True)
df['Preferred Qualifications'].fillna(" ", inplace = True)

In [15]:
#create full job discriptions. These discriptions will be used to train the doc2vec model
df["full_job_description"]=df["Description"]+' '+df["Responsibilities"]+' '+df['Basic Qualifications']+' '+df['Preferred Qualifications']

In [16]:
#replace '\n',',','/' with spaces
df["full_job_description"]=df["full_job_description"].replace('\n',' ', regex=True)
df["full_job_description"]=df["full_job_description"].replace(',',' ', regex=True)
df["full_job_description"]=df["full_job_description"].replace('/',' ', regex=True)

In [17]:
df["full_job_description"][0]

"Combine two of the fastest-growing fields on the planet with a culture of performance  collaboration and opportunity and this is what you get. Leading edge technology in an industry that's improving the lives of millions. Here  innovation isn't about another gadget  it's about making health care data available wherever and whenever people need it  safely and reliably. There's no room for error. Join us and start doing your life's best work.(sm)    The Consumer Pricing  Analytics & Reporting Team (CPAR) within OptumRx uses sophisticated algorithms and techniques to understand  quantify and inform our product lines of business  operations and leaders with practical insights for improving their services and operations in support of a world-class OptumRx Consumer Experience. The team uses a blend of scientific  problem solving  and quantitative skills to develop and deliver groundbreaking methods addressing critical problems in our digital environment. The CPAR team collaborates across pr

In [18]:
#Change columns to lower case 
df["full_job_description"]=df["full_job_description"].str.lower()

In [19]:
df["full_job_description"][0]

"combine two of the fastest-growing fields on the planet with a culture of performance  collaboration and opportunity and this is what you get. leading edge technology in an industry that's improving the lives of millions. here  innovation isn't about another gadget  it's about making health care data available wherever and whenever people need it  safely and reliably. there's no room for error. join us and start doing your life's best work.(sm)    the consumer pricing  analytics & reporting team (cpar) within optumrx uses sophisticated algorithms and techniques to understand  quantify and inform our product lines of business  operations and leaders with practical insights for improving their services and operations in support of a world-class optumrx consumer experience. the team uses a blend of scientific  problem solving  and quantitative skills to develop and deliver groundbreaking methods addressing critical problems in our digital environment. the cpar team collaborates across pr

In [20]:
#Remove stop words
stop_words = stopwords.words('english')
df["full_job_description"] = df["full_job_description"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [21]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
df["full_job_description"][0]

"combine two fastest-growing fields planet culture performance collaboration opportunity get. leading edge technology industry that's improving lives millions. innovation another gadget making health care data available wherever whenever people need safely reliably. there's room error. join us start life's best work.(sm) consumer pricing analytics & reporting team (cpar) within optumrx uses sophisticated algorithms techniques understand quantify inform product lines business operations leaders practical insights improving services operations support world-class optumrx consumer experience. team uses blend scientific problem solving quantitative skills develop deliver groundbreaking methods addressing critical problems digital environment. cpar team collaborates across product design engineer teams data science machine learning ai. want part growing data science data engineering team helping shape digital experiences millions customers? senior data scientist work cross-functionally data

In [23]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = df.apply(
    lambda r: TaggedDocument(words=tokenize_text(r["full_job_description"]), tags=[r.ID]), axis=1)

In [24]:
print('corpus of 1st job description')
print(train_tagged[0])
print('********************')
print('BOW count of 1st job descrption')
print(Counter(tokenize_text(df["full_job_description"][0])))

corpus of 1st job description
TaggedDocument(['combine', 'two', 'fastest-growing', 'fields', 'planet', 'culture', 'performance', 'collaboration', 'opportunity', 'get', 'leading', 'edge', 'technology', 'industry', 'that', "'s", 'improving', 'lives', 'millions', 'innovation', 'another', 'gadget', 'making', 'health', 'care', 'data', 'available', 'wherever', 'whenever', 'people', 'need', 'safely', 'reliably', 'there', "'s", 'room', 'error', 'join', 'us', 'start', 'life', "'s", 'best', 'work', 'sm', 'consumer', 'pricing', 'analytics', 'reporting', 'team', 'cpar', 'within', 'optumrx', 'uses', 'sophisticated', 'algorithms', 'techniques', 'understand', 'quantify', 'inform', 'product', 'lines', 'business', 'operations', 'leaders', 'practical', 'insights', 'improving', 'services', 'operations', 'support', 'world-class', 'optumrx', 'consumer', 'experience', 'team', 'uses', 'blend', 'scientific', 'problem', 'solving', 'quantitative', 'skills', 'develop', 'deliver', 'groundbreaking', 'methods', 'ad

# Doc2vec Modeling

## Building a Vocabulary

- If dm=0, distributed bag of words (PV-DBOW) is used. if dm=1,‘distributed memory’ (PV-DM) is used
- dimension feature vectors is 300.
- negative=5 , specifies how many “noise words” should be drawn
- hs=0 , and negative is non-zero, negative sampling will be used
- min_count=2, ignores all words with total frequency lower than this
- sample=0 , the threshold for configuring which higher-frequency words are randomly down sampled

In [25]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0)
model_dbow.build_vocab([x for x in tqdm(train_tagged)])

100%|██████████| 275/275 [00:00<00:00, 330970.90it/s]


## Training a doc2vec model 

In [26]:
for epoch in range(300):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged)]), total_examples=len(train_tagged), epochs=10)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 275/275 [00:00<00:00, 248210.37it/s]
100%|██████████| 275/275 [00:00<00:00, 727259.52it/s]
100%|██████████| 275/275 [00:00<00:00, 698204.36it/s]
100%|██████████| 275/275 [00:00<00:00, 688206.21it/s]
100%|██████████| 275/275 [00:00<00:00, 739854.78it/s]
100%|██████████| 275/275 [00:00<00:00, 747042.49it/s]
100%|██████████| 275/275 [00:00<00:00, 1045724.03it/s]
100%|██████████| 275/275 [00:00<00:00, 142751.68it/s]
100%|██████████| 275/275 [00:00<00:00, 1071964.31it/s]
100%|██████████| 275/275 [00:00<00:00, 706327.99it/s]
100%|██████████| 275/275 [00:00<00:00, 1059167.68it/s]
100%|██████████| 275/275 [00:00<00:00, 701601.95it/s]
100%|██████████| 275/275 [00:00<00:00, 731410.02it/s]
100%|██████████| 275/275 [00:00<00:00, 624152.38it/s]
100%|██████████| 275/275 [00:00<00:00, 622468.21it/s]
100%|██████████| 275/275 [00:00<00:00, 775156.99it/s]
100%|██████████| 275/275 [00:00<00:00, 496100.47it/s]
100%|██████████| 275/275 [00:00<00:00, 681295.69it/s]
100%|██████████| 275/275 

In [28]:
word_vectors = model_dbow.wv
word_vectors.save(fol_dir+'world_vectors.kv')#save word vectors

# find the best job

In [29]:
reloaded_word_vectors = KeyedVectors.load(fol_dir+'world_vectors.kv') #load word vectors

In [30]:
def pre_process(txt):
  txt=txt.replace('\n',' ')
  txt=txt.replace(',',' ')
  txt=txt.replace('/',' ')

  txt=txt.lower()
  stop_words = stopwords.words('english')
  txt=' '.join([word for word in txt.split() if word not in (stop_words)])
  tokens = []
  for sent in nltk.sent_tokenize(txt):
      for word in nltk.word_tokenize(sent):
          if len(word) < 2:
              continue
          tokens.append(word.lower())
  return(tokens)

In [31]:
def get_best_matching_job(txt): 
  wmd_arry=[]
  for i in range(len(train_tagged)):
    wmd=reloaded_word_vectors.wmdistance(train_tagged[i].words,pre_process(txt)) #calculate Word Mover’s Distance
    wmd_arry.append(wmd)
  wmd_arry=np.array(wmd_arry)
  indx=np.argmin(wmd_arry)#get index of the job that has the minimmum distance with input txt
  return(df.iloc[indx])

In [32]:
txt1='Hi, I am Chanaka. I passed my Advanced level examination along with 3 A passes.I am a fresh graduate from the Faculty of Engineering, University of Moratuwa, Sri Lanka. So I am good at mathematical problem-solving and programming. Also, I have 2 years of experience in teaching mathematics for Advanced level students and 3+ years of experience in python programming. Kindly contact me for your projects. I can help you. Thank you.'
print(txt1)

Hi, I am Chanaka. I passed my Advanced level examination along with 3 A passes.I am a fresh graduate from the Faculty of Engineering, University of Moratuwa, Sri Lanka. So I am good at mathematical problem-solving and programming. Also, I have 2 years of experience in teaching mathematics for Advanced level students and 3+ years of experience in python programming. Kindly contact me for your projects. I can help you. Thank you.


In [33]:
get_best_matching_job(txt1)

ID                                                                        257
Job Title                   Senior Software Engineer - Telecommute in Mult...
Description                 Combine two of the fastest-growing fields on t...
Responsibilities                                                             
Basic Qualifications        Undergraduate degree or equivalent experience\...
Preferred Qualifications    4+ years of experience writing performance tes...
full_job_description        combine two fastest-growing fields planet cult...
Name: 256, dtype: object

In [34]:
txt2='I am a PMP Certified Project Manager with 9 years of experience in the IT industry, working in various roles such as Technical Project Manager, Business Analyst, and Software Engineer.My expertise is in preparing BRD, FRS, SRS, DFD, Requirement Gathering & Analysis, Statement of Scope, Agile Practices, UML, User stories, Use Case, and Project Management for different IT solutions.'
print(txt2)

I am a PMP Certified Project Manager with 9 years of experience in the IT industry, working in various roles such as Technical Project Manager, Business Analyst, and Software Engineer.My expertise is in preparing BRD, FRS, SRS, DFD, Requirement Gathering & Analysis, Statement of Scope, Agile Practices, UML, User stories, Use Case, and Project Management for different IT solutions.


In [35]:
get_best_matching_job(txt2)

ID                                                                         57
Job Title                   Clinical Quality Project Manager - Telecommute...
Description                 UnitedHealthcare is a company that's on the ri...
Responsibilities            Provide subject matter expertise in areas incl...
Basic Qualifications        Undergraduate Degree (or higher)\n2+ years of ...
Preferred Qualifications    PMP Certification\nCPHQ Certification\nLean Si...
full_job_description        unitedhealthcare company that's rise. we're ex...
Name: 56, dtype: object

In [74]:
print(df["full_job_description"][1])

unitedhealthcare company that's rise. we're expanding multiple directions across borders way think. innovation another gadget transforming health care industry. ready make difference? make home us start life's best work.(sm) uhc advanced research analytics (ara) organization provides data-driven insights builds advanced ai ml solutions various lines business shared services organizations within unitedhealthcare. ara team partners various entities within enterprise increase operational value optimize experience lower healthcare costs. ara team serves across wide variety business domains including claims processing payment integrity (fraud waste abuse error) consumer experience (omnichannel customer service campaigns net promoter scores) provider service models many more. senior principal data scientist role work within transactions analytics automation domain develop transformative solutions lead effective decision-making efficient execution across multiple lines business. role specific

In [65]:
get_best_matching_job(df["full_job_description"][1])

ID                                                                          2
Job Title                   Senior Principal Data Scientist - Telecommute ...
Description                 UnitedHealthcare is a company that's on the ri...
Responsibilities            Work with vast amounts of data from multiple s...
Basic Qualifications        10+ years of experience working on data scienc...
Preferred Qualifications    Master’s Degree or higher in a highly quantita...
full_job_description        unitedhealthcare company that's rise. we're ex...
Name: 1, dtype: object