In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
from collections import defaultdict
from string import punctuation
import re
import json
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from scipy.sparse.linalg import norm as sparse_norm


## Analysis of resumes based on experience level

What are the key linguistic indicators of experience level on a resume? By inspecting resumes of different experience we can answer this question.

## Conclusions

After analyzing these resumes, we can look at characteristic words for low and high experience and then identify "upgrades", i.e. words with a similar meaning that are characteristically more "experieneced". If you're just here for the recommendations (with manual curation) go no further, here they are:

+ "assisted with" -> "provided support for"
+ "helping" or "assisting" -> "supporting"
+ "problem" -> "need"
+ "good" or "great" -> "quality"
+ "organize" -> "coordinate"
+ "customer" -> "client"
+ "task" -> "project"
+ "academic" -> "technical"
+ "communication" -> "management"
+ "cleaning" -> "maintenance"

### So, how did we get there?

## Data source

Data was taken from a leading resume board. Resumes were marked with experience level. Job titles were taken from the Bureau of Labor Statistics Occupational Outlook Handbook (BLS OOH). All job titles were searched. Documents were saved into json files based on job titles. The sections of the resume relating to job experience were saved in the json files for analysis.

After saving the file, we can export the corpus of job descriptions for analysis. These functions extract the json files and return a corpus and a corresponding list of attributes.

In [46]:
def process_all():
	"""Create a corpus and list of attributes for documents for Tfidf Fun"""
	path = 'C:/Users/matth/OneDrive/Desktop/Data/records'
	files = os.listdir(path)
	files = [file for file in files if file not in ('.DS_Store','old')]
	attr_list = []
	corpus = []
	corp_set = set()
	for file in files:
		with open(path+'/'+file) as f:
			attr_list, corpus  = proc_file(f,file,corpus,attr_list,corp_set)
	return attr_list,corpus


In [47]:
def proc_file(f,file,corpus,attr_list,corp_set):
	for line in f:
		jline = json.loads( line.strip() )
		text = proc_jline(jline)
		if not jline['title']: ##filter out empty titles
			continue
		if text:
			obj= {'level':jline['level']}
			obj['title'] = file.split('.')[0]
			if text not in corp_set:
				corp_set.add(text)
				corpus.append(text)
				attr_list.append(obj)
	return attr_list, corpus

def proc_jline(jline):
	clean_text_list = [ clean_text( we['description'], stem=False )  for we in jline['work_experience'] ]
	text = ' '.join( ct for ct in clean_text_list if len(ct)<5000 ) ## Filter out overlong descriptions
	return text


In [48]:
def clean_text(s,stem=False):
	"""Clean out the text"""
	ret = s.lower()
	ret = re.sub(r'[^a-z ]',' ',ret)
	ret = re.sub(r' +',' ',ret).strip()
	ret = re.sub(r'see more occupations related to this (activity|skill|task)','',ret)
	return ret

In [49]:
attr_list,corpus = process_all()

Let's take a little peak at our data.

In [50]:
print(attr_list[0])
print(corpus[0])
print(len(attr_list),len(corpus))

{'level': '3-5 years', 'title': 'accountants and auditors'}
employment roberts and chaplin registered auditors umhlali kwa zulu natal south africa january december progressed from an associate to a senior associate i have performed audit engagements non audit assurance engagements and agreed upon procedure engagements on diverse client industries including manufacturing mining retail recreational and non profit organizations during the training contract i achieved all saica south african institute of chartered accountants competencies required related to auditing and accounting as well as residual skills related to taxation internal audit corporate governance and managerial accounting activities performed during my work experience are application and compliance with international financial reporting standards ifrs ifrs for small and medium sized entities ifrs for sme s and the international standards on auditing isa s reporting on assessment of internal controls and deficiencies identi

Now we must featurize our data set...

In [51]:

def get_tfidf(corpus):
	tfidf_vectorizer = TfidfVectorizer( max_features=8000, stop_words='english', ngram_range=(1, 1), min_df=0.03 )
	tfidf = tfidf_vectorizer.fit_transform( corpus )
	count_vectorizer = CountVectorizer( max_features=8000, stop_words='english', ngram_range=(1, 1), min_df=0.03 )
	count = count_vectorizer.fit_transform( corpus )
	tfidf_labels = tfidf_vectorizer.get_feature_names()
	# tfidf_label_dict = { label:ind for ind,label in enumerate( tfidf_labels ) }
	count_labels = count_vectorizer.get_feature_names()
	count_label_dict = { label:ind for ind,label in enumerate( count_labels ) }
	# count_label_indices = [ tfidf_label_dict[label] for label in count_labels if label in tfidf_label_dict]
	new_count = np.zeros( tfidf.shape )
	for k,l in enumerate( tfidf_labels ):
		if l in count_label_dict:
			new_count[ : , k ] = count[ : , count_label_dict[ l ] ].todense().reshape( new_count[ : , k ].shape )
	return {
		'tfidf':tfidf,
		'count':scipy.sparse.csr_matrix(new_count),
		'labels':tfidf_labels,
	}


In [52]:
def get_corrected_counts(tfidf_dict,attr_list, counttype='tfidf', correct_for_number_collected=False):
	occs = set( a['title'] for a in attr_list )
	levels = set( a['level'] for a in attr_list )
	count_dict={}
	for occ in occs:
		subset = [a for a in attr_list if a['title']==occ ]
		for level in levels:
			count_dict[ (occ,level) ] = len( [ a for a in subset if a['level']==level ] )
	ret = np.zeros( tfidf_dict[counttype].shape )
	for i in range( ret.shape[0] ):
		correction_factor = 1.
		if correct_for_number_collected:
			correction_factor = count_dict[ ( attr_list[i]['title'] , attr_list[i]['level'] ) ]
		ret[i,:] = 1. * ( tfidf_dict[counttype][i,:] ).todense() / correction_factor
	return ret


In [53]:
tfidf_dict = get_tfidf(corpus)

We now define a function to find differences between different experience levels with respect to TFIDF scores.

In [54]:
def get_max_differences(tfidf_dict,level1,level2, attr_list, typename="count"):
	overall1 = np.array( tfidf_dict[typename][ [ k for k,o in enumerate(attr_list) if o['level']==level1 ] ,:].mean(axis=0) )[0]
	overall2 = np.array( tfidf_dict[typename][ [ k for k,o in enumerate(attr_list) if o['level']==level2 ] ,:].mean(axis=0) )[0]
	return sorted( [ ( v1-v2 , l )
				for l,v1,v2 in zip(tfidf_dict['labels'],overall1,overall2) ]
				, key=None, reverse=False)


In [55]:
result = get_max_differences(tfidf_dict,'Less than 1 year','More than 10 years',attr_list,typename='tfidf')

print(len(result))
print(result[:5])
print(result[-5:-1])

1033
[(-0.020470263352602595, 'responsible'), (-0.019076986210144718, 'training'), (-0.018795251310089675, 'management'), (-0.018471566563729162, 'staff'), (-0.01777832420012951, 'sales')]
[(0.005875586244732587, 'learned'), (0.0063766111790492, 'helped'), (0.006606537122600925, 'patients'), (0.00832738190732854, 'clean')]


In [56]:
with open('experienced_words.txt','w') as f:
	for score,word in result[:100]:
		f.write(word+'\t'+str(score)+'\n')

with open('novice_words.txt','w') as f:
	for score,word in result[-100:-1]:
		f.write(word+'\t'+str(score)+'\n')


I performed some manual editing on these files to remove words that were too specific. So now how do we get upgrade recommendations from novice to experienced? One way would be manually--or we can try something else.

## Word Vectors

Let's use pretrained word vectors (GLOVE vectors) to go from novice words to the experienced. The file is about 600MB but you can find it to download on google ("glove vectors").

In [57]:
with open('experienced_words_edited.txt') as f:
    exp_words = [line.strip().split('\t')[0] for line in f]
with open('novice_words_edited.txt') as f:
    nov_words = [line.strip().split('\t')[0] for line in f]
all_words = set(exp_words) | set(nov_words)

def get_vector_dict():
    vector_dict = {}
    with open('C:/Users/matth/OneDrive/Desktop/Data/glove.6B.200d.txt',encoding="utf8") as f:
        counter = 0
        while counter<300000:
#             if counter%10000==0: print(counter)
            counter+=1
            line = next(f)
            elements = line.strip().split(' ')
            word = elements[0]
            if word not in all_words:
                continue
            array = np.array( [float(n) for n in elements[1:]] )
            array = array/np.linalg.norm(array)
            vector_dict[word] = array
            if len(vector_dict) == len(all_words):
                break
    return vector_dict

vector_dict=get_vector_dict()

In [58]:
exp_mat = np.array( [vector_dict[word] for word in exp_words] )
nov_mat = np.array( [vector_dict[word] for word in nov_words] )

sims = np.dot( exp_mat, nov_mat.transpose() )
for k,word in enumerate(nov_words):
	temp_sim = sims[:,k]
	indices = list(np.argsort(temp_sim)[::-1])
	top_n_words = [exp_words[n] for n in indices[:5]]
	print(word+' -> '+', '.join(top_n_words))

online -> business, services, network, applications, programs
project -> projects, development, construction, program, plan
floor -> level, new, construction, installed, included
event -> annual, team, performance, special, included
filing -> claims, employees, employee, reports, clients
rooms -> facility, installed, provide, provided, equipment
day -> new, annual, special, included, including
job -> hiring, employees, business, staff, needs
national -> team, administration, department, program, new
check -> required, ensure, provide, needs, monitoring
change -> needs, policies, plan, new, control
solving -> issues, developing, implementation, development, needs
conference -> annual, team, development, issues, new
present -> provide, provided, required, new, needs
tests -> required, procedures, develop, performance, level
preparing -> planning, plan, training, responsible, provide
cleaned -> repair, installed, maintenance, equipment, implemented
work -> projects, needs, required, busin

So there you have it! These arrows point from novice to the most similar experienced words as candidates for substitution. There are certainly some strange anomalies here like project vs. projects. I'll chalk this up to a couple things:

1) Maybe experienced people have more responsibilities (plural vs. one).

2) Could be random sampling error--we may be able to get a stronger recommendation if we use a supervised learning technique like logistic regression and control the variance of the model.