In [4]:
import weaviate
import weaviate.classes as wvc
import ollama
import json
import pickle
import ollama
from collections import Counter
from itertools import compress
import pandas as pd

In [5]:
# Connect to the local Weaviate instance; `client` is used below to (re)create the collection.
client = weaviate.connect_to_local()

## Users

In [6]:
# Load the pickled jobseeker summaries from disk.
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
with open('../data/abouts', 'rb') as abouts_file:
    summaries = pickle.load(abouts_file)

len(summaries)

27

In [37]:
# Recreate the 'Jobseekers' collection from scratch so that reruns start clean.
if client.collections.exists('Jobseekers'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Jobseekers')

# The vectorizer and the generative module are configured with the same Ollama endpoint and model.
ollama_settings = dict(api_endpoint='http://host.docker.internal:11434', model='llama3')

jobseeker_collection = client.collections.create(
    name='Jobseekers',
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_ollama(**ollama_settings),
    generative_config=wvc.config.Configure.Generative.ollama(**ollama_settings),
)

Dropping pre-exisiting collection


In [41]:
# Used to verify whether an LLM Generated JSON meets required specifications
def verify_extraction(json_extraction, checks):
	"""Validate an LLM-generated extraction and report which fields are still pending.

	Args:
		json_extraction: the parsed LLM answer; expected to be a dict whose keys
			are exactly the keys of `checks`.
		checks: dict mapping field name -> bool. True means the field is still
			pending (not yet accepted); False means it has been accepted.

	Returns:
		A new dict with the same keys as `checks` (plus 'location' and 'skills',
		which are always accepted). If `json_extraction` is not a dict or its
		keys differ from the keys of `checks`, every field is reported as
		pending (True). The `checks` argument itself is never modified.

	Note:
		Validating 'industries' calls the local "llama3" model through
		`ollama.generate`, once per listed industry, until one is rejected.
	"""

	# Work on a copy so that an exception part-way through cannot leave the
	# caller's dict half-updated.
	checks = dict(checks)

	# Parameters not validated
	checks['location'] = False
	checks['skills'] = False

	if not isinstance(json_extraction, dict) or set(json_extraction.keys()) != set(checks):
		return {check: True for check in checks}

	if checks['team_fit']:
		team_fit = json_extraction['team_fit']
		# NOTE(review): the paragraph is accepted when it has FEWER than 100
		# space-separated words -- confirm this is the intended direction.
		if isinstance(team_fit, str) and len(team_fit.split(' ')) < 100:
			checks['team_fit'] = False

	if checks['remote'] and json_extraction['remote'] in ('Remote', 'Flexible'):
		checks['remote'] = False

	if checks['role_type'] and json_extraction['role_type'] in ('Full-Time', 'Internship', 'Contract', 'Flexible'):
		checks['role_type'] = False

	if checks['interested_roles']:
		# Accepted only when every entry is a string ending in one of these suffixes.
		role_suffixes = ('yst', 'ist', 'ant', 'er')
		roles = json_extraction['interested_roles']
		roles_ok = isinstance(roles, (list, tuple)) and all(
			isinstance(role, str) and role.endswith(role_suffixes) for role in roles
		)
		checks['interested_roles'] = not roles_ok

	if checks['industries']:
		industries = json_extraction['industries']
		# A bare string answer (e.g. "Flexible") is a single term; iterating it
		# directly would query the model once per character.
		if isinstance(industries, str):
			industries = [industries]
		flag = False
		for industry in industries:
			prompt = f'''
			We are trying to identify whether a term is composed of full english words, or abbreviations.

			Here are a few examples:
			"Healthcare" -> "Full",
			"Home Science" -> "Full",
			"Defense" -> "Full",
			"Intelligence" -> "Full",
			"Education Technology" -> "Full",
			"EdTech" -> "Abbreviated",
			"Fintech" -> "Abbreviated",
			"CompSci" -> "Abbreviated"

			This is a term: {industry}. Is this an abbreviated term or full form?
			Answer in "Abbreviated" or "Full" accordingly. Do not explain.
			'''
			response = ollama.generate(model = "llama3", prompt=prompt)['response']
			if 'Full' not in response:
				# One rejected term is enough to keep the field pending.
				flag = True
				break
		checks['industries'] = flag

	return checks

In [56]:
def extract_info(summary: str, max_generations=None):
	"""Extract structured jobseeker attributes from a free-text summary using the LLM.

	The model is asked all questions at once; any field that fails
	`verify_extraction` is asked again (on its own or with the other failing
	fields) until every field has been accepted.

	Args:
		summary: free-text summary of a jobseeker.
		max_generations: optional upper bound on the number of LLM generations.
			None (the default) retries without limit.

	Returns:
		A tuple `(json_object, generation_count)`: the accepted fields plus the
		original text under the 'summary' key, and the number of generations
		that were requested.

	Raises:
		RuntimeError: if `max_generations` is set and is reached before every
			field has been accepted.
	"""

	questions = dict(
		skills='Enlist the main skills of the jobseeker.',
		location='Is the jobseeker interested in working in a specific location? If yes, mention the full name of the US State for this location. If no, just say "Flexible".',
		role_type='Is the jobseeker looking for a "Full-Time" role, an "Internship" role, or "Contract" role? If any of these, mention the type name. Else, say "Flexible".',
		interested_roles='What are 3 role titles that the jobseeker might be interested in? Make sure these are tangible roles, not domains. Be curt, do not explain role.',
		industries='Does the jobseeker have experience or interest in specific industries? If so, list the industries using full form of the industry names, without using abbreviations. If it is an abbreviation, convert it to full form (for example "EdTech" to "Education Technology"). If no specific industries are found, say "Flexible". Do not explain, only list.',
		remote='Does the jobseeker have a strong preference for "Remote" work? If so, say "Remote", else say "Flexible". Do not explain, only list.',
		team_fit='Write a 100 word paragraph on the kind of team this person would excel at.',
	)

	# True = still pending, False = accepted (same convention as verify_extraction).
	checks = {question: True for question in questions}
	generation_count = 0

	json_object = {}

	while any(checks.values()):

		if max_generations is not None and generation_count >= max_generations:
			pending = [key for key, check in checks.items() if check]
			raise RuntimeError(f'No valid extraction after {generation_count} generations; still pending: {pending}')

		generation_count += 1

		remaining_questions = {key: value for key, value in questions.items() if checks[key]}
		formatted_questions = '\n'.join(f'{i}: {question}' for i, question in enumerate(remaining_questions.values(), start=1))
		requested_keys = ', '.join(remaining_questions)

		prompt = f'''
		Given the following summary of a jobseeker: "{summary}" answer the following question(s):
		{formatted_questions}
		Structure answers into a json that can be read using Python json.loads() using the following key(s): {requested_keys}. 
		Do not include any other explanations or sentences in the output. Do not explain how to use it.
		'''

		output = ollama.generate(model = "llama3", prompt = prompt)

		# The answer may be bare JSON or wrapped in ``` fences; try every chunk.
		for chunk in output['response'].split('```'):
			candidate = chunk.strip()
			# A fenced block commonly opens with a "json" language tag, which is not valid JSON.
			if candidate[:4].lower() == 'json':
				candidate = candidate[4:]

			try:
				json_output = json.loads(candidate)
			except ValueError:
				# Not JSON (json.JSONDecodeError is a ValueError): try the next chunk.
				continue
			if not isinstance(json_output, dict):
				continue

			# Fields accepted in earlier rounds take precedence over anything re-generated.
			json_output.update(json_object)

			try:
				# Pass a copy so a failure inside the verifier cannot corrupt `checks`.
				checks = verify_extraction(json_output, dict(checks))
			except Exception:
				# Best effort: a malformed extraction or a failed verifier call just
				# means this chunk is skipped. KeyboardInterrupt still propagates.
				continue

			for key, check in checks.items():
				if not check:
					json_object[key] = json_output[key]
			if not any(checks.values()):
				break

	json_object['summary'] = summary
	return json_object, generation_count

In [57]:
# Extract every summary in turn, keeping a running total of LLM generations.
json_outputs = []
generation_counts = 0

for profile_summary in summaries:
    extracted, n_generations = extract_info(profile_summary)
    json_outputs.append(extracted)
    generation_counts += n_generations

print(f'It took {generation_counts} LLM calls to extract {len(summaries)} profiles.')
json_outputs

{'skills': ['Python', 'Machine Learning', 'Natural Language Processing', 'SQL', 'Tableau', 'Database Systems', 'Data Analytics'], 'location': '', 'role_type': 'Full-Time', 'interested_roles': ['Data Analyst', 'Data Scientist'], 'industries': ['Healthcare', 'Customer Segmentation Marketing'], 'remote': 'Flexible', 'team_fit': 'I would excel at a diverse team that has a great culture and significant impact to business.'}
{'skills': False, 'location': False, 'role_type': False, 'interested_roles': False, 'industries': False, 'remote': False, 'team_fit': False}
{'skills': ['cybersecurity', 'computer languages', 'platforms', 'tools'], 'location': 'US', 'role_type': 'Full-Time', 'interested_roles': ['Cybersecurity Analyst', 'National Security Specialist', 'Infrastructure Security Engineer'], 'industries': ['Federal Government', 'Defense', 'Intelligence'], 'remote': 'Flexible', 'team_fit': 'This individual would excel in a team that values collaboration, adaptability, and continuous learning.

[{'skills': ['Python',
   'Machine Learning',
   'Natural Language Processing',
   'SQL',
   'Tableau',
   'Database Systems',
   'Data Analytics'],
  'location': '',
  'role_type': 'Full-Time',
  'interested_roles': ['Data Analyst', 'Data Scientist'],
  'industries': ['Healthcare', 'Customer Segmentation Marketing'],
  'remote': 'Flexible',
  'team_fit': 'I would excel at a diverse team that has a great culture and significant impact to business.',
  'summary': 'Currently seeking Full time Data Analyst roles starting January 2024. Expertise domain knowledge in healthcare and customer segmentation marketing. Python, Machine Learning, Natural Language Processing, SQL, Tableau, Database Systems, Data Analytics. I enjoy programming, working with ML models, and visualizing patterns to get the most value out of data. I graduated with a Bachelor’s in Computer Engineering from University of Mumbai in 2018. As for my future, I aspire to one day work as a Data Scientist with a diverse team that

In [58]:
# Persist the extracted profiles to disk as JSON.
with open('../data/extracted_summaries.json', 'w') as json_file:
    json.dump(json_outputs, json_file)

## Roles

In [14]:
# Load the job postings into a DataFrame and display it.
match_data = pd.read_json('../data/jobs.json')
match_data

Unnamed: 0,job_id,company_name,title,description,location,type,remote,skills,industry,application_url
0,1218575,Children's Nebraska,Respiratory Therapist,"At Children’s, the region’s only full-service ...","Omaha, NE",Full-time,False,Health Care Provider,Hospitals and Health Care,www.childrensnebraska.org
1,95428182,CLEVELAND KIDS BOOK BANK,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,"Cleveland, OH",Full-time,False,Administrative,Non-profit Organizations,https://www.kidsbookbank.org/employment/
2,280496925,Washington State University,Coordinator for Multicultural Student Organiza...,The Coordinator serves as the principal adviso...,"Pullman, WA",Full-time,False,"Education, Training",Higher Education,https://wsu.wd5.myworkdayjobs.com/en-US/WSU_Jo...
3,368586246,STL Fertility,Embryologist,Job duties:To recover oocytes from follicular ...,"St Louis, MO",Full-time,False,Health Care Provider,,https://www.indeed.com/job/embryologist-944f8c...
4,805229245,,"Manager, Retail Pharmacy",SUMMARY:Manages operation and supervises all d...,"Tucson, AZ",Full-time,False,"Business Development, Sales",Hospitals and Health Care,https://jobs.tmcaz.com/manager-pharmacy-retail...
...,...,...,...,...,...,...,...,...,...,...
87173,3906266212,Synectics Inc.,Phlebotomist - Float,Job Description\n\nThe Patient Services Repres...,"Carroll County, MD",Contract,False,Science,Staffing and Recruiting,https://www.synectics.com/candidate-apply.php/...
87174,3906266217,The Dyrt,Senior Frontend/App Developer,The Dyrt is the largest digital camping platfo...,United States,Full-time,True,"Engineering, Information Technology","Technology, Information and Internet",https://the-dyrt.breezy.hr/p/31c6745b3473-seni...
87175,3906266248,GoodRx,"Account Manager, Client Success",GoodRx is America’s healthcare marketplace. Ea...,United States,Full-time,True,"Business Development, Sales",Hospitals and Health Care,https://goodrx.wd1.myworkdayjobs.com/Careers/j...
87176,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,United States,Full-time,True,"Engineering, Information Technology","IT Services and IT Consulting, Software Develo...",https://www.pinterestcareers.com/en/jobs/58824...


In [11]:
# Collect the distinct industry names. An 'industry' cell may hold several
# comma-separated names, so each cell is split and the pieces are stripped.
# Missing values and empty pieces are skipped, and the result is sorted so the
# pickled list is identical from run to run.
industries = sorted({
    name.strip()
    for cell in match_data['industry'].dropna()
    for name in str(cell).split(',')
    if name.strip()
})

with open('../data/industries.pkl', 'wb') as f:
    pickle.dump(industries, f)

In [15]:
# Titles are joined and re-split on commas, so a title such as
# "Manager, Retail Pharmacy" contributes two separate fragments.
joined_titles = ','.join(match_data['title'].tolist())
roles = (fragment.strip() for fragment in joined_titles.split(','))

# Keep the 10000 most frequent fragments, most common first.
popular_roles = [fragment for fragment, _count in Counter(roles).most_common(10000)]
popular_roles[:5]

['Manager',
 'Director',
 'Customer Service Representative',
 'Senior Manager',
 'Retail Sales Associate']

In [23]:
# Words removed from a title before it is generalized.
seniority_markers = ('Senior', 'Junior', 'Staff', 'I', 'II', 'III')

# Pass 1: keep only the fragments the model recognises as professional role titles.
roles_filtered = []

for role in popular_roles:
    # Fragments of 3 characters or fewer are never kept, so skip the LLM call for them.
    if len(role) <= 3:
        continue

    is_valid = f''' 
    Is this a professional role title: "{role}".
    Answer in "Yes" or "No".  Do not explain. Bias against false positives.
    '''
    output = ollama.generate(model = "llama3", prompt = is_valid)

    # Tolerate surrounding whitespace, quotes, a trailing period and letter case
    # ("Yes", "yes.", " Yes\n"); any other answer counts as a rejection.
    answer = output['response'].strip().strip('."\'').lower()
    if answer == 'yes':
        refined_title = ' '.join([x for x in role.split(' ') if x not in seniority_markers])
        # A title made only of seniority markers leaves nothing to generalize.
        if refined_title:
            roles_filtered.append(refined_title)

# Pass 2: ask the model for a generic version of every accepted title.
roles_refined = []

# dict.fromkeys drops repeated titles (keeping first-seen order) so the same
# prompt is not sent to the model more than once.
for role in dict.fromkeys(roles_filtered):

    make_generic = f''' 
    Someone listed this role on a job board: {role}.
    This role title may contain additional details about the company, the place, or seniority.
    What would be a more generic role title, skipping all the details? Just provide answer, do not explain.
    '''
    output = ollama.generate(model = "llama3", prompt = make_generic)
    # Strip whitespace so answers differing only by a trailing newline de-duplicate below.
    generic_title = output['response'].strip()
    if generic_title:
        roles_refined.append(generic_title)

# De-duplicate and sort so the saved list is stable for a given set of answers.
roles_refined = sorted(set(roles_refined))
len(roles_refined)

In [None]:
# Save the de-duplicated generic role titles to disk.
with open('../data/roles.pkl', 'wb') as roles_file:
    pickle.dump(roles_refined, roles_file)