In [2]:
import weaviate
import weaviate.classes as wvc
import ollama
import json
import pickle
import ollama
from collections import Counter
import pandas as pd

In [9]:
# Open a client connection via weaviate.connect_to_local(); the collection setup
# cell below uses this `client` object.
client = weaviate.connect_to_local()

  client = weaviate.connect_to_local()


## Users

In [11]:
# Load the pickled jobseeker summaries from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# '../data/abouts' must come from a trusted source.
with open ('../data/abouts', 'rb') as file:
    summaries = pickle.load(file)

# Display how many summaries were loaded.
len(summaries)

27

In [17]:
# Both the vectorizer and the generative module are configured with the same
# Ollama endpoint and model, so the settings are written once and shared.
ollama_settings = {
    'api_endpoint': 'http://host.docker.internal:11434',
    'model': 'llama3',
}

# Start from a clean slate: remove any collection left over from an earlier run.
if client.collections.exists('Jobseekers'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Jobseekers')

jobseeker_collection = client.collections.create(
    name='Jobseekers',
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_ollama(**ollama_settings),
    generative_config=wvc.config.Configure.Generative.ollama(**ollama_settings),
)

In [20]:
# Upper bound on LLM calls for a single summary, so a summary whose answer never
# parses cannot keep the cell running forever.
MAX_ATTEMPTS_PER_SUMMARY = 10


def _parse_profile(response_text):
    """Return the first non-empty JSON object found in an LLM response, or None.

    The response is split on triple-backtick code fences and each piece is tried
    in turn. A leading "json" language tag left over from an opening fence is
    removed before parsing. Pieces that are not valid JSON, or that decode to
    something other than a non-empty dict, are skipped.
    """
    for segment in response_text.split('```'):
        candidate = segment.strip()
        if candidate[:4].lower() == 'json':
            candidate = candidate[4:].lstrip()
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Later code adds a 'summary' key, so only a dict is usable here.
        if isinstance(parsed, dict) and parsed:
            return parsed
    return None


json_outputs = []
generation_counts = 0

for summary in summaries:

    prompt_template = f'Given the following summary of a jobseeker: "{summary}" answer the following questions: 1. Enlist the main skills of the jobseeker. \
        2. Is the jobseeker interested in working in a specific location? If yes, mention location. If no, just say "Flexible"\
        3. Is the jobseeker looking for a "Full-Time" role, an "Internship" role, or "Contract" role? If any of these, mention the type name. Else, say "Flexible".\
        4. What are the role titles that the jobseeker is interested in? Be curt, do not explain role.\
        5. Does the jobseeker have experience or interest in specific industries? If so, list the industries, if not, say "Flexible".\
        6. Does the jobseeker have a strong preference for "Remote" work? If so, say "Remote", else say "Flexible". Do not explain, only list.\
        7. Write a 100 word paragraph on the kind of team this person would excel at.\
        Structure answers into a json that can be read using Python json.loads() using the keys "skills", "location", "role_type", "interested_roles", "industries", "remote" and "team_fit" respectively. Do not include any other explanations or sentences in the output. Do not explain how to use it.'

    json_output = None

    # Ask again until the response contains a parseable JSON object, up to the cap.
    for _attempt in range(MAX_ATTEMPTS_PER_SUMMARY):
        generation_counts += 1

        output = ollama.generate(model="llama3", prompt=prompt_template)
        json_output = _parse_profile(output['response'])
        if json_output is not None:
            break
    else:
        # Profiles collected so far remain available in json_outputs.
        raise RuntimeError(
            f'No parseable JSON profile after {MAX_ATTEMPTS_PER_SUMMARY} attempts '
            f'for summary starting with: {str(summary)[:80]!r}'
        )

    # Keep the source text alongside the extracted fields.
    json_output['summary'] = summary
    json_outputs.append(json_output)

print(f'It took {generation_counts} LLM calls to extract {len(summaries)} profiles.')

It took 28 LLM calls to extract 27 profiles.


[{'skills': ['Python',
   'Machine Learning',
   'Natural Language Processing',
   'SQL',
   'Tableau',
   'Database Systems',
   'Data Analytics'],
  'location': '',
  'role_type': 'Full-Time',
  'interested_roles': ['Data Analyst'],
  'industries': ['Healthcare', 'Customer Segmentation Marketing'],
  'team_fit': 'I would excel at a team that has a great culture and significant impact to business, with diverse members who appreciate data-driven insights.',
  'summary': 'Currently seeking Full time Data Analyst roles starting January 2024. Expertise domain knowledge in healthcare and customer segmentation marketing. Python, Machine Learning, Natural Language Processing, SQL, Tableau, Database Systems, Data Analytics. I enjoy programming, working with ML models, and visualizing patterns to get the most value out of data. I graduated with a Bachelor’s in Computer Engineering from University of Mumbai in 2018. As for my future, I aspire to one day work as a Data Scientist with a diverse t

In [23]:
# Write the list of extracted profiles to disk as one JSON document.
with open('../data/extracted_summaries.json', 'w') as out_file:
    out_file.write(json.dumps(json_outputs))

## Roles

In [14]:
# Read the jobs file into a DataFrame; the bare name on the last line makes the
# notebook display it.
match_data = pd.read_json('../data/jobs.json')
match_data

Unnamed: 0,job_id,company_name,title,description,location,type,remote,skills,industry,application_url
0,1218575,Children's Nebraska,Respiratory Therapist,"At Children’s, the region’s only full-service ...","Omaha, NE",Full-time,False,Health Care Provider,Hospitals and Health Care,www.childrensnebraska.org
1,95428182,CLEVELAND KIDS BOOK BANK,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,"Cleveland, OH",Full-time,False,Administrative,Non-profit Organizations,https://www.kidsbookbank.org/employment/
2,280496925,Washington State University,Coordinator for Multicultural Student Organiza...,The Coordinator serves as the principal adviso...,"Pullman, WA",Full-time,False,"Education, Training",Higher Education,https://wsu.wd5.myworkdayjobs.com/en-US/WSU_Jo...
3,368586246,STL Fertility,Embryologist,Job duties:To recover oocytes from follicular ...,"St Louis, MO",Full-time,False,Health Care Provider,,https://www.indeed.com/job/embryologist-944f8c...
4,805229245,,"Manager, Retail Pharmacy",SUMMARY:Manages operation and supervises all d...,"Tucson, AZ",Full-time,False,"Business Development, Sales",Hospitals and Health Care,https://jobs.tmcaz.com/manager-pharmacy-retail...
...,...,...,...,...,...,...,...,...,...,...
87173,3906266212,Synectics Inc.,Phlebotomist - Float,Job Description\n\nThe Patient Services Repres...,"Carroll County, MD",Contract,False,Science,Staffing and Recruiting,https://www.synectics.com/candidate-apply.php/...
87174,3906266217,The Dyrt,Senior Frontend/App Developer,The Dyrt is the largest digital camping platfo...,United States,Full-time,True,"Engineering, Information Technology","Technology, Information and Internet",https://the-dyrt.breezy.hr/p/31c6745b3473-seni...
87175,3906266248,GoodRx,"Account Manager, Client Success",GoodRx is America’s healthcare marketplace. Ea...,United States,Full-time,True,"Business Development, Sales",Hospitals and Health Care,https://goodrx.wd1.myworkdayjobs.com/Careers/j...
87176,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,United States,Full-time,True,"Engineering, Information Technology","IT Services and IT Consulting, Software Develo...",https://www.pinterestcareers.com/en/jobs/58824...


In [11]:
# Build the distinct industry names. An `industry` value can hold several
# comma-separated names, so every entry is split and each piece is trimmed.
# Blank pieces are filtered out here rather than removed afterwards, so the cell
# also works when no blank entry exists; null entries are skipped. Sorting makes
# the saved list identical from run to run.
industries = sorted({
    name.strip()
    for entry in match_data['industry'].dropna()
    for name in entry.split(',')
    if name.strip()
})

with open('../data/industries.pkl', 'wb') as f:
    pickle.dump(industries, f)

In [15]:
# Join all titles with commas, then split on commas and trim each piece; a title
# that itself contains commas therefore contributes several separate pieces.
joined_titles = ','.join(match_data['title'].tolist())
roles = (piece.strip() for piece in joined_titles.split(','))

# Rank the pieces by frequency (at most 10000 kept) and keep only the names.
popular_roles = [title for title, _count in Counter(roles).most_common(10000)]
popular_roles[:5]

['Manager',
 'Director',
 'Customer Service Representative',
 'Senior Manager',
 'Retail Sales Associate']

In [23]:
# Words dropped from a title before it is generalized.
SENIORITY_TOKENS = ('Senior', 'Junior', 'Staff', 'I', 'II', 'III')

roles_filtered = []

for role in popular_roles:
    # Apply the cheap length check first so no LLM call is spent on strings that
    # would be discarded anyway.
    if len(role) <= 3:
        continue

    is_valid = f''' 
    Is this a professional role title: "{role}".
    Answer in "Yes" or "No".  Do not explain. Bias against false positives.
    '''
    output = ollama.generate(model = "llama3", prompt = is_valid)

    # Tolerate surrounding whitespace, quotes, trailing punctuation and case
    # differences, but still require the whole answer to be "yes".
    verdict = output['response'].strip().strip('.!"\'').lower()
    if verdict == 'yes':
        refined_title = ' '.join([x for x in role.split(' ') if x not in SENIORITY_TOKENS])
        # A title made only of seniority words becomes empty; skip it.
        if refined_title:
            roles_filtered.append(refined_title)

roles_refined = []

# Removing seniority words can make different listings collapse to the same
# title, so each distinct title is sent to the LLM only once (order preserved).
for role in dict.fromkeys(roles_filtered):

    make_generic = f''' 
    Someone listed this role on a job board: {role}.
    This role title may contain additional details about the company, the place, or seniority.
    What would be a more generic role title, skipping all the details? Just provide answer, do not explain.
    '''
    output = ollama.generate(model = "llama3", prompt = make_generic)

    # Strip whitespace so answers that differ only in padding deduplicate below.
    generic_title = output['response'].strip()
    if generic_title:
        roles_refined.append(generic_title)

# Deduplicate and sort so the result is stable from run to run.
roles_refined = sorted(set(roles_refined))
len(roles_refined)

In [None]:
# Write the roles_refined list to disk as a pickle file.
with open('../data/roles.pkl', 'wb') as f:
    pickle.dump(roles_refined, f)