In [8]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
from weaviate.connect import ConnectionParams
from weaviate.util import generate_uuid5
import ollama
import os
import pickle
import json

In [3]:
# Open a connection to the locally running Weaviate instance.
# The timeout tuple is forwarded unchanged; presumably (connect, read) seconds —
# TODO confirm against the installed client version.
connection_options = wvc.init.AdditionalConfig(timeout=(60, 7500))
client = weaviate.connect_to_local(additional_config=connection_options)

## Jobs

In [5]:
# Load the job postings from disk.
# The descriptions contain non-ASCII punctuation (curly quotes, em dashes), so the
# encoding is stated explicitly instead of relying on the platform default, which
# is not UTF-8 everywhere.
with open('../data/jobs.json', encoding='utf-8') as f:
    jobs = json.load(f)

# Number of postings loaded.
len(jobs)

87178

In [6]:
# Display the first posting to see which fields a job record carries.
jobs[0]

{'job_id': '1218575',
 'company_name': "Children's Nebraska",
 'title': 'Respiratory Therapist',
 'description': "At Children’s, the region’s only full-service pediatric healthcare center, our people make us the very best for kids. Come cultivate your passion, purpose and professional development in an environment of excellence and inclusion, where team members are supported and deeply valued. Opportunities for career growth abound as we grow our services and spaces, including the cutting-edge Hubbard Center for Children. Join our highly engaged, caring team—and join us in providing brighter, healthier tomorrows for the children we serve. Children's is committed to diversity and inclusion. We are an equal opportunity employer including veterans and people with disabilities.\nA Brief OverviewProvides appropriate respiratory care specific to the pediatric population in accordance with the hospital policy/procedure. Assesses, plans and implements appropriate respiratory plan of care based

In [7]:
# Recreate the Jobs collection from scratch so its schema always matches this cell.
if client.collections.exists('Jobs'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Jobs')

# Ollama is reached through Docker's host alias (presumably because Weaviate itself
# runs in a container — confirm against the deployment).
ollama_endpoint = 'http://host.docker.internal:11434'

# HNSW index with the bq quantizer, compared with squared-L2 distance.
jobs_vector_index = wvc.config.Configure.VectorIndex.hnsw(
    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
    distance_metric=wvc.config.VectorDistances.L2_SQUARED,
)

# A single named vector built from the text fields that describe a job.
jobs_vectorizer = wvc.config.Configure.NamedVectors.text2vec_ollama(
    api_endpoint=ollama_endpoint,
    model='nomic-embed-text',
    vector_index_config=jobs_vector_index,
    name='description_vector',
    source_properties=['description', 'skills', 'industry', 'title'],
    vectorize_collection_name=False,
)

# Every field is text except `remote`; `description` is tokenized on whitespace.
jobs_properties = [
    wvc.config.Property(name='job_id', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='company_name', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='title', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='description', data_type=wvc.config.DataType.TEXT, tokenization=wvc.config.Tokenization.WHITESPACE),
    wvc.config.Property(name='location', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='type', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='remote', data_type=wvc.config.DataType.BOOL),
    wvc.config.Property(name='skills', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='industry', data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name='application_url', data_type=wvc.config.DataType.TEXT),
]

jobs_collection = client.collections.create(
    name='Jobs',
    description='various job postings',
    vectorizer_config=[jobs_vectorizer],
    generative_config=wvc.config.Configure.Generative.ollama(api_endpoint=ollama_endpoint, model='llama3'),
    reranker_config=wvc.config.Configure.Reranker.cohere(model='rerank-english-v3.0'),
    properties=jobs_properties,
)

# Import every posting; the UUID is derived from the record itself via generate_uuid5.
with jobs_collection.batch.dynamic() as batch:
    for job in jobs:
        batch.add_object(properties=job, uuid=generate_uuid5(job))

# Objects rejected during the batch are collected on the collection rather than
# raised, so check them before claiming success.
failed_jobs = jobs_collection.batch.failed_objects
if failed_jobs:
    print(f'{len(failed_jobs)} of {len(jobs)} jobs failed to import. First error: {failed_jobs[0].message}')
else:
    print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [21]:
# if client.collections.exists('Roles'):
#     print('Dropping pre-exisiting collection')
#     client.collections.delete('Roles')
    
# roles_collection = client.collections.create(name='Roles',
#                                             description='Distinct roles',
#                                             vectorizer_config=[wvc.config.Configure.NamedVectors.text2vec_ollama(api_endpoint='http://host.docker.internal:11434',
#                                                                                                                  model='nomic-embed-text',
#                                                                                                                  vector_index_config=wvc.config.Configure.VectorIndex.flat(quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
#                                                                                                                                                                            distance_metric=wvc.config.VectorDistances.L2_SQUARED),
#                                                                                                                  name='role_vector', 
#                                                                                                                  source_properties=['role'], 
#                                                                                                                  vectorize_collection_name=False)],
#                                             properties=[wvc.config.Property(name='role', data_type=wvc.config.DataType.TEXT)])

# roles_collection.data.insert_many(roles)
# print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [22]:
# with open('../data/industries.pkl', 'rb') as f:
#     industries = pickle.load(f)
# industries = [{'industry': industry} for industry in industries]
# industries[0]

{'industry': 'Medical Equipment Manufacturing'}

In [43]:
# len(industries)

422

In [44]:
# if client.collections.exists('Industries'):
#     print('Dropping pre-exisiting collection')
#     client.collections.delete('Industries')
    
# industries_collection = client.collections.create(name='Industries',
#                                             description='distinct industries',
#                                             vectorizer_config=[wvc.config.Configure.NamedVectors.text2vec_ollama(api_endpoint='http://host.docker.internal:11434',
#                                                                                                                  model='nomic-embed-text',
#                                                                                                                  vector_index_config=wvc.config.Configure.VectorIndex.flat(),
#                                                                                                                  name='industry_vector', 
#                                                                                                                  source_properties=['industry'], 
#                                                                                                                  vectorize_collection_name=False)],
#                                             properties=[wvc.config.Property(name='industry', data_type=wvc.config.DataType.TEXT)])

# industries_collection.data.insert_many(industries)
# print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [26]:
# Load the extracted candidate summaries.
# The encoding is stated explicitly so decoding does not depend on the platform
# default, which is not UTF-8 everywhere.
with open('../data/extracted_summaries.json', encoding='utf-8') as f:
    summaries = json.load(f)

In [27]:
# Choose which candidate summary to work with; the job query below indexes
# `summaries` with `i`, so this cell must run before it.
i = 2
# Display the selected summary record.
summaries[i]

{'skills': ['Python',
  'C++',
  'Java',
  'R',
  'SQL',
  'Keras',
  'PyTorch',
  'Tensorflow',
  'Flask',
  'FastAPI',
  'AWS',
  'Azure',
  'Docker',
  'NATS',
  'Git',
  'Jira'],
 'location': '',
 'role_type': 'Flexible',
 'interested_roles': ['Machine Learning Engineer'],
 'industries': ['Healthcare', 'EdTech', 'Supply Chain'],
 'remote': 'Flexible',
 'team_fit': "I would excel at a team that values innovation, collaboration, and continuous learning. A team that is passionate about applying AI to real-world problems and making a positive impact on people's lives.",
 'summary': 'As a Machine Learning Engineer at Chegg, I apply my passion and expertise in Generative AI, Natural Language Processing, and Deep Learning to develop state-of-the-art AI applications that empower students and educators. I have built deep learning models for question answering, topic prediction, and author extraction, leveraging LLMs, CNNs, and traditional ML models. I am currently working on CheggMate, a re

In [28]:
# Match the selected candidate's summary against the job postings.
query = summaries[i]['summary']
# Bound to its own name: `roles` is also used for the list of role records that is
# inserted into the Roles collection, and this list of strings must not clobber it.
interested_roles = summaries[i]['interested_roles']

response = jobs_collection.query.hybrid(
    query=query,
    # Keyword part of the search; `description^2` weights description matches more heavily.
    query_properties=['description^2', 'skills', 'title'],
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
    target_vector='description_vector',
    # Vector part of the search, nudged towards the roles the candidate is interested in.
    vector=wvc.query.HybridVector.near_text(
        query=query,
        move_to=wvc.query.Move(force=0.5, concepts=interested_roles),
    ),
    return_metadata=wvc.query.MetadataQuery(score=True),
    alpha=0.6,
    limit=50,
    auto_limit=4,
    rerank=wvc.query.Rerank(prop='description', query=query),
)

print('QUERY:\n',query)
print('\nRETRIEVAL:')

# The printed value is the hybrid score requested via MetadataQuery. A reranker is
# also applied, so the printed scores are not necessarily in descending order.
# The loop variable is deliberately not called `object`, which would shadow the
# builtin for the rest of the kernel session.
for job_match in response.objects[:5]:
    display(job_match.properties['title'])
    display(job_match.properties['description'])
    print(f'Score: {job_match.metadata.score:.3f}')

QUERY:
 As a Machine Learning Engineer at Chegg, I apply my passion and expertise in Generative AI, Natural Language Processing, and Deep Learning to develop state-of-the-art AI applications that empower students and educators. I have built deep learning models for question answering, topic prediction, and author extraction, leveraging LLMs, CNNs, and traditional ML models. I am currently working on CheggMate, a revolutionary AI-powered study assistant that powers student learning experiences and helps them master any subject. I have a Master of Science in Computer Science, specializing in Artificial Intelligence, from Northeastern University, where I gained a solid foundation in Deep Learning, Machine Learning, Computer Vision, and Natural Language Processing. I also have 2.5 years of experience in Research and Applied AI in Healthcare, EdTech, and supply chain, working with fast-paced startups and prominent Big Tech corporations. I have strong skills in Python, C++, Java, R, SQL, Ker

' Sr GenAI / Machine Learning Engineer'

"Responsibilities:\nDesign and implement machine learning models and algorithms to address business challenges, with a focus on Generative AI applications.Collaborate with data scientists and domain experts to understand requirements, gather data, and develop solutions that meet business objectives.Develop and maintain scalable and efficient codebase for training, testing, and deploying machine learning models in production environments.Conduct research and experiment with new techniques and algorithms to improve model performance and accuracy.Work closely with software engineers and DevOps teams to integrate machine learning models into existing systems and infrastructure.Collaborate with stakeholders to understand feedback and iterate on machine learning models to continuously improve performance and user experience.Mentor junior team members and provide technical guidance and support on machine learning best practices and methodologies.Stay up-to-date with the latest advancements in

Score: 0.530


'Senior Machine Learning Engineer, Generative AI'

"Join a leading fintech company that’s democratizing finance for all.\n\nRobinhood was founded on a simple idea: that our financial markets should be accessible to all. With customers at the heart of our decisions, Robinhood is lowering barriers and providing greater access to financial information. Together, we are building products and services that help create a financial system everyone can participate in.\n\nAs we continue to build...\n\nWe’re seeking curious, growth minded thinkers to help shape our vision, structures and systems; playing a key-role as we launch into our ambitious future. If you’re invigorated by our mission, values, and drive to change the world — we’d love to have you apply.\n\nAbout the team + role \n\nWe are seeking a dedicated and ambitious individual to accelerate the development and expansion of products powered by Gen AI to democratize finance at an unprecedented pace. In this role, you'll play a key part in Robinhood’s forward trajectory, collaborating c

Score: 0.279


'Lead Machine Learning Engineer'

"Who We Are\n\nWe are a small deep-tech startup with specialized capacities in behavior change solutions, offering a proven combination of behavioral psychology, data analytics, and digital communications. Our company focuses effectively on creating science-based solutions that tend to embrace social purpose and enhance the quality of people’s lives by creating a variety of products that use cutting-edge machine learning and data science methods to model, segment, and create the products.\n\nWe are seeking a Lead Machine Learning Engineer interested in solving the most challenging problems leveraging Machine Learning, especially Natural Language Processing (NLP) and Natural Language Understanding (NLU), and information retrieval including topical classification, sentiment analysis, and behaviors (user intent detection). Looking for people curious and love problem-solving. You have to like building end-to-end products that have a focus on ethics and reliability.\n\nWe are looking for Se

Score: 0.247


'Senior AI Deep Learning Engineer - REMOTE'

"Dice is the leading career destination for tech experts at every stage of their careers. Our client, Perficient, Inc., is seeking the following. Apply via Dice today!\n\nWe currently have an exciting career opportunity for a Senior AI Deep Learning Engineer. While our headquarters location is St. Louis, MO, this position is remote and can be based anywhere within the United States and will be working ET zones.\n\nPerficient is always looking for the best and brightest talent and we need you! We're a quickly-growing, global digital consulting leader, and we're transforming the world's largest enterprises and biggest brands. You'll work with the latest technologies, expand your skills, and become a part of our global community of talented, diverse, and knowledgeable colleagues.\n\nPerficient is always looking for the best and brightest talent and we need you! We're a quickly growing, global digital consulting leader, and we're transforming the world's largest enterprises and biggest bra

Score: 0.400


'Senior AI Deep Learning Engineer - REMOTE'

'Job Description\n\nWe currently have an exciting career opportunity for a Senior AI Deep Learning Engineer. While our headquarters location is St. Louis, MO, this position is remote and can be based anywhere within the United States and will be working ET zones.\n\nPerficient is always looking for the best and brightest talent and we need you! We’re a quickly-growing, global digital consulting leader, and we’re transforming the world’s largest enterprises and biggest brands. You’ll work with the latest technologies, expand your skills, and become a part of our global community of talented, diverse, and knowledgeable colleagues.\n\nResponsibilities\n\nMachine Learning Development\n\nMaintains, as well as furthers, enhances existing machine learning modules for automotive applications including autonomous vehicles. Designs and implements new machine learning based approaches based on existing frameworks. Keeps up to speed with the state of the art of academic research and AI/ML technolo

Score: 0.386


## Roles

In [85]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this project.
with open('../data/roles.pkl', 'rb') as roles_file:
    role_names = pickle.load(roles_file)

# Wrap each entry in a dict keyed by 'role', the shape used for insertion below.
roles = [{'role': name} for name in role_names]

# Display the first record as a sanity check.
roles[0]

{'role': 'QA Tester'}

In [86]:
# Recreate the Roles collection from scratch so its schema always matches this cell.
if client.collections.exists('Roles'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Roles')

# Flat index with the bq quantizer, compared with squared-L2 distance.
roles_vector_index = wvc.config.Configure.VectorIndex.flat(
    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
    distance_metric=wvc.config.VectorDistances.L2_SQUARED,
)

# A single named vector built from the `role` text.
roles_vectorizer = wvc.config.Configure.NamedVectors.text2vec_ollama(
    api_endpoint='http://host.docker.internal:11434',
    model='nomic-embed-text',
    vector_index_config=roles_vector_index,
    name='role_vector',
    source_properties=['role'],
    vectorize_collection_name=False,
)

roles_collection = client.collections.create(
    name='Roles',
    description='Distinct roles',
    vectorizer_config=[roles_vectorizer],
    properties=[wvc.config.Property(name='role', data_type=wvc.config.DataType.TEXT)],
)

# insert_many reports per-object failures on its return value instead of raising,
# so inspect it before claiming success.
roles_insert_result = roles_collection.data.insert_many(roles)
if roles_insert_result.has_errors:
    print(f'{len(roles_insert_result.errors)} of {len(roles)} roles failed to insert: {roles_insert_result.errors}')
else:
    print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [94]:
# Choose which candidate summary to use for the role lookup below, which reads
# `summaries[i]['interested_roles']`.
i = 8
# Display the selected summary record.
summaries[i]

{'skills': ['Machine Learning',
  'ML Operations',
  'Keras',
  'Tensorflow (core, JS, Lite)',
  'Scikit-Learn',
  'Computer Vision',
  'Web Development',
  'Django',
  'Flask',
  'CI/CD',
  'Python/C/C++/Java',
  'Cloud Tech (AWS, Azure, GCP)'],
 'location': 'Flexible',
 'role_type': 'Full-time',
 'interested_roles': ['Software Engineering',
  'Data Science',
  'ML',
  'DL',
  'ML Ops'],
 'industries': ['Flexible'],
 'remote': 'Flexible',
 'team_fit': 'This person would excel at a team that values innovation, collaboration, and adaptability. They are well-suited for a fast-paced environment with tight deadlines and have a keen eye for product and user experience.',
 'summary': "I have 3 years of experience in building Machine Learning/Deep Learning applications (web/mobile/edge) and designing pipelines for end-to-end ML deployments (ML Ops). Currently I'm pursuing my Masters Degree at Northeastern University (Boston, US), on track to graduate in May 2024 with a major in Computer Scien

In [106]:
# For every role the candidate listed, fetch the five closest entries from the
# Roles collection, keyed by the role as the candidate wrote it.
detected_roles = {
    wanted_role: roles_collection.query.hybrid(
        query=wanted_role,
        query_properties=['role'],
        fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
        target_vector='role_vector',
        return_metadata=wvc.query.MetadataQuery(score=True),
        alpha=0.3,
        limit=5,
    ).objects
    for wanted_role in summaries[i]['interested_roles']
}

# Print each candidate-supplied role next to its matches and their hybrid scores,
# with a blank line between groups.
for wanted_role, role_matches in detected_roles.items():
    for role_match in role_matches:
        print(f'{wanted_role}: {role_match.properties["role"]}')
        print(f'Score: {role_match.metadata.score:.3f}')
    print()

Software Engineering: Software Engineering Manager
Score: 0.980
Software Engineering: Software Engineering Director
Score: 0.700
Software Engineering: Software Manager
Score: 0.484
Software Engineering: Engineering Supervisor
Score: 0.452
Software Engineering: Shop Manager
Score: 0.300

Data Science: Data Science Internship
Score: 0.936
Data Science: Data Science Manager
Score: 0.700
Data Science: Decision Science Analyst
Score: 0.445
Data Science: Data Technician
Score: 0.414
Data Science: Science Teacher
Score: 0.405

ML: AI/ML Lead
Score: 0.700
ML: LPN
Score: 0.300
ML: Assistant Controller
Score: 0.261
ML: Assistant
Score: 0.260
ML: HV Technician
Score: 0.252

DL: Retail Sales Associate
Score: 0.300
DL: Service Director
Score: 0.265
DL: Cloud Authentication Engineer
Score: 0.249
DL: Music Therapist
Score: 0.246
DL: Labor Lawyer
Score: 0.245

ML Ops: AI/ML Lead
Score: 0.700
ML Ops: Product Marketing Coordinator
Score: 0.300
ML Ops: Supply Chain Analyst
Score: 0.273
ML Ops: Board Dire

## Industries

In [58]:
# Choose which candidate summary to use for the industry lookup below, which reads
# `summaries[i]['industries']`.
i = 5
# Display the selected summary record.
summaries[i]

{'skills': ['Data Science',
  'Business Administration',
  'Social Innovation and Entrepreneurship'],
 'location': '',
 'role_type': 'Internship',
 'interested_roles': ['Tech', 'Finance'],
 'industries': ['Tech', 'Finance'],
 'remote': 'Flexible',
 'team_fit': 'I am a graduating senior at Northeastern studying Data Science & Business Administration, with a concentration in social innovation and entrepreneurship. Currently searching for Summer 2024 internships in Tech and Finance before returning to Northeastern as a graduate student to complete a Masters in Data Science as part of the PlusOne program.',
 'summary': 'I am a graduating senior at Northeastern studying Data Science & Business Administration, with a concentration in social innovation and entrepreneurship. Currently searching for Summer 2024 internships in Tech and Finance before returning to Northeastern as a graduate student to complete a Masters in Data Science as part of the PlusOne program.'}

In [59]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this project.
with open('../data/industries.pkl', 'rb') as industries_file:
    industry_names = pickle.load(industries_file)

# Wrap each entry in a dict keyed by 'industry', the shape used for insertion below.
industries = [{'industry': name} for name in industry_names]

# Display the first record as a sanity check.
industries[0]

{'industry': 'Medical Equipment Manufacturing'}

In [79]:
# Recreate the Industries collection from scratch so its schema always matches this cell.
if client.collections.exists('Industries'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Industries')

# NOTE(review): this collection embeds with 'llama3', while the Jobs and Roles
# collections embed with 'nomic-embed-text'. The recorded lookups for 'Tech' and
# 'Finance' rank unrelated industries first with near-identical scores — confirm
# that this model choice is intentional.
industries_vectorizer = wvc.config.Configure.NamedVectors.text2vec_ollama(
    api_endpoint='http://host.docker.internal:11434',
    model='llama3',
    # Flat index compared with cosine distance.
    vector_index_config=wvc.config.Configure.VectorIndex.flat(distance_metric=wvc.config.VectorDistances.COSINE),
    name='industry_vector',
    source_properties=['industry'],
    vectorize_collection_name=False,
)

industries_collection = client.collections.create(
    name='Industries',
    description='distinct industries',
    vectorizer_config=[industries_vectorizer],
    properties=[wvc.config.Property(name='industry', data_type=wvc.config.DataType.TEXT)],
)

# insert_many reports per-object failures on its return value instead of raising,
# so inspect it before claiming success.
industries_insert_result = industries_collection.data.insert_many(industries)
if industries_insert_result.has_errors:
    print(f'{len(industries_insert_result.errors)} of {len(industries)} industries failed to insert: {industries_insert_result.errors}')
else:
    print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [83]:
# For every industry named in the candidate summary, fetch the ten closest entries
# from the Industries collection, keyed by the industry as the candidate wrote it.
detected_industries = {}

for industry in summaries[i]['industries']:

    response = industries_collection.query.hybrid(query=industry,
                                            query_properties=['industry'],
                                            fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
                                            target_vector='industry_vector',
                                            return_metadata=wvc.query.MetadataQuery(score=True),
                                            alpha=0.2,
                                            limit=10)

    detected_industries[industry] = response.objects

# Print each candidate-supplied industry followed by its matches and hybrid scores.
for generated_industry, existing_industries in detected_industries.items():

    print('Genereated Industry:', generated_industry)
    # The loop variable is deliberately not called `object`, which would shadow the
    # builtin for the rest of the kernel session.
    for industry_match in existing_industries:
        print(industry_match.properties['industry'], industry_match.metadata.score)
    print()

Genereated Industry: Tech
Oil 0.20000000298023224
Construction 0.1998026967048645
Education 0.1995142102241516
Gas 0.19942273199558258
Air 0.19941894710063934
Restaurants 0.19940251111984253
Water 0.19927307963371277
Research 0.19926999509334564
Technology 0.19925904273986816
Health 0.19913141429424286

Genereated Industry: Finance
Construction 0.20000000298023224
Education 0.19979368150234222
Restaurants 0.1996726542711258
Technology 0.19963811337947845
Design 0.1995166540145874
Insurance 0.19947028160095215
Research 0.19944149255752563
Movies 0.19920296967029572
Health 0.1991831660270691
Oil 0.1991124451160431

