In [8]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
from weaviate.connect import ConnectionParams
from weaviate.util import generate_uuid5
import ollama
import os
import pickle
import json

In [3]:
# Open a client against the locally running Weaviate instance, overriding the
# default timeouts via AdditionalConfig.
# NOTE(review): the pair is presumably (connect, read) in seconds, with the
# large second value leaving room for slow vectorization during import — confirm.
local_connection_config = wvc.init.AdditionalConfig(timeout=(60, 7500))
client = weaviate.connect_to_local(additional_config=local_connection_config)

In [5]:
# Load the raw job postings from disk.
# The encoding is pinned to UTF-8 because the postings contain non-ASCII
# punctuation (curly quotes, em dashes); without it open() falls back to the
# platform locale, which can mis-decode or fail on some systems.
with open('../data/jobs.json', encoding='utf-8') as f:
    jobs = json.load(f)

# Last expression of the cell: displays how many postings were loaded.
len(jobs)

87178

In [6]:
# Peek at the first posting to see which keys a job record carries.
jobs[0]

{'job_id': '1218575',
 'company_name': "Children's Nebraska",
 'title': 'Respiratory Therapist',
 'description': "At Children’s, the region’s only full-service pediatric healthcare center, our people make us the very best for kids. Come cultivate your passion, purpose and professional development in an environment of excellence and inclusion, where team members are supported and deeply valued. Opportunities for career growth abound as we grow our services and spaces, including the cutting-edge Hubbard Center for Children. Join our highly engaged, caring team—and join us in providing brighter, healthier tomorrows for the children we serve. Children's is committed to diversity and inclusion. We are an equal opportunity employer including veterans and people with disabilities.\nA Brief OverviewProvides appropriate respiratory care specific to the pediatric population in accordance with the hospital policy/procedure. Assesses, plans and implements appropriate respiratory plan of care based

In [7]:
# Rebuild the 'Jobs' collection from scratch so this cell can be re-run:
# any existing collection of that name is dropped first.
if client.collections.exists('Jobs'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Jobs')

# One named vector ('description_vector') is built by the Ollama text2vec
# module from the description, skills, industry and title properties. The
# collection also gets an Ollama generative module and a Cohere reranker.
# NOTE(review): 'host.docker.internal' suggests Weaviate runs in Docker and
# reaches Ollama on the host machine — confirm for your deployment.
jobs_collection = client.collections.create(
    name='Jobs',
    description='various job postings',
    vectorizer_config=[
        wvc.config.Configure.NamedVectors.text2vec_ollama(
            api_endpoint='http://host.docker.internal:11434',
            model='nomic-embed-text',
            vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
                distance_metric=wvc.config.VectorDistances.L2_SQUARED,
            ),
            name='description_vector',
            source_properties=['description', 'skills', 'industry', 'title'],
            vectorize_collection_name=False,
        )
    ],
    generative_config=wvc.config.Configure.Generative.ollama(
        api_endpoint='http://host.docker.internal:11434',
        model='llama3',
    ),
    reranker_config=wvc.config.Configure.Reranker.cohere(model='rerank-english-v3.0'),
    properties=[
        wvc.config.Property(name='job_id', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='company_name', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='title', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='description', data_type=wvc.config.DataType.TEXT,
                            tokenization=wvc.config.Tokenization.WHITESPACE),
        wvc.config.Property(name='location', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='type', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='remote', data_type=wvc.config.DataType.BOOL),
        wvc.config.Property(name='skills', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='industry', data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name='application_url', data_type=wvc.config.DataType.TEXT),
    ],
)

# Import every posting through the dynamic batcher. The UUID is derived from
# the job dict itself via generate_uuid5, so the same posting always maps to
# the same object id.
with jobs_collection.batch.dynamic() as batch:
    for job in jobs:
        obj_uuid = generate_uuid5(job)
        batch.add_object(properties=job,
                         uuid=obj_uuid)

# The batcher does not raise for individual objects that fail; it collects them
# instead. Report them rather than claiming success unconditionally.
failed_jobs = jobs_collection.batch.failed_objects
if failed_jobs:
    print(f'{len(failed_jobs)} of {len(jobs)} job objects failed to import')
    print(f'First error: {failed_jobs[0].message}')
else:
    print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [19]:
# Read the pickled collection of role names.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a file you produced or otherwise trust.
with open('../data/roles.pkl', 'rb') as roles_file:
    role_names = pickle.load(roles_file)

# Wrap every name in a one-key record, the shape used for the 'role' property.
roles = [{'role': role_name} for role_name in role_names]

In [20]:
# Show the first role record to confirm the {'role': ...} shape.
roles[0]

{'role': 'QA Tester'}

In [21]:
# Rebuild the 'Roles' collection from scratch so this cell can be re-run:
# any existing collection of that name is dropped first.
if client.collections.exists('Roles'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Roles')

# A single TEXT property ('role'), vectorized by the Ollama text2vec module
# into the named vector 'role_vector' on a flat index with binary quantization.
roles_collection = client.collections.create(
    name='Roles',
    description='Distinct roles',
    vectorizer_config=[
        wvc.config.Configure.NamedVectors.text2vec_ollama(
            api_endpoint='http://host.docker.internal:11434',
            model='nomic-embed-text',
            vector_index_config=wvc.config.Configure.VectorIndex.flat(
                quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
                distance_metric=wvc.config.VectorDistances.L2_SQUARED,
            ),
            name='role_vector',
            source_properties=['role'],
            vectorize_collection_name=False,
        )
    ],
    properties=[wvc.config.Property(name='role', data_type=wvc.config.DataType.TEXT)],
)

# insert_many reports per-object failures on its return value instead of
# raising, so inspect it before announcing success.
roles_insert_result = roles_collection.data.insert_many(roles)
if roles_insert_result.has_errors:
    print(f'{len(roles_insert_result.errors)} of {len(roles)} role objects failed to import')
    print(f'First error: {next(iter(roles_insert_result.errors.values())).message}')
else:
    print('Inserted data!')

Dropping pre-exisiting collection
Inserted data!


In [22]:
# Read the pickled collection of industry names.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a file you produced or otherwise trust.
with open('../data/industries.pkl', 'rb') as industries_file:
    industry_names = pickle.load(industries_file)

# Wrap every name in a one-key record, the shape used for the 'industry' property.
industries = [{'industry': industry_name} for industry_name in industry_names]

# Last expression of the cell: display the first record as a sanity check.
industries[0]

{'industry': 'Medical Equipment Manufacturing'}

In [23]:
# Rebuild the 'Industries' collection from scratch so this cell can be re-run:
# any existing collection of that name is dropped first.
if client.collections.exists('Industries'):
    print('Dropping pre-exisiting collection')
    client.collections.delete('Industries')

# A single TEXT property ('industry'), vectorized by the Ollama text2vec module
# into the named vector 'industry_vector' on a flat index with binary
# quantization.
industries_collection = client.collections.create(
    name='Industries',
    description='Distinct Industries',
    vectorizer_config=[
        wvc.config.Configure.NamedVectors.text2vec_ollama(
            api_endpoint='http://host.docker.internal:11434',
            model='nomic-embed-text',
            vector_index_config=wvc.config.Configure.VectorIndex.flat(
                quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(),
                distance_metric=wvc.config.VectorDistances.L2_SQUARED,
            ),
            name='industry_vector',
            source_properties=['industry'],
            vectorize_collection_name=False,
        )
    ],
    properties=[wvc.config.Property(name='industry', data_type=wvc.config.DataType.TEXT)],
)

# insert_many reports per-object failures on its return value instead of
# raising, so inspect it before announcing success.
industries_insert_result = industries_collection.data.insert_many(industries)
if industries_insert_result.has_errors:
    print(f'{len(industries_insert_result.errors)} of {len(industries)} industry objects failed to import')
    print(f'First error: {next(iter(industries_insert_result.errors.values())).message}')
else:
    print('Inserted data!')

Inserted data!


In [30]:
# Load the extracted candidate summaries.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale that open() would otherwise fall back to.
# NOTE(review): this path is 'data/...' while the other cells read from
# '../data/...' — confirm which working directory this cell expects.
with open('data/extracted_summaries.json', encoding='utf-8') as f:
    summaries = json.load(f)

In [31]:
# Index of the candidate summary to work with; the query cell reads `i` too.
i = 2
# Last expression of the cell: display the selected summary record.
summaries[i]

{'skills': ['Python',
  'C++',
  'Java',
  'R',
  'SQL',
  'Keras',
  'PyTorch',
  'Tensorflow',
  'Flask',
  'FastAPI',
  'AWS',
  'Azure',
  'Docker',
  'NATS',
  'Git',
  'Jira'],
 'location': 'Flexible',
 'role_type': 'Flexible',
 'interested_roles': ['Machine Learning Engineer'],
 'industries': ['Healthcare', 'EdTech', 'Supply Chain'],
 'team_fit': 'I would excel at a team that values innovation, collaboration, and open communication. A team that is passionate about leveraging AI to make a positive impact in society and is willing to take calculated risks to drive progress.',
 'summary': 'As a Machine Learning Engineer at Chegg, I apply my passion and expertise in Generative AI, Natural Language Processing, and Deep Learning to develop state-of-the-art AI applications that empower students and educators. I have built deep learning models for question answering, topic prediction, and author extraction, leveraging LLMs, CNNs, and traditional ML models. I am currently working on Cheg

In [46]:
# Use the selected candidate's free-text summary as the search text.
query = summaries[i]['summary']
# Stored under its own name: reusing `roles` here would overwrite the list of
# role records that the 'Roles' import cell inserts, breaking a re-run of it.
interested_roles = summaries[i]['interested_roles']

# Hybrid (keyword + vector) search over the 'Jobs' collection.
# - The keyword side looks at description, skills and industry, with
#   'description' boosted via the '^2' suffix.
# - The vector side targets 'description_vector' and is a near-text search on
#   the same query, moved toward the candidate's interested roles.
# - The hits are then reranked against the 'description' property.
response = jobs_collection.query.hybrid(
    query=query,
    query_properties=['description^2', 'skills', 'industry'],
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
    target_vector='description_vector',
    vector=wvc.query.HybridVector.near_text(
        query=query,
        move_to=wvc.query.Move(force=0.5, concepts=interested_roles),
    ),
    return_metadata=wvc.query.MetadataQuery(score=True),
    alpha=0.6,
    limit=50,
    auto_limit=4,
    rerank=wvc.query.Rerank(prop='description', query=query),
)

print('QUERY:\n',query)
print('\nRETRIEVAL:')

# Show the top five hits. The loop variable is deliberately not named `object`:
# at notebook top level that would rebind the builtin for every later cell.
# NOTE(review): the printed value is the hybrid score requested through
# MetadataQuery(score=True); the hits are presumably ordered by the reranker,
# so these scores need not be descending.
for job_match in response.objects[:5]:
    display(job_match.properties['title'])
    display(job_match.properties['description'])
    print(f'Score: {job_match.metadata.score:.3f}')

QUERY:
 As a Machine Learning Engineer at Chegg, I apply my passion and expertise in Generative AI, Natural Language Processing, and Deep Learning to develop state-of-the-art AI applications that empower students and educators. I have built deep learning models for question answering, topic prediction, and author extraction, leveraging LLMs, CNNs, and traditional ML models. I am currently working on CheggMate, a revolutionary AI-powered study assistant that powers student learning experiences and helps them master any subject. I have a Master of Science in Computer Science, specializing in Artificial Intelligence, from Northeastern University, where I gained a solid foundation in Deep Learning, Machine Learning, Computer Vision, and Natural Language Processing. I also have 2.5 years of experience in Research and Applied AI in Healthcare, EdTech, and supply chain, working with fast-paced startups and prominent Big Tech corporations. I have strong skills in Python, C++, Java, R, SQL, Ker

' Sr GenAI / Machine Learning Engineer'

"Responsibilities:\nDesign and implement machine learning models and algorithms to address business challenges, with a focus on Generative AI applications.Collaborate with data scientists and domain experts to understand requirements, gather data, and develop solutions that meet business objectives.Develop and maintain scalable and efficient codebase for training, testing, and deploying machine learning models in production environments.Conduct research and experiment with new techniques and algorithms to improve model performance and accuracy.Work closely with software engineers and DevOps teams to integrate machine learning models into existing systems and infrastructure.Collaborate with stakeholders to understand feedback and iterate on machine learning models to continuously improve performance and user experience.Mentor junior team members and provide technical guidance and support on machine learning best practices and methodologies.Stay up-to-date with the latest advancements in

Score: 0.499


'Lead Machine Learning Engineer'

"Who We Are\n\nWe are a small deep-tech startup with specialized capacities in behavior change solutions, offering a proven combination of behavioral psychology, data analytics, and digital communications. Our company focuses effectively on creating science-based solutions that tend to embrace social purpose and enhance the quality of people’s lives by creating a variety of products that use cutting-edge machine learning and data science methods to model, segment, and create the products.\n\nWe are seeking a Lead Machine Learning Engineer interested in solving the most challenging problems leveraging Machine Learning, especially Natural Language Processing (NLP) and Natural Language Understanding (NLU), and information retrieval including topical classification, sentiment analysis, and behaviors (user intent detection). Looking for people curious and love problem-solving. You have to like building end-to-end products that have a focus on ethics and reliability.\n\nWe are looking for Se

Score: 0.127


'Machine Learning Engineer'

'Machine Learning Engineer\nWE’RE LOOKING FOR A MACHINE LEARNING ENGINEER TO BECOME AN INSTRUMENTAL PART OF OUR NEXT PHASE OF GROWTH!\nMachine Learning EngineerHybrid in Austin$200,000 - $250,000 Depending on experienceTo apply please email/ jack.crowley@searchability.com\nWe are a leading innovator in the AI sector, working on making machines as advanced as possible!\nWHO ARE WE?We are a high tech AI company who are shaping the way machines learn and interact with humans - If you are looking to join an exciting company, get in touch!\nWHAT WILL YOU BE DOING?\nYou will be working on our flagship products - Assisting in the rapid deployment of products that appeal to the market we are in, with the challenge of then scaling it afterwards. This role will give you the chance to work closely with our CTO and be a vital part of our growing tech team.\nWE NEED YOU TO HAVE….Solid Python ExperienceWork professionally with PyTorchSolid C++ experience\nIT’S NICE TO HAVE….NLP/AI and ML experienceO

Score: 0.197


'Sr. GenAI Engineer'

'\nHi,Greetings of the day!I have an urgent requirement for you with one of my clients. I am sharing the job description with you, please go through it and reply to me with your updated resume if you find it interesting.Position :- Sr. GenAI EngineerLocation :- Irving Texas (Day 1 onsite)Fulltime RolesDesigning, developing, and implementing generative AI models and algorithms utilizing state-of-the-art techniques such as GPT, VAE, and GANsColaborating with cross-functional team to define the AI/Gen I requirements.Optimizing the existing GenAI models for performance improvement, scalability, and efficiencyDevelop and maintain the AI Pipeline that includes data processing, feature extraction, model training and evaluation.Collaboration with software engineering and operations teams to ensure seamless integration and deployment of AI models.Develop the documentation like technical specification, user guides, technical architecture, etc.SkillsBachelor’s or master’s degree in computer scien

Score: 0.143


'Artificial Intelligence Engineer'

'Position Summary: \nTitle: Architect Premium III – AI EngineerDuration: 3 Months - Long TermLocation: Washington, DC 20433\nHybrid Onsite: 4 Days per week from Day1. \nHM Comments: We are seeking an architect who is an expert in AI and Machine Learning areas.\nRoles and Responsibilities for AI Engineer:The AI engineer will be responsible for designing, developing, and deploying AI models based on training data sets or using generative AI. The role will focus on leveraging Azure Cloud services to build enterprise-level solutions that meet the specific needs of the organization.\nKey Responsibilities:Develop and implement AI models and algorithms.Ability to build application including front end to show finished product.Design and develop software applications that integrate AI technologies, including Generative AI, machine learning, and natural language processing.Collaborate with data scientists and other stakeholders to identify business requirements and develop solutions that meet th

Score: 0.220
