In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline, AutoModelForSeq2SeqLM, TextStreamer
import torch
import intel_extension_for_pytorch as ipex
import ast
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm
import re



def generate_tag(pipe,resume):
    """Ask the LLM for job titles that fit a resume.

    Args:
        pipe: Callable text-generation pipeline. ``pipe(prompt)`` must return
            a sequence whose first item is a mapping with a
            ``"generated_text"`` entry.
        resume: Resume text that is interpolated into the prompt.

    Returns:
        A string of the form ``"[...]"`` containing the body of the first
        bracketed list found in the model's answer, or ``"[]"`` when the
        answer contains no bracketed list. The string is meant to be parsed
        by the caller (e.g. with ``ast.literal_eval``).
    """

    prompt = f""" <s> [INST]

    You're an LLM who's role is to read the content of the resume, and provide job titles related to their resume that you can generate from that. Eg. [Machine Learning Engineer, Software development Engineer, Backend Engineer] which can be used as filters for job-hunting. Your response should strictly follow the syntax of a python list.


    Resume_content: {resume} [/INST]

    """

    output = pipe(prompt)[0]["generated_text"]

    # Isolate the model's answer. The prompt itself contains a bracketed
    # example list, so it must be removed before searching for the result.
    marker = "/INST]"
    if output.startswith(prompt):
        # The pipeline echoed the prompt verbatim: drop exactly that prefix.
        response = output[len(prompt):]
    else:
        marker_index = output.find(marker)
        if marker_index == -1:
            # No prompt echo at all: the whole text is the answer.
            response = output
        else:
            response = output[marker_index + len(marker):]

    # DOTALL lets the list span several lines. The match is non-greedy, so a
    # title that itself contains "]" would cut the list short.
    matches = re.search(r'\[(.*?)\]', response, re.DOTALL)

    if matches is None:
        # Nothing list-like in the answer: hand back an empty, parseable list
        # instead of wrapping arbitrary text in brackets.
        return "[]"

    list_body = matches.group(1)
    print(list_body)

    return f"[{list_body}]"


def validate_jobs(pipe,resume,jobs_json,preferences):
    """Ask the LLM to pick the most relevant jobs for a resume.

    Args:
        pipe: Callable text-generation pipeline. ``pipe(prompt)`` must return
            a sequence whose first item is a mapping with a
            ``"generated_text"`` entry.
        resume: Resume text interpolated into the prompt.
        jobs_json: The available jobs, interpolated into the prompt as-is.
        preferences: The user's preferences, interpolated into the prompt
            as-is.

    Returns:
        The ``"generated_text"`` value of the first pipeline result,
        unmodified. NOTE(review): if the pipeline echoes the prompt, the
        echo is still part of this value and the caller has to strip it.
    """

    prompt = f"""<s>[INST]

    You're an LLM who's role is to recieve a list of available jobs, a list of preferences and a resume as input, and your output will be a JSON RESPONSE of the top 10 most relevant jobs among the given jobs and should contain absolutely no texts other than that, based on the contents of the resume and user's preferences. The output should strictly follow the same format as the input json.

    Resume_content: {resume}, jobs: {jobs_json}, preferences: {preferences} [/INST]

"""

    return pipe(prompt)[0]["generated_text"]
    

def get_linkedin(url):
    """Scrape job cards from a LinkedIn public job-search page.

    Args:
        url: Address of the search-result page to download.

    Returns:
        A list with one dict per ``base-search-card__info`` element found on
        the page, with the keys ``"job_title"``, ``"company_name"``,
        ``"location"``, ``"salary_info"`` and ``"Link"``. Fields that are
        missing from a card are filled with a placeholder string instead of
        aborting the whole scrape.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the request times out.
    """

    def _text_or(node, default):
        # Stripped text of a parsed node, or the placeholder when absent.
        return node.text.strip() if node is not None else default

    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    all_job_details = []

    for card in soup.find_all("div", class_="base-search-card__info"):
        title_node = card.find("h3", class_="base-search-card__title")
        location_node = card.find("span", class_="job-search-card__location")
        salary_node = card.find("span", class_="job-search-card__salary-info")

        # The company name and the link are both read from this one anchor.
        company_anchor = card.find("a", class_="hidden-nested-link")
        if company_anchor is not None:
            apply_link = company_anchor.get("href", "unavailable")
        else:
            apply_link = "unavailable"

        all_job_details.append({
            "job_title": _text_or(title_node, "Unavailable"),
            "company_name": _text_or(company_anchor, "Unavailable"),
            "location": _text_or(location_node, "Unavailable"),
            "salary_info": _text_or(salary_node, "Salary Info not available"),
            "Link": apply_link
        })

    return all_job_details

def create_search_querries(tags:str):
    """Run one LinkedIn job search per tag and collect the results.

    Args:
        tags: Source text of a Python list of job-title strings, e.g.
            ``'["Backend Engineer", "Data Analyst"]'``. Newlines and
            surrounding whitespace are removed before parsing.

    Returns:
        A list containing, for each of the first five usable tags, the list
        returned by ``get_linkedin`` for that tag's search URL. Entries that
        are not strings, or are blank, are skipped.

    Raises:
        ValueError, SyntaxError: If ``tags`` is not a valid Python literal
            (propagated from ``ast.literal_eval``).
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import quote

    parsed = ast.literal_eval(tags.replace('\n','').strip())

    # A single bare string would otherwise be iterated character by character.
    if isinstance(parsed, str):
        parsed = [parsed]

    usable_tags = [tag for tag in parsed if isinstance(tag, str) and tag.strip()]

    jobs = []

    for tag in usable_tags[:5]:
        # Percent-encode the whole keyword so characters such as "(", ","
        # or "&" cannot corrupt the query string. The leading space is kept
        # so plain tags yield the same "%20..." keywords as before.
        linkedin_keywords = quote(f" {tag}", safe="")

        linkedin_query = f"https://www.linkedin.com/jobs/search?keywords={linkedin_keywords}&location=US&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

        jobs.append(get_linkedin(linkedin_query))

    # NOTE(review): an Internshala search URL used to be assembled here but
    # was never requested; get_internshala is not called from this function.

    return jobs


def get_internshala(url):
    """Scrape internship listings from an Internshala search page.

    Args:
        url: Address of the search-result page to download.

    Returns:
        A list with one dict per ``internship_meta`` element found on the
        page, with the keys ``"Internship Name"``, ``"Company Name"``,
        ``"Location"``, ``"Start Date"``, ``"Duration"``, ``"Stipend"`` and
        ``"Applying Link"``. Fields that are missing from a listing are
        filled with ``"Unavailable"``.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the request times out.
    """

    def _text_or_unavailable(node):
        # Stripped text of a parsed node, or a placeholder when absent.
        return node.text.strip() if node is not None else "Unavailable"

    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=30)

    soup = BeautifulSoup(response.content, "html.parser")

    all_internship_details = []

    for container in soup.find_all("div", class_="internship_meta"):
        # The first "item_body" holds the start date, the second the duration.
        item_bodies = container.find_all("div", class_="item_body")
        start_node = item_bodies[0] if len(item_bodies) > 0 else None
        duration_node = item_bodies[1] if len(item_bodies) > 1 else None

        # Look the link up inside this listing. Searching the whole page
        # would give every internship the first link on the page.
        detail_anchor = container.find("a", class_="view_detail_button")
        href = detail_anchor.get("href") if detail_anchor is not None else None
        applying_link = f"https://internshala.com{href}" if href else "Unavailable"

        all_internship_details.append({
            "Internship Name": _text_or_unavailable(
                container.find("h3", class_="heading_4_5 profile")
            ),
            "Company Name": _text_or_unavailable(
                container.find("a", class_="link_display_like_text view_detail_button")
            ),
            "Location": _text_or_unavailable(
                container.find("a", class_="location_link view_detail_button")
            ),
            "Start Date": _text_or_unavailable(start_node),
            "Duration": _text_or_unavailable(duration_node),
            "Stipend": _text_or_unavailable(container.find("span", class_="stipend")),
            "Applying Link": applying_link
        })

    return all_internship_details


def get_results(resume_content,preferences):
    """Generate job-title tags from a resume and return matching job postings.

    The model and tokenizer are loaded on every call, the tags are produced
    by ``generate_tag`` and the postings are fetched by
    ``create_search_querries``.

    Args:
        resume_content: Resume text handed to the tag-generation prompt.
        preferences: The user's preferences. Currently unused: the job
            postings are returned without a ranking step.

    Returns:
        A flat list with every job dict from every search, in search order.
    """

    tag_generator_path = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"

    tag_generator = AutoModelForCausalLM.from_pretrained(
        tag_generator_path,
        low_cpu_mem_usage=True,
        device_map="auto"
    )

    tag_generator = ipex.optimize(tag_generator,dtype=torch.float16)

    tokenizer = AutoTokenizer.from_pretrained(tag_generator_path)

    # Streams the generated tokens to stdout while the model is running.
    tag_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_params = {
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.95,
        "top_k": 40,
        "max_new_tokens": 2048,
        "repetition_penalty": 1.1
    }

    tag_pipe = pipeline(
        "text-generation",
        model=tag_generator,
        tokenizer=tokenizer,
        streamer=tag_streamer,
        **generation_params
    )

    # TODO: a second "validator" stage is not wired in yet. The earlier draft
    # loaded "mistralai/Mixtral-8x7B-Instruct-v0.1" into its own
    # text-generation pipeline (temperature 0.7, max_new_tokens 1000) so the
    # scraped jobs could be ranked against `preferences` via validate_jobs.

    jobs = create_search_querries(generate_tag(tag_pipe,resume_content))

    print("jobs created")

    # Flatten the per-search result lists. Every non-empty result is kept,
    # including searches that returned exactly one posting.
    jobs_list = [job for search_results in jobs for job in search_results]

    return jobs_list

In [3]:
!echo "List of Intel GPUs available on the system:"
!xpu-smi  discovery 2> /dev/null
!echo "Intel Xeon CPU used by this notebook:"
!lscpu | grep "Model name"

List of Intel GPUs available on the system:
+-----------+--------------------------------------------------------------------------------------+
| Device ID | Device Information                                                                   |
+-----------+--------------------------------------------------------------------------------------+
| 0         | Device Name: Intel(R) Data Center GPU Max 1100                                       |
|           | Vendor Name: Intel(R) Corporation                                                    |
|           | SOC UUID: 00000000-0000-0029-0000-002f0bda8086                                       |
|           | PCI BDF Address: 0000:29:00.0                                                        |
|           | DRM Device: /dev/dri/card0                                                           |
|           | Function Type: physical                                                              |
+-----------+----------------------------------

In [2]:
resume_content = """

Summary: Summary: Motivated undergraduate in Artificial Intelligence and Data Science, with aspirations to excel as a Machine 
Learning Engineer and Software Developer. Possessing expertise in Computer Vision, Generative AI, and 
Natural Language Processing, I am actively pursuing internships to apply this knowledge in real-world 
settings and make meaningful contributions to the domains of AI and software development.

Languages:
Python
JavaScript
HTML
SQL

Data Analytics Tools:
Numpy
Pandas
Matplotlib
Seaborn
Hadoop


Domain Skills:
Computer Vision
NLP
Image Processing
GANs


Frameworks:
Pytorch
Tensorflow
Scikit-Learn
Django
React.js


Experience: Machine Learning Research Intern NIT Trichy 

Conducted a research-based internship focused on optimizing Generative Adversarial Networks 
(GANs) to enhance data augmentation efficiency, resulting in reduced training time and improved 
accuracy, culminating in the publication of a groundbreaking research paper.
• Collaborated with fellow interns to analyze potential obstacles in training GANs for data augmentation 
purposes, and successfully devised and implemented algorithms to mitigate and overcome these 
challenges.
• Developed a Machine Learning Pipeline Python using PyTorch and scikit-learn to detect image 
forgeries, successfully identifying and highlighting suspicious areas in images through computer vision 
techniques and neural networks.


Projects:

Crime Hotspot Map
A Machine Learning project on 
predicting the likeliness of user 
being a victim of crime based on 
their current location using the K-
Nearest Neighbour Regressor 
algorithm, integrated with Leaflet 
API and Django Framework.


Brain Tumour Detection on MRIs
Designed and deployed a web-
based diagnostic system using 
Convolutional Neural Networks 
(CNNs) and TensorFlow, enabling 
accurate detection and 
classification of brain tumours in 
MRI scan images, with the
application built on the Django 
framework.


Image to Prompt
A CNN and LSTM based model
developed using Pytorch to 
generate descriptive prompts from 
images, inverting the Stable 
Diffusion Image generation 
approach.

"""

preferences = "I would prefer a development role over a research role"

In [17]:
print(get_results(resume_content,preferences))

2024-03-09 20:48:55,085 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["Machine Learning Researcher", "Generative Adversarial Networks Optimizer", "Data Analyst (Python, Numpy, Pandas, Matplotlib, Seaborn, Hadoop)", "Computer Vision Engineer", "Natural Language Processing Engineer", "Deep Learning Engineer (PyTorch, TensorFlow, Scikit-Learn)", "Full Stack Developer (Django, React.js)" ]
"Machine Learning Researcher", "Generative Adversarial Networks Optimizer", "Data Analyst (Python, Numpy, Pandas, Matplotlib, Seaborn, Hadoop)", "Computer Vision Engineer", "Natural Language Processing Engineer", "Deep Learning Engineer (PyTorch, TensorFlow, Scikit-Learn)", "Full Stack Developer (Django, React.js)" 
jobs created
[{'job_title': 'AI/ML Developer', 'company_name': 'PETADATA', 'location': 'San Francisco, CA', 'salary_info': 'Salary Info not available', 'Link': 'https://www.linkedin.com/company/petadata?trk=public_jobs_jserp-result_job-search-card-subtitle'}, {'job_title': 'AI Software Developer', 'company_name': 'Unreal Staffing, Inc', 'location': 'San Franci