# Clean Text

In [2]:
import re

def clean_text(text):
    """Reduce scraped page text to space-separated ASCII alphanumeric words.

    Steps, in order: drop HTML/XML tags, drop http(s) URLs, turn every run of
    whitespace into a single space, delete every character that is not an
    ASCII letter, digit or space, then collapse and trim the remaining spaces.

    Args:
        text: Raw text, e.g. the page content of a scraped web page.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    text = re.sub(r'<[^>]*?>', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Newlines, tabs and other whitespace must become spaces *before* the
    # character filter below; otherwise the filter deletes them outright and
    # words on adjacent lines are fused together ("hello\nworld" -> "helloworld").
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Removing punctuation can leave doubled or edge spaces ("a - b" -> "a  b");
    # split/join collapses and trims them in one pass.
    return ' '.join(text.split())

# JD

In [37]:
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.runnables import RunnableLambda
# from utils.clean import clean_text
from templates.extractPattrens import jobDiscritionFormat


class JobDescriptionExtractor:
    """Scrape a job-posting page and have an LLM return it as structured JSON."""

    def __init__(self, llm):
        """Store the chat model and create the JSON parser used on its replies.

        Args:
            llm: A LangChain-compatible chat model; it is piped after the
                prompt template and its reply's ``.content`` is parsed.
        """
        self.llm = llm
        self.jsonParser = JsonOutputParser()

    def scrapeWebsite(self, url):
        """Load ``url`` and return the cleaned text of the first document.

        Args:
            url: Address of the job-posting page.

        Returns:
            The page text after ``clean_text`` has been applied.

        Raises:
            IndexError: If the loader returns no documents.
        """
        loader = WebBaseLoader(web_path=url)
        documents = loader.load()
        # Only the first loaded document is used.
        return clean_text(documents[0].page_content)

    def promptTemplate(self):
        """Build the extraction prompt.

        The template expects two variables: ``page_data`` (the scraped text)
        and ``jobDiscritionFormat`` (the target output pattern).
        """
        return PromptTemplate.from_template(
            """
            ### SCRAPED TEXT FROM WEBSITE:
            {page_data}
            ### INSTRUCTION:
            The scraped text is from the career's page of a website.
            Your job is to extract the job postings and return them in following pattern {jobDiscritionFormat}
            Only return the valid JSON.
            ### VALID JSON (NO PREAMBLE):
            """
        )

    def extractJD(self, url: str):
        """Scrape ``url`` and return the job description parsed from the LLM reply.

        Args:
            url: Address of the job-posting page.

        Returns:
            The result of parsing the model's reply as JSON.
        """
        cleaned_response = self.scrapeWebsite(url)
        chain = self.promptTemplate() | self.llm
        jd_response = chain.invoke({
            "page_data": cleaned_response,
            "jobDiscritionFormat": jobDiscritionFormat,
        })
        # Reuse the parser created in __init__ rather than building a second,
        # unused one on every call.
        return self.jsonParser.parse(jd_response.content)


# -----------------------> testing <-------------------------

import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
# Manual smoke test for JobDescriptionExtractor: performs a live page fetch and
# a live LLM call, so it needs network access and a valid Groq API key.
load_dotenv()
# The key is read from the "GROQ" environment variable; os.getenv returns None
# when that variable is not set.
GROQ_API = os.getenv("GROQ")
# NOTE: `llm` is reused further down in this file (UserInformationExtractor(llm)).
llm = ChatGroq(temperature=0, groq_api_key=GROQ_API, model="llama-3.3-70b-versatile")
jd = JobDescriptionExtractor(llm)
jdRes = jd.extractJD("https://careers.nike.com/data-engineer/job/R-61137")
# Bare expression — presumably left so the notebook cell displays the result.
jdRes

{'role': 'Data Engineer',
 'experience': '2 years of experience in the job offered or in a data-related occupation',
 'skills': ['System development life cycle',
  'Cloud platforms such as AWS and Databricks',
  'SQL',
  'Version control and CI/CD pipelines',
  'Extracting, transforming, and loading and data pipelines',
  'Scripting and automation',
  'Big data technologies such as Hadoop and Spark',
  'Data Warehouse concepts and methodologies',
  'Relational and non-relational database design',
  'Programming languages such as Python, Java, and Scala'],
 'description': 'Design and build simple, reusable components of larger process or framework to support analytics products with mentorship from experienced peers, design and implement product features in collaboration with Business and Technology partners, clean, prepare, and optimize data at scale for ingestion and consumption, support the implementation of new data management projects and restructure of the current data architecture

# CreateChroma

In [54]:
import chromadb
import uuid
from itertools import chain


class ProjectVectorStore:
    """Per-user Chroma collection of resume projects, indexed by tech stack.

    Every technology of every project is stored as its own document, carrying
    the project's title, description and GitHub link as metadata. The
    collection is then queried with the skills of a job description to find
    the projects that match.
    """

    def __init__(self, name, projects, jdSkills, numProjects=3, storage_path="./chroma_storage"):
        """Open (or create) the persistent collection for ``name``.

        Args:
            name: Candidate name; lower-cased with spaces replaced by
                underscores to form the collection name ``projects_<name>``.
            projects: Iterable of dicts with the keys ``ProjectTitle``,
                ``description``, ``GitHub_Link`` and ``TechstackUsed``.
            jdSkills: Skill strings used as the query texts.
            numProjects: Maximum number of distinct projects to return.
            storage_path: Directory of the persistent Chroma client.
        """
        self.projects = projects
        self.jdSkills = jdSkills
        self.numProjects = numProjects
        self.user_name = self._format_name(name)
        self.client = chromadb.PersistentClient(path=storage_path)
        self.collection = self.client.get_or_create_collection(name=f"projects_{self.user_name}")

    def _format_name(self, name: str) -> str:
        """Return ``name`` lower-cased with spaces replaced by underscores."""
        return name.lower().replace(" ", "_")

    def addDocuments(self):
        """Store one document per (project, technology) pair.

        Ids are derived deterministically from the project title and the
        technology, and documents are upserted. The collection is persistent
        and this method runs on every ``retriveProjects`` call, so random ids
        with a plain add would duplicate every project on each run.
        """
        for project in self.projects:
            tech_stack = project.get("TechstackUsed") or []
            if isinstance(tech_stack, str):
                # A bare string would otherwise be split into single characters.
                tech_stack = [tech_stack]
            # Drop repeated technologies (order kept) so ids within one call
            # stay unique.
            technologies = list(dict.fromkeys(tech_stack))
            if not technologies:
                # Nothing to index for this project; skip instead of sending
                # empty lists to the collection.
                continue

            title = project["ProjectTitle"]
            metadata = {
                "ProjectTitle": title,
                "description": project["description"],
                "GitHub_Link": project["GitHub_Link"],
            }
            self.collection.upsert(
                ids=[
                    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{title}::{tech}"))
                    for tech in technologies
                ],
                documents=technologies,
                metadatas=[dict(metadata) for _ in technologies],
            )

    def _query(self):
        """Query the collection with the job-description skills."""
        return self.collection.query(query_texts=self.jdSkills)

    def retriveProjects(self):
        """Index the projects, query by skill and return distinct matches.

        Returns:
            At most ``numProjects`` dicts with the keys ``ProjectTitle``,
            ``description`` and ``GitHub_Link``, de-duplicated by title, in
            the order the query results list them (all hits for the first
            skill, then the second, and so on).
        """
        self.addDocuments()
        projectResponse = self._query()
        matches = chain.from_iterable(projectResponse["metadatas"] or [])

        seenProjectTitles = set()
        uniqueProjects = []
        for project in matches:
            # Checked before appending so a limit of 0 yields an empty list.
            if len(uniqueProjects) >= self.numProjects:
                break
            if not project:
                continue
            title = project.get("ProjectTitle", "").strip()
            if not title or title in seenProjectTitles:
                continue
            seenProjectTitles.add(title)
            uniqueProjects.append({
                "ProjectTitle": title,
                "description": project.get("description", "").strip(),
                "GitHub_Link": project.get("GitHub_Link", "").strip(),
            })

        return uniqueProjects

# UserInformation

In [29]:
from langchain_community.document_loaders import PyPDFLoader
from templates.extractPattrens import resumeExtractFormat
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers.json import JsonOutputParser

class UserInformationExtractor:
    """Extract structured candidate information from a resume with an LLM."""

    def __init__(self, llm):
        """Store the chat model and build the extraction prompt and parser.

        Args:
            llm: A LangChain-compatible chat model; it is piped after the
                prompt template and its reply's ``.content`` is parsed as JSON.
        """
        self.llm = llm
        self.prompt_template = PromptTemplate.from_template(
            """
            The Resume information is {page_content}
            You are a smart assistant, and I have provided you a text.
            Your task is to return me details in format of {resumeExtractFormat} and for discription include the summary of discription of that project with numerical results avilable.
            Return strictly in JSON format, and if anything is unknown placeholder is "None".
            If there are many projects, append them in a list of JSON. 
            NO PREAMBLE
            """
        )
        self.parser = JsonOutputParser()

    def extract_from_pdf(self, path: str) -> dict:
        """Extract candidate information from the PDF at ``path``.

        The text of every page is used, so resumes longer than one page are
        not truncated.

        Args:
            path: Location of the resume PDF.

        Returns:
            The parsed JSON reply of the model.

        Raises:
            ValueError: If no text could be extracted from the PDF.
        """
        pages = PyPDFLoader(path).load()
        page_content = "\n".join(page.page_content for page in pages)
        if not page_content.strip():
            raise ValueError(f"No text could be extracted from PDF: {path}")
        return self._extract(page_content)

    def extract_from_text(self, text: str) -> dict:
        """Extract candidate information from resume text supplied directly."""
        return self._extract(text)

    def _extract(self, page_content: str) -> dict:
        """Run the prompt through the model and parse its reply as JSON."""
        chain = self.prompt_template | self.llm
        response = chain.invoke({
            "page_content": page_content,
            "resumeExtractFormat": resumeExtractFormat,
        })
        return self.parser.parse(response.content)


# Manual run of the resume extractor. Relies on the module-level `llm` created
# earlier in this file and on a resume PDF at a machine-specific local path.
candidateInformationExtractrorPy = UserInformationExtractor(llm)
candidateInformation = candidateInformationExtractrorPy.extract_from_pdf("E:/Resumes/Specalized Resume.pdf") 
# Bare expression — presumably left so the notebook cell displays the result.
candidateInformation

{'BasicInfo': {'name': 'Gandluru Mohammed Yaseen',
  'linkedinLink': 'None',
  'github': 'https://github.com/yaseeng-md',
  'mail': 'gandlurumohammedyaseen@gmail.com',
  'mobile': '8328377285'},
 'Skills': {'ProgrammingLanguages': ['Python'],
  'FrameWorks': ['PyTorch', 'TensorFlow'],
  'Tools/Platform': ['Serper API',
   'BeautifulSoup',
   'Hugging Face models',
   'Linux'],
  'SoftSkills': ['None']},
 'Experience': [{'Designation': 'Intern',
   'Role': 'Data Valley',
   'Work': 'Implemented a Recommendation System for Movie Recommendations, Proposed solutions for problems using Machine Learning and Artificial Intelligence',
   'TechStack': ['Machine Learning', 'Artificial Intelligence'],
   'Duration': 'January 2024 – April 2024'}],
 'Projects': [{'ProjectTitle': 'Fine Tuning Llama Model for Website Article Summarization',
   'GitHub_Link': 'None',
   'description': 'Fine-tuning a LLaMA 2 7B model for automatic summarization of website articles, Utilizing the Serper API to retrieve 

In [62]:
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from utils.createChroma import ProjectVectorStore

import streamlit as st
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

class Pipeline:
    """End-to-end flow: resume + job posting -> matching projects -> cold email."""

    def __init__(self, llm):
        """Keep the chat model shared by every step of the pipeline."""
        self.llm = llm

    # -------------------> Extraction Methods <-------------------

    def extract_candidate_information(self, isPDF: bool, page_content: str = None, path: str = None):
        """Extract structured candidate information from a resume.

        Args:
            isPDF: When true the resume is read from ``path``; otherwise
                ``page_content`` is used as the resume text.
            page_content: Resume text; required when ``isPDF`` is false.
            path: Resume PDF location; required when ``isPDF`` is true.

        Returns:
            Whatever the extractor parsed from the model's reply.

        Raises:
            ValueError: If the input required by the chosen mode is missing
                or empty.
        """
        # Validate first so bad input fails fast, before an extractor (prompt
        # template + parser) is built for nothing.
        if isPDF:
            if not path:
                raise ValueError("PDF path is missing.")
        elif not page_content:
            raise ValueError("Resume text is missing.")

        extractor = UserInformationExtractor(self.llm)
        if isPDF:
            return extractor.extract_from_pdf(path)
        return extractor.extract_from_text(page_content)

    def extract_job_description(self, url: str):
        """Scrape the job posting at ``url`` and return it as parsed JSON."""
        jd = JobDescriptionExtractor(self.llm)
        return jd.extractJD(url)

    # -------------------> Chroma Vector DB <-------------------

    def createOrGetVDB(self, name, projects, jdSkills, numProjects=3):
        """Index ``projects`` for ``name`` and return those matching ``jdSkills``.

        Args:
            name: Candidate name, used to select the per-user collection.
            projects: Project dicts taken from the candidate information.
            jdSkills: Skills taken from the job description.
            numProjects: Maximum number of distinct projects to return.
        """
        chroma = ProjectVectorStore(name, projects, jdSkills, numProjects)
        return chroma.retriveProjects()

    # -------------------> Email Generation <-------------------

    def generateEmailPrompt(self):
        """Build the cold-email prompt.

        The template expects the variables ``jdJson``, ``name``,
        ``experinece``, ``BasicInfo``, ``projects`` and ``skills``.
        """
        prompt = """
        ## Job Discription: {jdJson}
        ## Candidate Information
        Your name is {name} and your job is to write an cold email asking for Job asking. You have a previous experince as {experinece}.
        These are basic details {BasicInfo}, your projects are {projects}, and skills is {skills}.
        Express your projects at same place, and include github links if any.
        Include your qualifications, so that you can get a job. Maintian formal speech tone across mail. And add links to contact like LinkedinLink, GitHub and personal website if any. 
        ## NO PREAMBLE
        """

        return PromptTemplate.from_template(prompt)

    def generateEmail(self, jdJson, jsonCandidate, projects):
        """Generate the email text for a candidate and a job description.

        Args:
            jdJson: Parsed job description.
            jsonCandidate: Parsed candidate information; must contain the keys
                ``BasicInfo`` (with ``name``), ``Experience`` and ``Skills``.
            projects: Projects to highlight in the email.

        Returns:
            The ``content`` of the model's reply.
        """
        basic_info = jsonCandidate["BasicInfo"]
        mailLLM = self.generateEmailPrompt() | self.llm
        mailRes = mailLLM.invoke(input={
            "jdJson": jdJson,
            "name": basic_info["name"],
            # The misspelled key matches the placeholder used in the template.
            "experinece": jsonCandidate["Experience"],
            "BasicInfo": basic_info,
            "projects": projects,
            "skills": jsonCandidate["Skills"],
        })
        return mailRes.content


# load_dotenv()
# GROQ_API = os.getenv("GROQ")
# llm = ChatGroq(temperature=0, groq_api_key=GROQ_API, model="llama-3.3-70b-versatile")
# pipeline = Pipeline(llm)
# candidateInformation = pipeline.extract_candidate_information(isPDF=True,path = r"E:/Resumes/Specalized Resume.pdf")
# jdJson = pipeline.extract_job_description("https://careers.nike.com/data-engineer/job/R-61137")
# uniqueProjects = pipeline.createOrGetVDB(candidateInformation["BasicInfo"]["name"], candidateInformation["Projects"],jdJson["skills"])
# mailRes = pipeline.generateEmail(jdJson,candidateInformation,uniqueProjects)