# Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import chromadb
from typing import List, Dict

from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
from urllib.parse import urlparse, urljoin

def normalize_url(base_url, url):
    """Resolve *url* against *base_url* and return it in canonical form.

    The fragment is dropped, the network location is lower-cased and
    trailing slashes are stripped. Path and query keep their original case:
    unlike the host they are case-sensitive, so lower-casing them can turn a
    working link into a different (or missing) resource.

    Args:
        base_url (str): URL used to resolve relative references.
        url (str): Absolute or relative URL to normalize.

    Returns:
        str: The canonical absolute URL.
    """
    parsed_url = urlparse(urljoin(base_url, url))  # Resolve relative URLs
    # urlparse already lower-cases the scheme; only the host still needs it.
    canonical = parsed_url._replace(
        netloc=parsed_url.netloc.lower(),
        fragment='',  # e.g. "#section1" never changes the fetched document
    ).geturl()
    return canonical.rstrip('/')

def is_valid_url(base_url, url):
    """Check whether *url* is a crawlable page on the same host as *base_url*.

    A URL qualifies when, after normalization, it uses http/https, its
    network location equals the base URL's (compared case-insensitively) and
    its path does not end in a known non-HTML file extension.

    Args:
        base_url (str): Root URL of the site being crawled.
        url (str): Absolute or relative URL to check.

    Returns:
        bool: True if the URL should be crawled, False otherwise (including
        when the URL cannot be parsed).
    """
    # Non-HTML file types that should not be fetched as pages
    excluded_extensions = ('.pdf', '.jpg', '.png', '.gif', '.zip', '.exe', '.docx', '.mp4')

    try:
        parsed_base = urlparse(base_url)
        parsed_url = urlparse(normalize_url(base_url, url))
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string input: treat as "not crawlable"
        return False

    if parsed_url.scheme not in ('http', 'https'):
        return False

    # Host names are case-insensitive; lower-case both sides so a base URL
    # written as "https://Example.com" still matches its own links.
    if parsed_base.netloc.lower() != parsed_url.netloc.lower():
        return False

    # str.endswith accepts a tuple, so one call covers every extension
    return not parsed_url.path.lower().endswith(excluded_extensions)

from hashlib import md5

def get_page_hash(content):
    """
    Fingerprint page text so identical pages can be recognised.

    Args:
        content (str): The text content of the page.

    Returns:
        str: Hexadecimal MD5 digest of the UTF-8 encoded text.
    """
    digest = md5()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()

def crawl_website(base_url, max_pages=100):
    """Crawl a website depth-first and collect the text of each distinct page.

    Starting at *base_url*, pages are fetched, their visible text extracted,
    and same-site links followed until *max_pages* URLs have been visited.
    Pages whose text is identical to an already collected page are skipped.
    Fetch/parse failures are printed and the crawl continues.

    Args:
        base_url (str): Root URL; also defines which host counts as
            "same site" for link filtering.
        max_pages (int): Maximum number of URLs to visit.

    Returns:
        list[dict]: One ``{'url': ..., 'content': ...}`` entry per distinct
        page, in visit order.
    """
    visited_urls = set()
    # Content hashes live in their own set: mixing them into visited_urls
    # made every page count twice against max_pages.
    seen_hashes = set()
    company_content = []

    # Explicit stack instead of recursion so deep link chains cannot hit the
    # interpreter's recursion limit; popping from the end keeps the crawl
    # depth-first in document link order.
    pending = [base_url]

    while pending and len(visited_urls) < max_pages:
        url = pending.pop()
        try:
            url = normalize_url(base_url, url)

            # Avoid revisiting normalized URLs
            if url in visited_urls:
                continue
            visited_urls.add(url)

            # Fetch page content; HTTP error pages are reported, not indexed
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text content, skipping pages with duplicate text
            page_text = soup.get_text(separator=' ', strip=True)
            content_hash = get_page_hash(page_text)
            if content_hash not in seen_hashes:
                seen_hashes.add(content_hash)
                company_content.append({
                    'url': url,
                    'content': page_text
                })

            # Collect subpages. Relative hrefs are resolved against the URL
            # the response actually came from (response.url), not the site
            # root, so "post1" on /blog/ becomes /blog/post1.
            links = []
            for link in soup.find_all('a', href=True):
                try:
                    full_url = normalize_url(response.url, link['href'])
                except ValueError:
                    continue  # malformed href; skip just this link
                if full_url not in visited_urls and is_valid_url(base_url, full_url):
                    links.append(full_url)

            # Reverse so the first link on the page is popped (visited) first
            pending.extend(reversed(links))

        except Exception as e:
            # Per-page boundary: one bad page must not abort the whole crawl
            print(f"Error crawling {url}: {e}")

    return company_content

In [3]:
def clean_text(text: str) -> str:
    """
    Clean text by removing special characters and formatting artifacts.

    Non-whitespace ASCII control characters are deleted, every run of
    non-ASCII characters (non-breaking spaces, symbols such as the copyright
    sign, mis-decoded bytes, ...) is replaced by a space, and all whitespace
    runs are collapsed to single spaces with leading/trailing whitespace
    removed.

    Args:
        text: Raw text, e.g. as extracted from an HTML page.

    Returns:
        The cleaned, single-line ASCII text.
    """
    import re

    # Drop control characters first. Doing this after whitespace collapsing
    # (as before) could leave double or leading spaces behind. The ranges
    # skip \t \n \v \f \r and \x1c-\x1f, which count as whitespace below.
    cleaned_text = re.sub(r'[\x00-\x08\x0e-\x1b]', '', text)

    # Replace non-ASCII runs with a space. This already covers '\xa0' and
    # mojibake such as 'Â©', so no separate replacements are needed.
    cleaned_text = re.sub(r'[^\x00-\x7F]+', ' ', cleaned_text)

    # Collapse whitespace runs and trim both ends in one step
    return ' '.join(cleaned_text.split())

# Create Embeddings and store in Vector Database

In [4]:
def _index_documents(documents:List[str], client, collection):
    """
    Convert documents to embeddings and store them in the vector database.

    All documents are embedded in one batched ``encode`` call and written
    with a single ``collection.add`` call, so embeddings, documents and ids
    are parallel lists of equal length.

    Args:
        documents: Texts to embed and store.
        client: Vector database client. Not used by this function; kept so
            existing callers keep working.
        collection: Collection object exposing ``add(embeddings=...,
            documents=..., ids=...)``.

    Returns:
        None. Nothing is loaded or written when *documents* is empty.
    """
    if not documents:
        return

    texts = list(documents)
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # One batched call yields one vector per text; tolist() turns the
    # result into plain nested lists for storage.
    embeddings = embedding_model.encode(texts).tolist()

    # NOTE(review): ids are positional ("doc_0", "doc_1", ...), so indexing
    # a second batch into the same collection reuses ids - confirm whether
    # callers need unique ids across runs.
    collection.add(
        embeddings=embeddings,
        documents=texts,
        ids=[f"doc_{i}" for i, _ in enumerate(texts)],
    )

# Ingest Resume

In [5]:
import pdfplumber

# Open the PDF file
def read_resume(path):
    """Extract the text of every page of a PDF resume.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        str: The text of all pages in order, each page followed by a
        newline. Pages that yield no text contribute only the newline.
    """
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. a scanned image); fall back to "" so concatenation is safe.
        page_texts = [(page.extract_text() or "") for page in pdf.pages]

    return "".join(page_text + "\n" for page_text in page_texts)

In [6]:
import re

def extract_bullet_points(resume_text):
    """Collect the bullet points found in resume text.

    A bullet starts on a line beginning with "-", "*", "•", "●" or a number
    followed by "." and whitespace. Non-empty lines that follow are treated
    as wrapped continuations and joined to the bullet with single spaces. A
    blank line ends the current bullet.

    Args:
        resume_text (str): Resume text with lines separated by "\\n".

    Returns:
        list[str]: Each complete bullet point (marker included) as one string.
    """
    bullet_start = re.compile(r'^[\-\*•●]\s|\d+\.\s')

    collected = []
    pending = None  # fragments of the bullet currently being assembled

    for raw_line in resume_text.split('\n'):
        stripped = raw_line.strip()

        # A new marker closes whatever bullet was open and opens the next
        if bullet_start.match(stripped):
            if pending:
                collected.append(' '.join(pending))
            pending = [stripped]
            continue

        # Blank line: the open bullet (if any) is finished
        if not stripped:
            if pending:
                collected.append(' '.join(pending))
                pending = None
            continue

        # Plain text only matters while a bullet is open
        if pending is not None:
            pending.append(stripped)

    # Flush a bullet that runs to the end of the text
    if pending:
        collected.append(' '.join(pending))

    return collected

In [8]:
import chromadb
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from typing import List

class StrictContextGenerator:
    """Generate LLM answers that are grounded in retrieved documents.

    Documents are retrieved from a vector collection by embedding similarity
    and handed to an OpenAI chat model together with instructions to use only
    those documents.
    """

    def __init__(self, openai_api_key: str, vector_db_collection):
        """
        Initialize context generator with strict document retrieval
        
        Args:
            openai_api_key (str): OpenAI API key
            vector_db_collection: ChromaDB collection of embedded documents
        """
        self.client_ai = OpenAI(api_key=openai_api_key)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.collection = vector_db_collection

    def _query_documents(self, text: str, top_k: int) -> List[str]:
        """Return up to *top_k* stored documents most similar to *text*."""
        embedding = self.embedding_model.encode(text).tolist()
        results = self.collection.query(
            query_embeddings=[embedding],  # one query -> one result list
            n_results=top_k
        )
        documents = results['documents']
        return list(documents[0]) if documents else []

    def retrieve_relevant_context(self, query: str, top_k: int = 3) -> List[str]:
        """
        Retrieve most relevant documents based on both explicit and implicit connections
        
        Two searches are run: one with the query as given and one with the
        query rephrased towards related skills/methods. Their ranked results
        are interleaved (explicit first at each rank), de-duplicated and cut
        to *top_k*, so both perspectives are represented in the result.
        
        Args:
            query (str): Query to find relevant context
            top_k (int): Number of top documents to retrieve
        
        Returns:
            List of most relevant document contexts combining explicit and implicit matches
        """
        if top_k <= 0:
            return []

        # Generate two query perspectives
        explicit_query = query  # Original query for exact/direct matches
        implicit_query = f"Skills, methods, or approaches related to: {query}"  # For conceptual connections

        explicit_docs = self._query_documents(explicit_query, top_k)
        implicit_docs = self._query_documents(implicit_query, top_k)

        # Interleave by rank. Simply appending the implicit results after the
        # explicit ones and truncating to top_k would drop every implicit
        # match whenever the explicit search fills all top_k slots.
        merged = []
        for rank in range(max(len(explicit_docs), len(implicit_docs))):
            for ranked_docs in (explicit_docs, implicit_docs):
                if rank < len(ranked_docs):
                    merged.append(ranked_docs[rank])

        # Remove duplicates while preserving order
        unique_docs = list(dict.fromkeys(merged))

        # Return combined results up to requested number
        return unique_docs[:top_k]

    def generate_grounded_context(self, 
                                  query: str, 
                                  retrieved_docs: List[str], 
                                  instruction: str) -> str:
        """
        Generate context using only retrieved documents
        
        Args:
            query (str): Original query or context request
            retrieved_docs (List[str]): Relevant retrieved documents
            instruction (str): Specific generation instruction
        
        Returns:
            Context generated strictly from retrieved documents. A fixed
            cautionary message is returned when *retrieved_docs* is empty,
            and an error description string (rather than an exception) when
            the chat completion call fails.
        """

        # If no documents retrieved, return a cautionary message
        if not retrieved_docs:
            return "Insufficient contextual information to generate a detailed response."

        # Number the documents so the model can cite them
        documents_block = "\n".join(
            f"Document {position}: {doc}"
            for position, doc in enumerate(retrieved_docs, start=1)
        )

        # Prepare prompt with strict grounding
        prompt = f"""
        IMPORTANT INSTRUCTIONS:
        - Use ONLY the information from the provided documents
        - Do NOT add any information not present in the documents
        - If information is missing, state that clearly
        
        Context Documents:
        {documents_block}
        
        {instruction}
        
        Query: {query}
        
        Response Guidelines:
        1. Directly quote or paraphrase from the documents
        2. Clearly indicate the source of each piece of information
        3. If no direct information is available, state "No specific information found"
        """

        try:
            response = self.client_ai.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a precise document analyst. Only use information from the provided documents."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=500  # Limit response length
            )

            return response.choices[0].message.content
        except Exception as e:
            # Best-effort boundary: callers get a message instead of a crash
            print(f"Error generating grounded context: {e}")
            return f"Error generating context: Unable to process documents. {str(e)}"

    def generate_job_seeker_context(self, 
                                    resume: str, 
                                    job_description: str, 
                                    company_context: str) -> str:
        """
        Generate job seeker context using both provided company context and retrieved documents
        
        Args:
            resume (str): Candidate's resume
            job_description (str): Job description
            company_context (str): Additional company context
        
        Returns:
            Contextualized narrative for job seeker
        """
        # Retrieve relevant company context
        query = f"Company information relevant to {job_description} and {company_context}"
        retrieved_docs = self.retrieve_relevant_context(query)

        # Add the provided company context to retrieved documents
        if company_context:
            retrieved_docs.insert(0, company_context)

        # Generate context with strict document grounding. The resume text is
        # embedded in the instruction: the model is asked to analyze it, so
        # it has to be part of the prompt.
        context_instruction = f"""
        Analyze how the candidate's resume matches the job description 
        using the provided company documents.
        
        For each skill or experience, identify:
        - Direct references in company documents
        - Potential alignment with company needs
        - Limitations in context (if any)
        
        Candidate Resume:
        {resume}
        """

        return self.generate_grounded_context(
            query=job_description,
            retrieved_docs=retrieved_docs,
            instruction=context_instruction
        )

    def generate_recruiter_context(self, 
                                   resume_line: str, 
                                   job_description: str) -> str:
        """
        Generate recruiter context for one resume line using retrieved company documents
        
        Args:
            resume_line (str): Specific line from resume
            job_description (str): Job description. Accepted for interface
                compatibility; it is not currently used for retrieval or in
                the prompt.
        
        Returns:
            Detailed contextual breakdown
        """
        # Retrieve relevant company context
        query = f"Company information related to {resume_line}"
        retrieved_docs = self.retrieve_relevant_context(query)

        # Generate context with strict document grounding
        context_instruction = """
        Analyze this resume achievement in the context of the job and company:
        
        For each aspect of the resume line, identify:
        - Direct supporting evidence from company documents
        - Potential relevance to company needs
        - Any gaps in contextual information
        - Explicitly quote or reference source documents
        """

        return self.generate_grounded_context(
            query=resume_line,
            retrieved_docs=retrieved_docs,
            instruction=context_instruction
        )

In [9]:
# Vector store shared by the indexing and retrieval steps below.
client = chromadb.Client()

# Reuse the "company_docs" collection when it already exists, otherwise
# create it. get_or_create_collection does both in one call and does not
# depend on what list_collections() returns (objects vs. plain names).
collection = client.get_or_create_collection("company_docs")

In [10]:
# Full text of the job posting; main() passes it to
# StrictContextGenerator.generate_recruiter_context().
JOB_DESCRIPTION = """Exowatt is a next generation renewable energy company offering commercial and industrial customers a modular full-stack energy solution that can provide dispatchable power and heat for up to 24 hours per day, specifically designed to meet the needs of energy-intensive applications such as data centers. Our mission is clear, to make sustainable renewable energy always available and almost free. Exowatt is based in Miami, Florida and backed by notable investors such as a16z, Atomic, and Sam Altman.

We are looking for a driven, self-starter fullstack engineer to help us build out customer facing interfaces and build out the backend firmware. You will work closely with our small team of Mechanical, Thermal, and Optics engineers and have significant inputs to the product success.

What You'll Do:
In your first six months on the job you’ll have:
- Understood the company’s vision, goals, business model, and the challenges that our customers face.
- Worked closely with our pilot users to solve their problems through elegant technical solutions.
- Defined the medium-term technical strategy with a keen focus on incorporating business needs, user feedback, and the inherent flexibility required in an early stage-startup.
- Led the release of the minimum viable product by working throughout the stack - backend, frontend, and infrastructure, with an emphasis on firmware and front end development.
- Developed a data engineering strategy to synthesize data from disparate sources into an analysis ready state.
- Made key improvements to our underlying infrastructure, and laid out the medium term architecture roadmap.
- Helped recruit additional technical team members to form a world-class, collaborative team.
- Established technical culture by setting best practices and norms.
- Led the release of subsequent versions of the product, with a focus on third-party data integrations, to unblock new business and enable growth.

What You Have:
- 3-5+ yrs of total full time software development experience.
- 1-2+ yrs of experience with firmware development.
- Experience with scripting languages (ex. Python) and automation tools.
- Experience with integration and tooling software.
- Familiarity with cloud-based computing platforms (ex. AWS, Google Cloud).
- Experience with multiple database management systems.
- Strong problem-solving, debugging, and analytical skills with great attention to detail.
- 0->1 startup experience, ideally with a Pre-seed / Seed / Series A company
$140,000 - $180,000 a year
We are focused on building a diverse and inclusive workforce. If you’re excited about this role, but do not meet 100% of the qualifications listed above, we encourage you to apply.
-----
Exowatt is an Equal Opportunity Employer and considers applicants for employment without regard to race, color, religion, sex, orientation, national origin, age, disability, genetics or any other basis forbidden under federal, state, or local law. Exowatt considers all qualified applicants in accordance with the San Francisco Fair Chance Ordinance.

Please review our CCPA policies here.

"""

In [11]:
import os
def main(url="https://www.exowatt.com/",
         resume_path="/Users/Tapan/Documents/Dangarwala_Tapan_Resume_cs.pdf"):
    """Run the full pipeline: crawl, index, read the resume, print contexts.

    The company site is crawled and its cleaned page texts are indexed into
    the module-level ``collection``. The resume PDF is then split into bullet
    points and, for each one, the recruiter context generated from the
    indexed documents is printed.

    Args:
        url (str): Root URL of the company website to crawl.
        resume_path (str): Path to the resume PDF.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is not
            set.
    """
    # SECURITY: the API key must come from the environment, never from the
    # source. The key that used to be hard-coded here has been exposed and
    # should be revoked.
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before running main()."
        )

    # Crawl website and keep only the cleaned page text
    pages = crawl_website(url)
    content = [clean_text(page['content']) for page in pages]

    # Index documents
    _index_documents(content, client, collection)

    # Process resume
    all_resume_text = read_resume(resume_path)
    bullet_points = extract_bullet_points(all_resume_text)

    # Generate contexts
    context_generator = StrictContextGenerator(
        openai_api_key=api_key,
        vector_db_collection=collection)

    # Recruiter context for every bullet point (no-op when there are none)
    for b in bullet_points:
        recruiter_context = context_generator.generate_recruiter_context(
            b,
            JOB_DESCRIPTION
        )
        print(b)
        print("Recruiter Context:", recruiter_context)


In [12]:
# Run the pipeline defined in main(); its results are printed to the cell output.
main()

docs found
["Terms - Exowatt Home About Main pages Sales Home V1 Home V2 Home V3 About Services Service single Products Product single Contact V1 Contact V2 Contact V3 Blog V1 Blog V2 Blog V3 Blog post Coming soon More Webflow Template Utility pages Style guide Start here 404 not found Password protected Licenses Changelog Contact Us Your Cart $ 0.00 USD : Remove Subtotal Pay with browser. Continue to Checkout No items found. Go to shop Product is not available in this quantity. Reserve Now Privacy Policy and Terms of Use Privacy Policy Privacy Policy Privacy Policy of Exowatt, Inc. Last updated: 4/20/2024 . Introduction Welcome to Exowatt, Inc. We respect your privacy and are committed to protecting your personal data. This Privacy Policy explains how we collect, use, and share information from or about you when you visit our website. 2. Information We CollectPersonal Data: When you fill out our contact form, we collect the personal information you provide, such as your name, email ad