In [1]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup
import json

# Sample URLs and API keys for different job platforms
API_KEYS = {
    'example_platform': 'YOUR_API_KEY_HERE'
}

JOB_SOURCES = {
    "naukri_API": "https://www.naukri.com/",
    "Cutshort_Scraping": "https://cutshort.io/profile/recommended-jobs"
}


def collect_job_data_from_api(url, headers=None, params=None):
    """Fetch job data using an API."""
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        print("Data collected from API successfully!")
        return data  # Return JSON data from API response
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None


def collect_job_data_from_web(url):
    """Scrape job data from a web page."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Example scraping logic: Adapt selectors based on the website's structure
        jobs = []
        for job_card in soup.select('.result'):
            title = job_card.select_one('.title').text.strip()
            company = job_card.select_one('.company').text.strip()
            location = job_card.select_one('.location').text.strip()
            description = job_card.select_one('.summary').text.strip()
            
            jobs.append({
                'title': title,
                'company': company,
                'location': location,
                'description': description
            })
        print("Data collected from web scraping successfully!")
        return jobs
    except requests.exceptions.RequestException as e:
        print(f"Web scraping request failed: {e}")
        return None


# Example for LinkedIn API (replace headers and parameters with actual API requirements)
def fetch_jobs_from_naukri():
    url = JOB_SOURCES["naukri_API"]
    headers = {
        "Authorization": f"Bearer {API_KEYS['example_platform']}"
    }
    params = {
        "keywords": "software developer",
        "location": "Remote"
    }
    return collect_job_data_from_api(url, headers=headers, params=params)


# Example for Indeed scraping
def fetch_jobs_from_cutshort():
    url = JOB_SOURCES["Cutshort_Scraping"]
    return collect_job_data_from_web(url)


# Collecting data from both sources
def main():
    # Fetch data from LinkedIn API
    naukri_jobs = fetch_jobs_from_naukri()
    if naukri_jobs:
        print(json.dumps(naukri_jobs, indent=2))

    # Fetch data from Indeed using scraping
    cutshort_jobs = fetch_jobs_from_cutshort()
    if cutshort_jobs:
        print(json.dumps(cutshort_jobs, indent=2))


if __name__ == "__main__":
    main()


API request failed: Expecting value: line 1 column 1 (char 0)
Data collected from web scraping successfully!


In [7]:

pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install spacy

Collecting spacy
  Downloading spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp312-cp312-win_amd64.

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.0.2 which is incompatible.
langchain 0.3.4 requires numpy<2.0.0,>=1.26.0; python_version >= "3.12", but you have numpy 2.0.2 which is incompatible.
langchain-community 0.3.3 requires numpy<2.0.0,>=1.26.0; python_version >= "3.12", but you have numpy 2.0.2 which is incompatible.
numba 0.59.1 requires numpy<1.27,>=1.22, but you have numpy 2.0.2 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 2.0.2 which is incompatible.
streamlit 1.32.0 requires numpy<2,>=1.19.3, but you have numpy 2.0.2 which is incompatible.


In [13]:
pip install PyMuPDF python-docx spacy


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-win_amd64.whl (16.2 MB)
   ---------------------------------------- 0.0/16.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.2 MB 330.3 kB/s eta 0:00:50
   ---------------------------------------- 0.2/16.2 MB 1.4 MB/s eta 0:00:12
   -- ------------------------------------- 0.8/16.2 MB 6.0 MB/s eta 0:00:03
   --- ------------------------------------ 1.4/16.2 MB 6.7 MB/s eta 0:00:03
   ---- ----------------------------------- 2.0/16.2 MB 8.1 MB/s eta 0:00:02
   ------ --------------------------------- 2.8/16.2 MB 9.5 MB/s eta 0:00:02
   ---------- ----------------------------- 4.4/16.2 MB 12.7 MB/s eta 0:00:01
   ------------- -------------------------- 5.5/16.2 MB 14.2 MB/s eta 0:00:

In [16]:
pip install PyMuPDF python-docx spacy


Note: you may need to restart the kernel to use updated packages.


In [22]:
import requests
from bs4 import BeautifulSoup
import json

# Sample URLs and API keys for different job platforms
API_KEYS = {
    'example_platform': 'YOUR_API_KEY_HERE'
}

JOB_SOURCES = {
    "naukri_API": "https://www.naukri.com/",
    "Cutshort_Scraping": "https://cutshort.io/profile/recommended-jobs"
}


def collect_job_data_from_api(url, headers=None, params=None):
    """Fetch job data using an API."""
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        print("Data collected from API successfully!")
        return data  # Return JSON data from API response
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None


def collect_job_data_from_web(url):
    """Scrape job data from a web page."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Example scraping logic: Adapt selectors based on the website's structure
        jobs = []
        for job_card in soup.select('.result'):
            title = job_card.select_one('.title').text.strip()
            company = job_card.select_one('.company').text.strip()
            location = job_card.select_one('.location').text.strip()
            description = job_card.select_one('.summary').text.strip()
            
            jobs.append({
                'title': title,
                'company': company,
                'location': location,
                'description': description
            })
        print("Data collected from web scraping successfully!")
        return jobs
    except requests.exceptions.RequestException as e:
        print(f"Web scraping request failed: {e}")
        return None


# Example for LinkedIn API (replace headers and parameters with actual API requirements)
def fetch_jobs_from_naukri():
    url = JOB_SOURCES["naukri_API"]
    headers = {
        "Authorization": f"Bearer {API_KEYS['example_platform']}"
    }
    params = {
        "keywords": "software developer",
        "location": "Remote"
    }
    return collect_job_data_from_api(url, headers=headers, params=params)


# Example for Indeed scraping
def fetch_jobs_from_cutshort():
    url = JOB_SOURCES["Cutshort_Scraping"]
    return collect_job_data_from_web(url)


# Collecting data from both sources
def main():
    # Fetch data from LinkedIn API
    naukri_jobs = fetch_jobs_from_naukri()
    if naukri_jobs:
        print(json.dumps(naukri_jobs, indent=2))

    # Fetch data from Indeed using scraping
    cutshort_jobs = fetch_jobs_from_cutshort()
    if cutshort_jobs:
        print(json.dumps(cutshort_jobs, indent=2))


if __name__ == "__main__":
    main()



API request failed: Expecting value: line 1 column 1 (char 0)
Data collected from web scraping successfully!
