FreshersWorld Job Scrapper

In [None]:
!pip install requests beautifulsoup4 pandas




Scrap Single Page First

In [None]:
# Extract job postings
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the Freshersworld URL for scraping
url = "https://www.freshersworld.com/jobs/jobsearch"

# Send request and get page content
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    print("Page fetched successfully!")
    soup = BeautifulSoup(response.text, "html.parser")

    # Extracting job listings
    jobs = []

    for job in soup.find_all("div", class_="job-desc-block"):
        try:
            title = job.find("span", class_="wrap-title seo_title").text.strip()
        except:
            title = "N/A"

        try:
            company = job.find("h3", class_="latest-jobs-title font-16 margin-none inline-block company-name").text.strip()
        except:
            company = "N/A"

        try:
            location = job.find("span", class_="job-location display-block modal-open job-details-span").text.strip()
        except:
            location = "N/A"

        try:
            experience = job.find("span", class_="experience job-details-span").text.strip()
        except:
            experience = "N/A"

        try:
            salary = job.find("span", class_="qualifications display-block modal-open pull-left job-details-span").text.strip()
        except:
            salary = "N/A"

        jobs.append([title, company, location, experience, salary])

    # Convert to DataFrame
    df = pd.DataFrame(jobs, columns=["Job Title", "Company", "Location", "Experience", "Salary"])

    # Save to Excel
    df.to_excel("Freshersworld_Jobs.xlsx", index=False)

    print("Data saved successfully!")
else:
    print("Failed to fetch the page.")


Page fetched successfully!
Data saved successfully!


Srap Multiple Pages

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to scrape job listings from multiple pages
def scrape_freshersworld_jobs(pages=5):
    base_url = "https://www.freshersworld.com/jobs/jobsearch"
    job_list = []

    for page in range(1, pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?&limit=20&offset={(page-1) * 20}"
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})

        if response.status_code != 200:
            print("Failed to fetch page", page)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        job_blocks = soup.find_all('div', class_='col-md-12 col-xs-12 job_listing_alignment')

        for job in job_blocks:
            try:
                title = job.find('span', class_='wrap-title seo_title').text.strip()
                company = job.find('h3', class_='latest-jobs-title').text.strip()
                location = job.find('span', class_='job-location').text.strip()
                experience = job.find('span', class_='experience job-details-span').text.strip()
                salary = job.find('span', class_='qualifications display-block').text.strip()

                job_list.append([title, company, location, experience, salary])
            except AttributeError:
                continue

        time.sleep(2)  # Adding delay to avoid getting blocked

    return job_list

# Scraping jobs from multiple pages
jobs_data = scrape_freshersworld_jobs(pages=5)

# Creating DataFrame
df = pd.DataFrame(jobs_data, columns=['Job Title', 'Company', 'Location', 'Experience', 'Salary'])

# Saving to Excel
excel_file = "/content/Freshersworld_Jobs.xlsx"
df.to_excel(excel_file, index=False)
print(f"Scraped {len(df)} job postings and saved to Excel: {excel_file}")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraped 0 job postings and saved to Excel: /content/Freshersworld_Jobs.xlsx


Updating excel sheet as new job posting

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Path to the existing Excel file
excel_file = "/content/Freshersworld_Jobs.xlsx"

# Function to read existing job data
def read_existing_jobs(file_path):
    if os.path.exists(file_path):
        return pd.read_excel(file_path)
    return pd.DataFrame(columns=['Job Title', 'Company', 'Location', 'Experience', 'Salary'])

# Function to scrape job listings
def scrape_freshersworld_jobs(pages=5):
    base_url = "https://www.freshersworld.com/jobs/jobsearch"
    job_list = []

    for page in range(1, pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?&limit=20&offset={(page-1) * 20}"
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})

        if response.status_code != 200:
            print("Failed to fetch page", page)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        job_blocks = soup.find_all('div', class_='col-md-12 col-xs-12 job_listing_alignment')

        for job in job_blocks:
            try:
                title = job.find('span', class_='wrap-title seo_title').text.strip()
                company = job.find('h3', class_='latest-jobs-title').text.strip()
                location = job.find('span', class_='job-location').text.strip()
                experience = job.find('span', class_='experience job-details-span').text.strip()
                salary = job.find('span', class_='qualifications display-block').text.strip()

                job_list.append([title, company, location, experience, salary])
            except AttributeError:
                continue

        time.sleep(2)  # Adding delay to avoid getting blocked

    return pd.DataFrame(job_list, columns=['Job Title', 'Company', 'Location', 'Experience', 'Salary'])

# Load existing jobs
existing_jobs = read_existing_jobs(excel_file)

# Scrape new job data
new_jobs = scrape_freshersworld_jobs(pages=5)

# Combine old and new data, remove duplicates
updated_jobs = pd.concat([existing_jobs, new_jobs]).drop_duplicates().reset_index(drop=True)

# Save updated job listings to the same Excel file
updated_jobs.to_excel(excel_file, index=False)

print(f"Updated job postings saved to: {excel_file}")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Updated job postings saved to: /content/Freshersworld_Jobs.xlsx


Prototype the model with a single record

In [None]:
from google.colab import drive
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# Mount Google Drive
drive.mount('/content/drive')

# Set path for saving the Excel file
excel_path = "/content/drive/MyDrive/JobScraper/Freshersworld_Jobs.xlsx"

# Function to scrape job listings from multiple pages
def scrape_freshersworld_jobs(pages=5):
    base_url = "https://www.freshersworld.com/jobs/jobsearch"
    job_list = []

    for page in range(1, pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?&limit=20&offset={(page-1) * 20}"
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})

        if response.status_code != 200:
            print("Failed to fetch page", page)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        job_blocks = soup.find_all('div', class_='col-md-12 col-xs-12 job_listing_alignment')

        for job in job_blocks:
            try:
                title = job.find('span', class_='wrap-title seo_title').text.strip()
                company = job.find('h3', class_='latest-jobs-title').text.strip()
                location = job.find('span', class_='job-location').text.strip()
                experience = job.find('span', class_='experience job-details-span').text.strip()
                salary = job.find('span', class_='qualifications display-block').text.strip()

                job_list.append([title, company, location, experience, salary])
            except AttributeError:
                continue

        time.sleep(2)  # Adding delay to avoid getting blocked

    return job_list

# Load existing data if file exists
if os.path.exists(excel_path):
    existing_df = pd.read_excel(excel_path)
else:
    existing_df = pd.DataFrame(columns=['Job Title', 'Company', 'Location', 'Experience', 'Salary'])

# Scrape jobs
new_jobs_data = scrape_freshersworld_jobs(pages=5)
new_df = pd.DataFrame(new_jobs_data, columns=['Job Title', 'Company', 'Location', 'Experience', 'Salary'])

# Append new jobs and remove duplicates
final_df = pd.concat([existing_df, new_df]).drop_duplicates().reset_index(drop=True)

# Save updated data back to Excel
final_df.to_excel(excel_path, index=False)
print(f"Updated Excel file saved at: {excel_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Updated Excel file saved at: /content/drive/MyDrive/JobScraper/Freshersworld_Jobs.xlsx


In [1]:
!git config --global user.name "vanshmodii"
!git config --global user.email "vanshmodi5@gmail.com"


In [3]:
!git clone https://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@github.com/vanshmodii/Freshersworld-Scraper.git

Cloning into 'Freshersworld-Scraper'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [None]:
import shutil

# Move the Excel file
shutil.move("/mnt/data/Freshersworld_Jobs.xlsx", "/content/Freshersworld-Scraper")

# Move the Python script
shutil.move("/content/Freshersworld_Scraper.py", "/content/Freshersworld-Scraper")
