In [36]:
!pip install requests beautifulsoup4 pandas lxml




In [45]:
import requests  # fetch web pages
from bs4 import BeautifulSoup  # parse HTML and extract content
import pandas as pd
import time  # pause between requests
import random  # randomize the delay


In [38]:
# Request headers sent with every fetch; the User-Agent string identifies the
# client as a desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
}


In [72]:
# Maps an internal role key (snake_case) to the hyphenated slug used in the
# listing URL. Each key is simply its slug with hyphens turned into underscores.
SEARCH_ROLES = {
    slug.replace("-", "_"): slug
    for slug in (
        "data-scientist",
        "machine-learning-engineer",
        "data-analyst",
        "python-developer",
    )
}

# Listing-page URL template; `{role}` is filled with a slug from SEARCH_ROLES.
BASE_URL = "https://internshala.com/jobs/{role}-jobs/"


In [73]:
def get_job_links(role_url, max_links=10, timeout=15):
    """Collect job-detail URLs from a listing page.

    Args:
        role_url: URL of the listing page to fetch.
        max_links: Maximum number of links to return.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        A list of absolute job-detail URLs, de-duplicated while keeping the
        order in which they appear on the page, truncated to ``max_links``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(role_url, headers=HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # reporting "no links found".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    links = [
        "https://internshala.com" + a["href"]
        for a in soup.find_all("a", href=True)
        if a["href"].startswith("/job/detail/")
    ]

    # dict.fromkeys removes duplicates but preserves first-seen order.
    return list(dict.fromkeys(links))[:max_links]


In [74]:
# Smoke test: build the listing URL for one role and show a handful of links.
test_role = SEARCH_ROLES["data_scientist"]
test_url = BASE_URL.format(role=test_role)
print(f"Testing URL: {test_url}")

test_links = get_job_links(test_url, max_links=5)
print("Job links found:")
for link in test_links:
    print(link)


Testing URL: https://internshala.com/jobs/data-scientist-jobs/
Job links found:
https://internshala.com/job/detail/data-scientist-job-in-mumbai-at-dhurin1767519139
https://internshala.com/job/detail/data-scientist-job-in-ahmedabad-at-growexx1767177208
https://internshala.com/job/detail/data-scientist-job-in-haryana-at-namshicom1767521410
https://internshala.com/job/detail/remote-data-scientist-job-at-credgenics1767868989
https://internshala.com/job/detail/data-scientist-job-in-maharashtra-at-arkray-inc1767958147


In [75]:
# Repeat the lookup for the data-scientist role and report how many links came back.
test_links = get_job_links(
    BASE_URL.format(role=SEARCH_ROLES["data_scientist"]), max_links=5
)

print(f"Links found: {len(test_links)}")
for link in test_links:
    print(link)


Links found: 5
https://internshala.com/job/detail/data-scientist-job-in-mumbai-at-dhurin1767519139
https://internshala.com/job/detail/data-scientist-job-in-ahmedabad-at-growexx1767177208
https://internshala.com/job/detail/data-scientist-job-in-haryana-at-namshicom1767521410
https://internshala.com/job/detail/remote-data-scientist-job-at-credgenics1767868989
https://internshala.com/job/detail/data-scientist-job-in-maharashtra-at-arkray-inc1767958147


## Job scraping

In [80]:
def scrape_job_page(job_url, timeout=15):
    """Fetch one job-detail page and extract its main fields.

    Args:
        job_url: Absolute URL of the job-detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        A dict with the string keys ``job_title``, ``company``, ``location``
        and ``job_description``. A field whose element is not found on the
        page is returned as an empty string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(job_url, headers=HEADERS, timeout=timeout)
    # Do not parse an error page as if it were a job posting.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # job title
    title_tag = soup.find("h1")
    job_title = title_tag.get_text(strip=True) if title_tag else ""

    # company
    company_tag = soup.find("div", class_="company_name")
    company = company_tag.get_text(strip=True) if company_tag else ""

    # location
    # The original selector (span.location_link) is tried first. The fallbacks
    # accept the same class on any tag, then an element with id
    # "location_names".
    # NOTE(review): the fallback selectors are a best guess at the page markup;
    # confirm them against a live job page.
    location_tag = (
        soup.find("span", class_="location_link")
        or soup.find(class_="location_link")
        or soup.find(id="location_names")
    )
    location = (
        location_tag.get_text(separator=" ", strip=True) if location_tag else ""
    )

    # job description: flatten the whole details container into one string
    desc_container = soup.find("div", class_="internship_details")
    if desc_container:
        job_description = desc_container.get_text(separator=" ", strip=True)
    else:
        job_description = ""

    return {
        "job_title": job_title,
        "company": company,
        "location": location,
        "job_description": job_description
    }


In [82]:
# DEBUG: scrape the first data-scientist posting and dump every extracted field.
test_link = get_job_links(
    BASE_URL.format(role=SEARCH_ROLES["data_scientist"]), max_links=1
)[0]
job_data = scrape_job_page(test_link)

print("Keys:", job_data.keys())
for field, text in job_data.items():
    # Show at most the first 1000 characters of each value.
    print("", f"{field}:", text[:1000], sep="\n")


Keys: dict_keys(['job_title', 'company', 'location', 'job_description'])

job_title:
Data Scientist

company:
Dhurin

location:


job_description:
About the job About the Company Dhurin is a fast-growing data science firm serving financial services industry. We work with banks, NBFCs, fintech and other financial institutions. Our work is in the space of risk, fraud, collections, marketing, sales, and operations through it we directly influence how financial institutions make smarter decisions. Dhurin is the place for someone who is passionate about data and wants to grow with a well-positioned firm. Role Overview We are looking for Data Scientists (AI/ML Developers) across levels to join our Mumbai team. The role focuses on credit and fraud risk analytics within the retail banking space. You will design, develop, and implement ML models that drive smarter decision-making for financial institutions. Responsibilities & Requirements 2 to 10 years of AI/ML/ Data Science experience with exp

In [83]:
# Crawl every role's listing page, then each job page it links to.
# A failed request skips only that role or link, so one network error does
# not throw away everything collected so far.
jobs = []

for role_name, role_slug in SEARCH_ROLES.items():
    search_url = BASE_URL.format(role=role_slug)
    print(f"\nScraping role: {role_name}")

    try:
        job_links = get_job_links(search_url, max_links=10)
    except requests.RequestException as exc:
        print(f"  Could not fetch listing, skipping role: {exc}")
        time.sleep(random.uniform(1.5, 3))
        continue
    print(f"Found {len(job_links)} links")

    for link in job_links:
        print("  Scraping:", link)
        try:
            job_data = scrape_job_page(link)
        except requests.RequestException as exc:
            print(f"  Failed, skipping: {exc}")
        else:
            # Quality filter: ensure meaningful JD
            if len(job_data["job_description"]) > 300:
                jobs.append(job_data)

        # Randomized pause between requests, whether or not the last one succeeded.
        time.sleep(random.uniform(1.5, 3))



Scraping role: data_scientist
Found 10 links
  Scraping: https://internshala.com/job/detail/data-scientist-job-in-mumbai-at-dhurin1767519139
  Scraping: https://internshala.com/job/detail/data-scientist-job-in-ahmedabad-at-growexx1767177208
  Scraping: https://internshala.com/job/detail/data-scientist-job-in-haryana-at-namshicom1767521410
  Scraping: https://internshala.com/job/detail/remote-data-scientist-job-at-credgenics1767868989
  Scraping: https://internshala.com/job/detail/data-scientist-job-in-maharashtra-at-arkray-inc1767958147
  Scraping: https://internshala.com/job/detail/fresher-remote-data-scientist-job-at-v4cai1767783488
  Scraping: https://internshala.com/job/detail/remote-data-scientist-job-at-mindsprint1767871987
  Scraping: https://internshala.com/job/detail/remote-data-scientist-job-at-digital-ipsum1765446548
  Scraping: https://internshala.com/job/detail/fresher-remote-data-scientist-job-at-lendinghelio1764741581
  Scraping: https://internshala.com/job/detail/fresh

In [85]:
# creating & verifying dataset
# Column order mirrors the dict returned by scrape_job_page. It is used only
# when nothing was scraped, so the cleaning steps below still find the
# "job_description" column instead of raising KeyError on an empty frame.
JOB_COLUMNS = ["job_title", "company", "location", "job_description"]
df_jobs = pd.DataFrame(jobs) if jobs else pd.DataFrame(columns=JOB_COLUMNS)

print("Before cleaning:", df_jobs.shape)

# Build a new frame instead of mutating in place: drop repeated descriptions,
# then missing ones, then anything too short to be a real job description.
df_jobs = (
    df_jobs
    .drop_duplicates(subset="job_description")
    .loc[lambda d: d["job_description"].notna()]
    .loc[lambda d: d["job_description"].str.len() > 300]
)

print("After cleaning:", df_jobs.shape)

df_jobs.head()


Before cleaning: (39, 4)
After cleaning: (39, 4)


Unnamed: 0,job_title,company,location,job_description
0,Data Scientist,Dhurin,,About the job About the Company Dhurin is a fa...
1,Data Scientist,GrowExx,,About the job About the Company Growexx is loo...
2,Data Scientist,Namshi.com,,About the job Title: Data Scientist Location: ...
3,Data Scientist,Credgenics,,About the job About Credgenics: Credgenics is ...
4,Data Scientist,"ARKRAY, Inc.",,About the job Job title: Data Scientist (Full ...


In [86]:
# Number of records that passed the description-length filter during scraping.
print(f"Total jobs collected: {len(jobs)}")


Total jobs collected: 39


In [87]:
# Show which fields the first scraped record contains.
jobs[0].keys()



dict_keys(['job_title', 'company', 'location', 'job_description'])

In [88]:
# Spot-check the first record: title, company, and description length, one per line.
print(
    jobs[0]['job_title'],
    jobs[0]['company'],
    len(jobs[0]['job_description']),
    sep="\n",
)


Data Scientist
Dhurin
2613


In [89]:
import os

# Write the cleaned postings to <output_dir>/job_descriptions.csv, creating
# the directory first if it does not exist yet.
output_dir = "resumes"
output_path = os.path.join(output_dir, "job_descriptions.csv")

os.makedirs(output_dir, exist_ok=True)
df_jobs.to_csv(output_path, index=False, encoding="utf-8")

print(f"Saved to: {output_path}")


Saved to: resumes\job_descriptions.csv


In [90]:
import os

# Size of the written CSV in bytes.
print(os.stat(output_path).st_size)


138562


In [91]:
# Read the CSV back to confirm it was written correctly.
df_test = pd.read_csv(output_path)

# A bare `df_test.shape` in the middle of a cell is evaluated and discarded;
# print it so the shape is actually shown.
print("Shape:", df_test.shape)
assert df_test.shape == df_jobs.shape, "CSV round-trip changed the dataset shape"

df_test.head(2)


Unnamed: 0,job_title,company,location,job_description
0,Data Scientist,Dhurin,,About the job About the Company Dhurin is a fa...
1,Data Scientist,GrowExx,,About the job About the Company Growexx is loo...
