In [3]:
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [4]:
# Site root; listing and job paths are appended to this.
BASE_URL = "https://remoteok.com"

# Request headers passed to requests.get by the HTML scraping functions.
headers = {
    "User-Agent": "Mozilla/5.0",
}

In [5]:
def scrape_listing_page(timeout=10):
    """Fetch the remote-dev-jobs listing and return one record per job row.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``tags`` and
        ``job_url``. Values are read from the ``data-*`` attributes of each
        ``<tr class="job">`` element; ``job_url`` is ``None`` when the row has
        no ``data-url`` attribute.

    Raises:
        requests.HTTPError: If the listing page answers with a 4xx/5xx status.
            Failing here is preferable to parsing an error page and silently
            returning an empty list.
    """
    url = f"{BASE_URL}/remote-dev-jobs"
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")

    job_links = []
    for row in soup.find_all("tr", class_="job"):
        # Look the attribute up once; it is both the guard and the value.
        relative_url = row.get("data-url")
        job_links.append({
            "title": row.get("data-position"),
            "company": row.get("data-company"),
            "tags": row.get("data-tags"),
            "job_url": BASE_URL + relative_url if relative_url else None,
        })
    return job_links

In [6]:
# Salary heuristic: prefer a comma-grouped amount ("$120,000"); otherwise take
# a run of two or more digits with an optional "k" suffix ("$80k"). The older
# two-to-three digit pattern cut "$120,000" down to "$120".
_SALARY_RE = re.compile(r"\$\d{1,3}(?:,\d{3})+|\$\d{2,}[kK]?")
_EXPERIENCE_RE = re.compile(r"\d+\+? ?years?")
# Checked in order; the first keyword found in the text wins.
_JOB_TYPE_KEYWORDS = ("full-time", "part-time", "contract", "internship")


def _fields_from_description(description):
    """Derive the keyword/regex-based fields from a description (or None)."""
    text = description.lower() if description else ""

    salary_match = _SALARY_RE.search(text)
    exp_match = _EXPERIENCE_RE.search(text)

    # Deadline heuristic: whatever follows the last "apply before", up to the
    # next full stop.
    if "apply before" in text:
        deadline = text.split("apply before")[-1].split(".")[0].strip()
    else:
        deadline = "not specified"

    return {
        "location": "remote" if "remote" in text else "not specified",
        "job_type": next(
            (keyword for keyword in _JOB_TYPE_KEYWORDS if keyword in text),
            "not specified",
        ),
        "salary": salary_match.group() if salary_match else None,
        "experience_level": exp_match.group() if exp_match else "not specified",
        "deadline": deadline,
    }


def scrape_detail_page(job, timeout=10):
    """Fetch a job's detail page and enrich ``job`` with text-derived fields.

    On success the keys ``description``, ``location``, ``job_type``,
    ``salary``, ``experience_level`` and ``deadline`` are written into ``job``.
    All fields other than ``description`` are rough heuristics over the
    lowercased description text.

    Args:
        job: Record holding at least the key ``job_url``. Mutated in place.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The same ``job`` dict. It is returned untouched when ``job_url`` is
        falsy or when fetching/parsing fails; failures are printed rather than
        raised so one bad page does not abort a whole scraping run.
    """
    url = job["job_url"]
    if not url:
        return job

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as a failure instead of mining an error page for fields.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        desc_block = (
            soup.find("div", class_="description")
            or soup.find("div", {"id": "job-description"})
        )
        description = desc_block.get_text(separator=" ").strip() if desc_block else None

        # Compute everything first, then write once, so an exception part-way
        # through never leaves a half-populated record.
        extracted = {"description": description}
        extracted.update(_fields_from_description(description))
        job.update(extracted)
        return job

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return job

In [7]:
# Run both passes
# Pass 1 collects one record per listing row; pass 2 fetches each job's own
# page and adds the description-derived fields to that record.
print("🔎 Scraping listing page...")
jobs_basic = scrape_listing_page()

print("🔎 Scraping individual job pages...")
jobs_full = []
for job in tqdm(jobs_basic):
    # scrape_detail_page mutates and returns the dict it is given, so
    # jobs_full ends up holding the same objects as jobs_basic.
    job_full = scrape_detail_page(job)
    jobs_full.append(job_full)
    time.sleep(1)  # be nice to the server

🔎 Scraping listing page...
🔎 Scraping individual job pages...


100%|██████████| 19/19 [00:32<00:00,  1.70s/it]


In [12]:
# Save to CSV
# The path lives in one variable so the confirmation message always names the
# file that was actually written (the message used to claim a "data/" prefix
# that the write call never used).
output_path = "full_jobs_remoteok.csv"
df = pd.DataFrame(jobs_full)
df.to_csv(output_path, index=False)
print(f"✅ Done. Saved to '{output_path}'")


✅ Done. Saved to 'data/full_jobs_remoteok.csv'


In [13]:
# scripts/scrape_remoteok_api.py
import requests
import pandas as pd

def scrape_remoteok(timeout=10):
    """Fetch job postings from the RemoteOK JSON API as a DataFrame.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with the columns ``title``, ``company``, ``location``,
        ``tags``, ``date``, ``job_url`` and ``description``. The columns are
        present even when no posting qualifies. Entries that are not dicts or
        that lack a ``company`` or ``position`` value are skipped.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    base_url = "https://remoteok.com"
    columns = ["title", "company", "location", "tags", "date", "job_url", "description"]
    headers = {"User-Agent": "Mozilla/5.0"}

    resp = requests.get(f"{base_url}/api", headers=headers, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    jobs = []
    for job in data:
        if not isinstance(job, dict):
            continue
        if not (job.get("company") and job.get("position")):
            continue

        # The url field may be absolute or site-relative; blindly prefixing
        # the site root would corrupt an absolute URL and turn a missing one
        # into "...comNone".
        raw_url = job.get("url")
        if not raw_url:
            job_url = None
        elif raw_url.startswith(("http://", "https://")):
            job_url = raw_url
        else:
            job_url = f"{base_url}{raw_url}"

        jobs.append({
            "title": job.get("position"),
            "company": job.get("company"),
            # `or` rather than a .get default: the default only applies when
            # the key is absent, not when the value is "" or None.
            "location": job.get("location") or "Remote",
            "tags": ",".join(map(str, job.get("tags") or [])),
            "date": job.get("date"),
            "job_url": job_url,
            "description": (job.get("description") or "").strip(),
        })
    return pd.DataFrame(jobs, columns=columns)

# Runs when this cell/script is executed directly: fetch the API listings,
# print how many rows came back, and write them to a CSV at a relative path.
# Note that this assigns the module-level name `df`.
if __name__ == "__main__":
    df = scrape_remoteok()
    print(f"✅ Fetched {len(df)} jobs")
    df.to_csv("remoteok_jobs.csv", index=False)


✅ Fetched 99 jobs


In [15]:
# scripts/scrape_internshala.py
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_internshala(query="Data Analyst", wait_seconds=5):
    """Scrape Internshala internship cards for ``query`` into a DataFrame.

    Args:
        query: Search phrase; spaces are replaced with hyphens to build the
            listing URL.
        wait_seconds: Fixed pause after navigation, giving the page time to
            load before its source is read.

    Returns:
        A DataFrame with the columns ``title``, ``company``, ``location``,
        ``stipend`` and ``posted``. The columns are present even when no card
        is found, and a field is ``None`` when its element is missing from a
        card.
    """
    columns = ["title", "company", "location", "stipend", "posted"]

    driver = webdriver.Chrome()
    try:
        driver.get(f"https://internshala.com/internships/{query.replace(' ', '-')}-internship")
        time.sleep(wait_seconds)  # wait for page load
        page_source = driver.page_source
    finally:
        # Always release the browser, even if navigation fails; only the
        # captured HTML is needed from here on.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    def text_of(node):
        # find() returns None for a missing element; don't let one incomplete
        # card abort the whole scrape.
        return node.text.strip() if node is not None else None

    jobs = [
        {
            "title": text_of(post.find("h4")),
            "company": text_of(post.find("div", class_="company_name")),
            "location": text_of(post.find("a", class_="location_link")),
            "stipend": text_of(post.find("span", class_="stipend")),
            "posted": text_of(post.find("div", class_="item_body")),
        }
        for post in soup.find_all("div", class_="individual_internship")
    ]
    return pd.DataFrame(jobs, columns=columns)

# Runs when this cell/script is executed directly: scrape the "python"
# internship listing, print how many rows came back, and write them to a CSV
# at a relative path. Note that this assigns the module-level name `df`.
if __name__ == "__main__":
    df = scrape_internshala("python")
    print(f"✅ Fetched {len(df)} internships")
    df.to_csv("internshala_jobs.csv", index=False)


✅ Fetched 0 internships
