In [1]:

"""
remoteok_scraper.py
Fetch RemoteOK jobs (via the JSON API if available, otherwise by parsing the HTML listing page) and save them to remoteok_jobs.csv
Fields saved: company, role, location, tags, remoteok_url
"""

from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup, FeatureNotFound

# Candidate feed URLs. try_json_endpoints() requests them in this order and
# stops at the first one whose response decodes to a JSON list.
ENDPOINTS = [
    "https://remoteok.com/api",
    "https://remoteok.io/api",
    "https://remoteok.com/json",
    "https://remoteok.com/remote-jobs.json",
    "https://remoteok.com/r"   # last resort; also the default page html_fallback() parses as HTML
]

# Headers sent with every HTTP request made in this file (JSON and HTML paths).
# NOTE(review): the contact URL in the User-Agent looks like a placeholder
# ("yourusername") — confirm and replace with a real one.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; RemoteOKScraper/1.0; +https://github.com/yourusername)"
}

def try_json_endpoints():
    """Probe each URL in ENDPOINTS and return the first JSON list found.

    Every endpoint is requested with HEADERS and a 12 second timeout. Any
    exception raised while fetching or decoding one endpoint is printed and
    the next endpoint is tried. A payload that is not a list is skipped
    without a message. When the first element of the list is a dict whose
    "id" equals 0 it is treated as a meta object and dropped.

    Returns:
        tuple: ``(jobs, endpoint_url)`` for the first usable endpoint, or
        ``(None, None)`` when no endpoint produced a JSON list.
    """
    for endpoint in ENDPOINTS:
        try:
            print(f"Trying JSON endpoint: {endpoint}")
            response = requests.get(endpoint, headers=HEADERS, timeout=12)
            response.raise_for_status()
            # An HTML (non-JSON) body makes .json() raise, which lands in the
            # except clause below and moves on to the next endpoint.
            payload = response.json()
            if not isinstance(payload, list):
                continue
            first = payload[0] if payload else None
            # Strip the leading meta object (id == 0) when present.
            if isinstance(first, dict) and first.get("id") == 0:
                payload = payload[1:]
            return payload, endpoint
        except Exception as err:
            print(f"  -> failed: {err}")
    return None, None

def parse_json_list(data):
    """Flatten raw job records from the JSON feed into CSV-ready rows.

    Field lookup tolerates alternative key names: ``company``/``company_name``,
    ``position``/``title``, ``location``/``locations``, ``tags``/``tags_list``
    and ``url``/``link``/``apply_url``.

    Values are coerced to text before being cleaned, so a feed that delivers a
    number (or a list containing numbers) where a string is expected no longer
    aborts the whole run with AttributeError/TypeError.

    Args:
        data: Iterable of decoded JSON values. Items that are not dicts are
            ignored.

    Returns:
        list[dict]: One dict per job with the keys ``company``, ``role``,
        ``location``, ``tags`` and ``remoteok_url``. Records that have no
        company, no role and no URL are omitted.
    """
    rows = []
    for job in data:
        if not isinstance(job, dict):
            continue

        company = str(job.get("company") or job.get("company_name") or "").strip()
        role = str(job.get("position") or job.get("title") or "").strip()
        url = job.get("url") or job.get("link") or job.get("apply_url") or ""

        # A record without any identifying field carries no job information
        # (e.g. a feed metadata object) and would only add a blank CSV row.
        if not (company or role or url):
            continue

        # Location may arrive as a plain string or as a list of places.
        location = job.get("location") or job.get("locations") or ""
        if isinstance(location, (list, tuple)):
            location = ",".join(str(place) for place in location)
        elif not isinstance(location, str):
            location = str(location)

        # Tags are usually a list; anything else is stored as its text form.
        tags = job.get("tags") or job.get("tags_list") or []
        if isinstance(tags, list):
            tags = ",".join(str(tag).strip() for tag in tags)
        else:
            tags = str(tags)

        rows.append({
            "company": company,
            "role": role,
            "location": location,
            "tags": tags,
            "remoteok_url": url
        })
    return rows

def html_fallback(url="https://remoteok.com/r"):
    """Scrape job rows from an HTML listing page.

    Each ``<tr class="job">`` element becomes one row. ``data-*`` attributes on
    the row are preferred; when one is missing, the text of the matching child
    element is used instead.

    Args:
        url: Page to download. It is also the base against which relative job
            links are resolved.

    Returns:
        list[dict]: One dict per job row with the keys ``company``, ``role``,
        ``location``, ``tags`` and ``remoteok_url``. Empty when the page has
        no ``tr.job`` elements.

    Raises:
        Exception: Whatever ``requests.get`` or ``raise_for_status`` raises is
            propagated to the caller.
    """
    print("Falling back to HTML parsing:", url)
    resp = requests.get(url, headers=HEADERS, timeout=12)
    resp.raise_for_status()
    try:
        soup = BeautifulSoup(resp.text, "lxml")
    except FeatureNotFound:
        # lxml is an optional install; the standard-library parser keeps the
        # fallback working when it is missing.
        soup = BeautifulSoup(resp.text, "html.parser")

    rows = []
    # RemoteOK frequently marks job rows with <tr class="job" data-company="..." ...>
    for tr in soup.find_all("tr", class_="job"):
        company = tr.get("data-company")
        if not company:
            company_cell = tr.find("td", class_="company")
            company = company_cell.get_text(strip=True) if company_cell else ""

        role = tr.get("data-position")
        if not role:
            heading = tr.find("h2")
            role = heading.get_text(strip=True) if heading else ""

        # Prefer the data-tags attribute; otherwise collect the tag elements.
        tags = tr.get("data-tags") or ",".join(
            tag.get_text(strip=True) for tag in tr.select(".tags .tag")
        )
        location = tr.get("data-location") or ""

        # urljoin leaves absolute hrefs untouched and resolves every relative
        # form (with or without a leading slash) against the page URL.
        anchor = tr.find("a", attrs={"itemprop": "url"})
        href = anchor.get("href") if anchor else None
        link = urljoin(url, href) if href else ""

        rows.append({
            "company": company.strip(),
            "role": role.strip(),
            "location": location,
            "tags": tags,
            "remoteok_url": link
        })
    return rows

def save_csv(rows, filename="remoteok_jobs.csv"):
    """Write the job rows to a CSV file and report how many were saved.

    Args:
        rows: Records accepted by ``pd.DataFrame`` (here, a list of dicts).
        filename: Destination path of the CSV file.
    """
    table = pd.DataFrame(rows)
    # index=False keeps the DataFrame's row index out of the file.
    table.to_csv(filename, index=False)
    print(f"\nSaved {len(table)} rows -> {filename}")

def main():
    """Run the scraper: JSON endpoints first, HTML page as the fallback.

    Rows parsed from the JSON feed are saved and the function returns. When no
    endpoint yields data, or the feed parses to zero rows, the HTML listing is
    scraped instead. Any exception on the HTML path (including one raised
    while saving) is printed rather than propagated.
    """
    print("RemoteOK scraper starting...")
    payload, source_url = try_json_endpoints()

    json_rows = []
    if payload:
        print("JSON data loaded from:", source_url)
        json_rows = parse_json_list(payload)
        if not json_rows:
            print("JSON returned zero jobs, trying HTML fallback.")

    if json_rows:
        save_csv(json_rows)
        return

    # Nothing usable came from the JSON feeds: scrape the HTML listing.
    try:
        scraped_rows = html_fallback("https://remoteok.com/r")
        if not scraped_rows:
            print("HTML fallback found no jobs.")
        else:
            save_csv(scraped_rows)
    except Exception as err:
        print("HTML fallback failed:", err)

# Entry point: runs the scraper when this code executes as the main module.
# The captured output below this cell shows the guard is true in the notebook.
if __name__ == "__main__":
    main()


RemoteOK scraper starting...
Trying JSON endpoint: https://remoteok.com/api
JSON data loaded from: https://remoteok.com/api

Saved 96 rows -> remoteok_jobs.csv


In [2]:
# Hand the CSV written by the scraper cell above to google.colab's download helper.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell is expected to fail in other Jupyter environments — confirm.
from google.colab import files
files.download('remoteok_jobs.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>