# JobSpy Testing Notebook

Interactive notebook for testing the JobSpy library for job scraping.

In [6]:
# Import libraries
from jobspy import scrape_jobs
import pandas as pd

# Display settings: no cap on the number of columns shown, cell text
# truncated at 50 characters, and no fixed output width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 50),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

## 1. Basic Job Search

Simple search across multiple job sites.

In [7]:
# Basic search - edit the values in this dict to try other queries.
# site_name options: indeed, linkedin, glassdoor, zip_recruiter, google
basic_search_params = {
    "site_name": ["indeed", "linkedin"],
    "search_term": "software engineer",
    "location": "San Francisco, CA",
    "results_wanted": 5,
    "hours_old": 72,
    "country_indeed": 'USA',
}
jobs = scrape_jobs(**basic_search_params)

print(f"Found {len(jobs)} jobs")
# The trailing expression is rendered as the cell output (at most 10 rows).
jobs.loc[:, ['title', 'company', 'location', 'date_posted']].head(10)

Found 10 jobs


Unnamed: 0,title,company,location,date_posted
0,Software Developer 5 (OS Kernel),Oracle,"Santa Clara, CA, US",2026-01-31
1,Senior Software Engineer (Core Infrastructure),Build Technologies,"San Francisco, CA, US",2026-01-31
2,"Software Development Engineer II, SPB Advertis...",Amazon.com,"Palo Alto, CA, US",2025-10-09
3,Software Engineer II,Amazon.com,"San Francisco, CA, US",2025-10-09
4,"Software Development Engineer, Frontier AI & R...",Amazon.com,"San Francisco, CA, US",2025-02-28
5,"Software Engineer, Fullstack, Early Career",Notion,"San Francisco, CA",2026-01-30
6,"Software Engineer, Developer Experience",Notion,"San Francisco, CA",2026-01-30
7,"Software Engineer, New Grad",Stripe,"San Francisco, CA",2026-01-29
8,"Software Engineer, IS&T Early Career Opportuni...",Apple,"Sunnyvale, CA",2026-01-29
9,"Software Engineer, IS&T Early Career Opportuni...",Apple,"Cupertino, CA",2026-01-29


## 2. Custom Search Parameters

Modify the search parameters below to test different queries.

In [8]:
# === CUSTOMIZE YOUR SEARCH HERE ===
SEARCH_TERM = "research scientist"   # Job title or keywords
LOCATION = "Georgia"                 # City, state, or country
SITES = ["indeed", "linkedin"]       # Job sites to scrape
MAX_RESULTS = 100                    # Results per site
HOURS_OLD = 48                       # Max posting age in hours (48 = 2 days, 168 = 7 days)
JOB_TYPE = "fulltime"                # Options: fulltime, parttime, internship, contract, or None
IS_REMOTE = False                    # True for remote jobs only

# Run the search with the settings above.
# NOTE(review): the JobSpy README describes per-site limits on combining
# hours_old with job_type / is_remote - confirm every filter is honoured
# for the sites listed in SITES.
custom_search_params = {
    "site_name": SITES,
    "search_term": SEARCH_TERM,
    "location": LOCATION,
    "results_wanted": MAX_RESULTS,
    "hours_old": HOURS_OLD,
    "job_type": JOB_TYPE,
    "is_remote": IS_REMOTE,
    "country_indeed": 'USA',
}
custom_jobs = scrape_jobs(**custom_search_params)

print(f"Found {len(custom_jobs)} jobs for '{SEARCH_TERM}' in {LOCATION}")

Found 28 jobs for 'research scientist' in Georgia


In [9]:
# Show the headline fields of each posting in custom_jobs (rendered as the cell output)
custom_jobs.loc[:, ['title', 'company', 'location', 'date_posted', 'job_url']]

Unnamed: 0,title,company,location,date_posted,job_url
0,"Scientist, Assistant Academic Research- School...",Emory University,"Atlanta, GA, US",2026-01-31,https://www.indeed.com/viewjob?jk=ab674f3c7486...
1,AI and Emerging Technologies Product Manager (...,Deloitte,"Atlanta, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=07a69939b7fa...
2,Gen AI Architect,Tata Consultancy Services (TCS),"Villa Rica, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=90ed853973bf...
3,"Senior Technical Director, GMP Facility- Winsh...",Emory University,"Atlanta, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=9c766b1ed3d2...
4,R&D Scientist,Fuji Vegetable Oil USA,"Savannah, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=8cb874240029...
5,Analytical Staff Scientist,Imerys,"Johns Creek, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=a6fc5accc4d9...
6,"Scientist, Incident Management",The Coca-Cola Company,"Atlanta, GA, US",2026-01-30,https://www.indeed.com/viewjob?jk=68f9ed80905f...
7,"Principal Data Scientist, Agentic AI Professio...",Amazon Web Services,"Atlanta, GA, US",2025-12-17,https://www.indeed.com/viewjob?jk=121f3ae5c484...
8,Clinical Researcher,DataAnnotation,"Georgia, United States",2026-01-30,https://www.linkedin.com/jobs/view/4304435523
9,Senior Food Scientist - Sauces/Condiments,CSG Talent,"Atlanta, GA",2026-01-30,https://www.linkedin.com/jobs/view/4367424770


## URL Validation

Check which job URLs are still live and accessible.

In [10]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def _probe_status(url, headers, timeout=10):
    """Return the HTTP status code for ``url``, or 'timeout/error' if the request fails."""
    try:
        resp = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
        if resp.status_code == 405:
            # 405 = the server refuses HEAD, which says nothing about whether the
            # page exists. Retry with a streamed GET so the body is not downloaded.
            resp = requests.get(url, headers=headers, timeout=timeout,
                                allow_redirects=True, stream=True)
            resp.close()
        return resp.status_code
    except (requests.RequestException, ValueError):
        # Network failures, timeouts and malformed URLs. Deliberately narrower than
        # a bare ``except`` so KeyboardInterrupt / SystemExit are not swallowed.
        return 'timeout/error'


def check_url(row):
    """Check whether a job's URLs respond, and pick the best one to use.

    Args:
        row: A DataFrame row (pandas Series) with a 'job_url' entry and,
            optionally, a 'job_url_direct' entry.

    Returns:
        dict with keys:
            'index'             - the row's label (``row.name``)
            'job_url_status'    - HTTP status code, or 'timeout/error'
            'direct_url_status' - same for 'job_url_direct'; None when the
                                  row has no direct URL
            'best_url'          - the direct URL if it returned 200,
                                  otherwise 'job_url'
    """
    url = row['job_url']
    direct_url = row.get('job_url_direct')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    result = {
        'index': row.name,
        'job_url_status': _probe_status(url, headers),
        'direct_url_status': None,
        'best_url': url,  # default; replaced below only by a working direct URL
    }

    # Check job_url_direct if available
    if pd.notna(direct_url) and direct_url:
        result['direct_url_status'] = _probe_status(str(direct_url), headers)
        if result['direct_url_status'] == 200:
            result['best_url'] = direct_url

    return result

# Run URL checks in parallel (one check_url call per row, up to 10 at a time)
print("Checking URLs (this may take a moment)...\n")

STATUS_COLUMNS = ['index', 'job_url_status', 'direct_url_status', 'best_url']


def _display_status(status):
    """Show whole-number float statuses (e.g. 200.0) as plain integers."""
    if isinstance(status, float) and status.is_integer():
        return int(status)
    return status


with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(check_url, row) for _, row in custom_jobs.iterrows()]
    results = [future.result() for future in as_completed(futures)]

# Add results back to dataframe. Naming the columns explicitly keeps this step
# working when there were no rows to check (an empty result list would
# otherwise produce a frame with no 'index' column to set).
url_df = pd.DataFrame(results, columns=STATUS_COLUMNS).set_index('index')
custom_jobs['url_status'] = url_df['job_url_status']
custom_jobs['direct_url_status'] = url_df['direct_url_status']
custom_jobs['best_url'] = url_df['best_url']

# Summary - based on the main job_url status only; the direct URL status is
# reported per job below.
live = custom_jobs[custom_jobs['url_status'] == 200]
dead = custom_jobs[custom_jobs['url_status'] != 200]

print(f"✓ Live URLs: {len(live)}/{len(custom_jobs)}")
print(f"✗ Dead/blocked URLs: {len(dead)}/{len(custom_jobs)}\n")

# Show status for each job
for row_label, row in custom_jobs.iterrows():
    status = "✓" if row['url_status'] == 200 else "✗"
    direct = ""
    if pd.notna(row.get('direct_url_status')):
        # A status column mixing numbers with missing values comes back as
        # floats, so format it to avoid printing "200.0".
        direct = f" | direct: {_display_status(row['direct_url_status'])}"
    # str() guards against a missing (non-string) title before slicing
    title = str(row['title'])[:50]
    print(f"{status} [{_display_status(row['url_status'])}] {title} — {row['company']}{direct}")

Checking URLs (this may take a moment)...

✓ Live URLs: 20/28
✗ Dead/blocked URLs: 8/28

✗ [403] Scientist, Assistant Academic Research- School of  — Emory University | direct: 200.0
✗ [403] AI and Emerging Technologies Product Manager (Cons — Deloitte | direct: 200.0
✗ [403] Gen AI Architect — Tata Consultancy Services (TCS) | direct: 405.0
✗ [403] Senior Technical Director, GMP Facility- Winship C — Emory University | direct: 200.0
✗ [403] R&D Scientist — Fuji Vegetable Oil USA | direct: 200.0
✗ [403] Analytical Staff Scientist — Imerys | direct: 200.0
✗ [403] Scientist, Incident Management — The Coca-Cola Company | direct: 200.0
✗ [403] Principal Data Scientist, Agentic AI Professional  — Amazon Web Services | direct: 200.0
✓ [200] Clinical Researcher — DataAnnotation
✓ [200] Senior Food Scientist - Sauces/Condiments — CSG Talent
✓ [200] Postdoctoral Researcher — DataAnnotation
✓ [200] SFPara - Research Assistant — Inside Higher Ed
✓ [200] Postdoctoral Researcher, Physics — Kennesaw

In [27]:
# Stop truncating cell text from here on (affects every later display)
pd.options.display.max_colwidth = None

In [28]:
# Inspect every field of the second row (position 1) of custom_jobs
custom_jobs.iloc[1, :]

id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

## 3. View All Available Columns

See what data is available for each job listing.

In [18]:
# Print the name of every column in custom_jobs, sorted alphabetically
print("Available columns:")
column_names = sorted(custom_jobs.columns)
for column_name in column_names:
    print(f"  - {column_name}")

Available columns:
  - company
  - company_addresses
  - company_description
  - company_industry
  - company_logo
  - company_num_employees
  - company_rating
  - company_revenue
  - company_reviews_count
  - company_url
  - company_url_direct
  - currency
  - date_posted
  - description
  - emails
  - experience_range
  - id
  - interval
  - is_remote
  - job_function
  - job_level
  - job_type
  - job_url
  - job_url_direct
  - listing_type
  - location
  - max_amount
  - min_amount
  - salary_source
  - site
  - skills
  - title
  - vacancy_count
  - work_from_home_type


In [19]:
# View detailed info for a specific job (change index to explore different jobs)
job_index = 0
preview_chars = 500  # how much of the description to print

# Accept any position iloc can resolve, including negative ones; anything else
# (or an empty frame) falls through to the "No jobs found" message instead of
# raising IndexError.
if -len(custom_jobs) <= job_index < len(custom_jobs):
    job = custom_jobs.iloc[job_index]
    print(f"=== {job['title']} at {job['company']} ===")
    print(f"\nLocation: {job['location']}")
    print(f"Posted: {job['date_posted']}")
    print(f"URL: {job['job_url']}")

    # Salary is printed only when both bounds are present
    min_amount, max_amount = job.get('min_amount'), job.get('max_amount')
    if pd.notna(min_amount) and pd.notna(max_amount):
        print(f"Salary: ${min_amount:,.0f} - ${max_amount:,.0f}")

    description = job.get('description')
    if pd.notna(description):
        # Append the ellipsis only when text was actually cut off
        ellipsis = "..." if len(description) > preview_chars else ""
        print(f"\nDescription (first {preview_chars} chars):\n{description[:preview_chars]}{ellipsis}")
else:
    print("No jobs found")

=== Scientist, Assistant Academic Research- School of Medicine at Emory University ===

Location: Atlanta, GA, US
Posted: 2026-01-31
URL: https://www.indeed.com/viewjob?jk=ab674f3c748683a7

Description (first 500 chars):
Discover Your Career at Emory University:

Emory University is a leading research university that fosters excellence and attracts world\-class talent to innovate today and prepare leaders for the future. We welcome candidates who can contribute to the excellence of our academic community.
Description:

JOB DESCRIPTION:
* Under minimal supervision, the Assistant Academic Research Scientist performs a wide range of highly technical and complex tasks necessary to advance basic and/or translational ...


## 4. Remote Jobs Search

In [20]:
# Search for remote positions (LinkedIn only, remote filter switched on)
remote_search_params = {
    "site_name": ["linkedin"],
    "search_term": "python developer",
    "location": "USA",
    "results_wanted": 5,
    "is_remote": True,
}
remote_jobs = scrape_jobs(**remote_search_params)

print(f"Found {len(remote_jobs)} remote jobs")
# The trailing expression is rendered as the cell output
remote_jobs.loc[:, ['title', 'company', 'location', 'job_url']]

Found 5 remote jobs


Unnamed: 0,title,company,location,job_url
0,Software Engineer - Python,Tomo,,https://www.linkedin.com/jobs/view/4367494236
1,Python Developer,HireTalent - Staffing & Recruiting Firm,,https://www.linkedin.com/jobs/view/4366128640
2,Full Stack Engineer 5 - Revenue & Growth Tools,Netflix,,https://www.linkedin.com/jobs/view/4355680330
3,Python Developer - Make Healthcare Great Again...,Venchr,,https://www.linkedin.com/jobs/view/4329553573
4,Python Developer | Remote,Crossing Hurdles,,https://www.linkedin.com/jobs/view/4360839906


## 5. Export Results to CSV

In [8]:
# Write custom_jobs (all of its columns, without the index) to a CSV file
output_file = "job_results.csv"
saved_count = len(custom_jobs)
custom_jobs.to_csv(path_or_buf=output_file, index=False)
print(f"Saved {saved_count} jobs to {output_file}")

Saved 20 jobs to job_results.csv


## 6. Filter and Analyze Results

In [9]:
# Filter jobs by keyword in title
keyword = "senior"  # Change this to filter by different keywords

# case=False makes the match case-insensitive on BOTH sides, so a keyword
# typed as "Senior" still matches. regex=False treats the keyword as plain
# text, so inputs such as "c++" or "(contract)" are matched literally instead
# of being parsed as a regular expression. Rows with a missing title never match.
title_matches = custom_jobs['title'].str.contains(keyword, case=False, regex=False, na=False)
filtered = custom_jobs[title_matches]
print(f"Jobs with '{keyword}' in title: {len(filtered)}")
filtered[['title', 'company', 'location']]

Jobs with 'senior' in title: 2


Unnamed: 0,title,company,location
2,"Senior Technical Director, GMP Facility- Winsh...",Emory University,"Atlanta, GA, US"
14,Senior Research Scientist,"Hopewell Designs, Inc.","Alpharetta, GA"


In [10]:
# Count postings per company, most frequent first; skipped when there is no data
if not custom_jobs.empty:
    company_counts = custom_jobs['company'].value_counts()
    print("Jobs per company:", company_counts, sep="\n")

Jobs per company:
company
DataAnnotation                     4
Emory Healthcare                   2
Battelle                           2
Deloitte                           1
Tata Consultancy Services (TCS)    1
Emory University                   1
Fuji Vegetable Oil USA             1
Imerys                             1
The Coca-Cola Company              1
Siemens Healthineers               1
Amazon Web Services                1
Hopewell Designs, Inc.             1
Augusta University                 1
AmSpec Group                       1
Lensa                              1
Name: count, dtype: int64


## Quick Reference

### Available Sites
- `indeed`
- `linkedin`
- `glassdoor`
- `zip_recruiter`
- `google`

### Job Types
- `fulltime`
- `parttime`
- `internship`
- `contract`

### Key Parameters
| Parameter | Description |
|-----------|-------------|
| `search_term` | Job title or keywords |
| `location` | City, state, or country (use `is_remote` to restrict to remote jobs) |
| `results_wanted` | Max results per site |
| `hours_old` | Filter by posting age (hours) |
| `job_type` | Employment type filter |
| `is_remote` | True for remote only |