In [16]:
# import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin
from retry import retry 

In [51]:
# v3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

def _first_text(soup, class_name):
    """Return the text of the first element with CSS class ``class_name``, or None if there is none."""
    element = soup.find(class_=class_name)
    return element.text if element is not None else None


def _all_texts(soup, class_name):
    """Return the text of every element with CSS class ``class_name`` (an empty list if there are none)."""
    return [element.text for element in soup.find_all(class_=class_name)]


def _fetch_soup(session, page_url, timeout):
    """Download and parse ``page_url``; report the problem and return None if the request fails."""
    try:
        response = session.get(page_url, timeout=timeout)
        # An error page (403/404/429...) contains none of the expected
        # elements; treat it as a failed fetch instead of parsing it.
        response.raise_for_status()
    except requests.RequestException as error:
        # tqdm.write prints without corrupting the active progress bars.
        tqdm.write(f"Skipping {page_url}: {error}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def scrape_website(url, timeout=30):
    """
    Scrapes data from www.payscale.com for all job titles and industries.

    The crawl has three levels: the main page at ``url`` lists industries,
    each industry page lists job titles, and each job title page holds the
    values that are collected. Industry and job title pages that cannot be
    downloaded (network error, timeout, HTTP error status) are reported via
    ``tqdm.write`` and skipped, so one bad page does not discard the data
    already collected. Links without an ``href`` attribute are skipped too.

    Parameters:
    url (str): The URL of the website.
    timeout (float or tuple): Timeout in seconds passed to every HTTP request.

    Returns:
    List of Dictionaries: Each dictionary contains the data for a job title.
    Single-value fields are None when the element is missing from the page;
    multi-value fields are lists, empty when nothing matched.

    Raises:
    requests.RequestException: If the main page at ``url`` cannot be
    downloaded or answers with an HTTP error status.
    """
    base_url = "https://www.payscale.com"

    all_jobs_data = []  # List to store dictionaries for each job title

    # One session for the whole crawl so connections to the host are reused.
    with requests.Session() as session:
        # Step 1: Get the list of industries
        # Without the main page there is nothing to crawl, so failures propagate.
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        industry_links = soup.find_all(class_='related-content-card')

        for industry_link in tqdm(industry_links, desc="Industries", unit="industry"):
            industry_href = industry_link.get('href')
            if not industry_href:
                continue  # element carries the class but is not a usable link
            industry_url = urljoin(base_url, industry_href)

            # Step 2: Open each industry page
            industry_soup = _fetch_soup(session, industry_url, timeout)
            if industry_soup is None:
                continue

            # Step 3: Get job title links
            job_title_links = industry_soup.find_all(class_='subcats__links__item')

            for job_title_link in tqdm(job_title_links, desc="Job Titles", unit="job title", leave=False):
                job_title_href = job_title_link.get('href')
                if not job_title_href:
                    continue
                job_title_url = urljoin(base_url, job_title_href)

                # Step 4: Open each job title page
                job_title_soup = _fetch_soup(session, job_title_url, timeout)
                if job_title_soup is None:
                    continue

                # Step 5 + 6: Get info for the specified classes, build the
                # dictionary for the job title and append it to the list
                all_jobs_data.append({
                    "industry_url": industry_url,
                    "job_title_url": job_title_url,
                    "job_title": _first_text(job_title_soup, 'rc-sub-nav__title'),
                    "wage": _first_text(job_title_soup, 'paycharts__value'),
                    # Base for the wage: text of the first dropdown placeholder
                    "wage_base": _first_text(job_title_soup, 'Dropdown-placeholder'),
                    "experience_level": _all_texts(job_title_soup, 'name'),
                    "percentage_change": _all_texts(job_title_soup, 'arrow'),
                    "salaries": _all_texts(job_title_soup, 'tablerow__title'),
                    "salary_values": _all_texts(job_title_soup, 'tablerow__value'),
                    "respondents": _first_text(job_title_soup, 'paycharts__footer'),
                    "job_rating": _first_text(job_title_soup, 'job-rating--score'),
                    # Number of respondents behind the job rating
                    "job_rating_round": _first_text(job_title_soup, 'job-rating--profile-count'),
                    "genders": _all_texts(job_title_soup, 'gender__label'),
                    "gender_values": _all_texts(job_title_soup, 'gender__value'),
                    "gender_respondents": _first_text(job_title_soup, 'gender__blurb'),
                })

    return all_jobs_data

# Starting point of the crawl: the US job research index page
main_url = "https://www.payscale.com/research/US/Job"
# Run the scrape; the result is a list with one dictionary per job title
all_jobs = scrape_website(url=main_url)

Industries:   0%|          | 0/29 [00:00<?, ?industry/s]

In [48]:
# Turn the scraped records into a table (one row per job title) and display it
df = pd.DataFrame(data=all_jobs)
df

Unnamed: 0,industry_url,job_title_url,job_title,wage,wage_base,experience_level,percentage_change,salaries,salary_values,respondents,job_rating,job_rating_round,genders,gender_values,gender_respondents
0,https://www.payscale.com/research/US/Job/Accou...,https://www.payscale.com/research/US/Job=Staff...,Average Staff Accountant Salary,"$56,160",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼8%, ▼1%, ▲5%, ▲5%, ▲6%]","[Base Salary, Bonus, Profit Sharing, Commissio...","[$44k - $70k, $517 - $6k, $491 - $5k, $457 - $...","Based on 14,458 salary profiles (last updated ...",3.8 out of 5,"(1,476)","[Female, Male, Prefer to self-define]","[63.8%, 35.2%, 0.9%]","This data is based on 5,445 survey responses. ..."
1,https://www.payscale.com/research/US/Job/Accou...,https://www.payscale.com/research/US/Job=Finan...,Average Financial Analyst Salary,"$66,222",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼10%, ▼1%, ▲11%, ▲17%, ▲21%]","[Base Salary, Bonus, Profit Sharing, Commissio...","[$51k - $88k, $1k - $10k, $752 - $10k, $5 - $3...","Based on 14,267 salary profiles (last updated ...",3.8 out of 5,"(1,707)","[Male, Female, Prefer to self-define]","[58.6%, 40.5%, 0.8%]","This data is based on 5,560 survey responses. ..."
2,https://www.payscale.com/research/US/Job/Accou...,https://www.payscale.com/research/US/Job=Senio...,Average Senior Accountant Salary,"$73,492",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼12%, ▼4%, ▲2%, ▲4%, ▲2%]","[Base Salary, Bonus, Profit Sharing, Commissio...","[$57k - $92k, $1k - $10k, $821 - $7k, $100 - $...","Based on 12,554 salary profiles (last updated ...",3.8 out of 5,"(1,375)","[Female, Male, Prefer to self-define]","[65.3%, 33.6%, 1.1%]","This data is based on 4,777 survey responses. ..."
3,https://www.payscale.com/research/US/Job/Accou...,https://www.payscale.com/research/US/Job=Accou...,Average Accountant Salary,"$56,610",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼10%, ▼6%, ▲7%, ▲9%, ▲13%]","[Base Salary, Bonus, Profit Sharing, Commissio...","[$44k - $76k, $516 - $7k, $395 - $7k, $505 - $...","Based on 13,328 salary profiles (last updated ...",3.8 out of 5,"(1,350)","[Female, Male, Prefer to self-define]","[68.3%, 30.7%, 1.0%]","This data is based on 4,631 survey responses. ..."
4,https://www.payscale.com/research/US/Job/Accou...,https://www.payscale.com/research/US/Job=Finan...,Average Financial Controller Salary,"$91,776",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼27%, ▼15%, ▼2%, ▲6%, ▲5%]","[Base Salary, Bonus, Profit Sharing, Commissio...","[$63k - $133k, $2k - $22k, $1k - $14k, $935 - ...","Based on 11,582 salary profiles (last updated ...",4.1 out of 5,"(1,210)","[Female, Male, Prefer to self-define]","[65.1%, 33.9%, 0.9%]","This data is based on 4,632 survey responses. ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2811,https://www.payscale.com/research/US/Job/Trans...,https://www.payscale.com/research/US/Job=Logis...,Average Logistics Management Specialist Salary,"$73,905",/ year,"[Entry Level, Early Career, Mid Career, Late C...","[▼31%, ▼22%, ▼11%, ▲3%, ▲17%]","[Base Salary, Bonus, Profit Sharing, Total Pay]","[$50k - $109k, $603 - $9k, $0 - $393, $49k - $...",Based on 173 salary profiles (last updated Nov...,4.2 out of 5,(25),"[Male, Female, Prefer to self-define]","[65.6%, 31.1%, 3.3%]",This data is based on 61 survey responses. Lea...
2812,https://www.payscale.com/research/US/Job/Trans...,https://www.payscale.com/research/US/Job=Packe...,"Average Packer / Packager, Hand Hourly Pay",$15.16,/ hour,"[Entry Level, Early Career, Mid Career, Late C...","[▼16%, ▼1%, ▼3%, ▼4%, ▲10%]","[Hourly Rate, Bonus, Profit Sharing, Total Pay]","[$11 - $19, $20 - $26k, $0 - $59, $24k - $40k]",Based on 173 salary profiles (last updated Nov...,3 out of 5,(24),"[Female, Male, Prefer to self-define]","[48.0%, 48.0%, 4.0%]",This data is based on 50 survey responses. Lea...
2813,https://www.payscale.com/research/US/Job/Trans...,https://www.payscale.com/research/US/Job=Concr...,Average Concrete Pump Operator Hourly Pay,$26.04,/ hour,"[Entry Level, Early Career, Mid Career, Late C...","[▼26%, ▼10%, ▼1%, ▼0%, ▲12%]","[Hourly Rate, Bonus, Profit Sharing, Total Pay]","[$19 - $35, $199 - $7k, $0 - $5k, $40k - $75k]",Based on 177 salary profiles (last updated Oct...,3.8 out of 5,(20),[Male],[100.0%],This data is based on 43 survey responses. Lea...
2814,https://www.payscale.com/research/US/Job/Trans...,https://www.payscale.com/research/US/Job=Fleet...,Average Fleet Supervisor Salary,"$64,924",/ year,"[Early Career, Mid Career, Late Career, Experi...","[▼5%, ▲1%, ▼3%, ▲15%]","[Base Salary, Bonus, Profit Sharing, Total Pay]","[$50k - $84k, $2k - $13k, $1k - $5k, $46k - $87k]",Based on 163 salary profiles (last updated Oct...,3.8 out of 5,(24),"[Male, Female]","[84.4%, 15.6%]",This data is based on 77 survey responses. Lea...


In [49]:
# Save the table to disk, leaving out the pandas row index
df.to_csv(path_or_buf='output_v3.csv', index=False)