# Indeed Job Scraper

The tobias-fyi fork of the repository can be found here: [tobias-fyi/Indeed-Job-Scraper](https://github.com/tobias-fyi/Indeed-Job-Scraper/)

---
---

In [1]:
# ====== Imports ====== #
from bs4 import BeautifulSoup
import requests, json
import pandas as pd
from multiprocessing import Pool
from functools import partial

In [21]:
# ====== Scraping parameters ====== #
parameters = {
    "search_query": "data science",
    "location": "Denver, CO",
    "miles": 50,
    "ordered_keywords": "data",
    "exclude_keywords": "",
    "title_keywords": "",
    "pages": 1,
}

In [14]:
# ====== Class for scraping Indeed ====== #
# ======   With multiprocessing    ====== #

class Scrape:
    def __init__(self, parameters):
        self.output_frame = None
        self.loading = False
        
        # Retain parameter dict
        self.parameters = parameters

        # Create base Indeed URL for all further scraping
        self.what = parameters["search_query"]
        self.where = parameters["location"]
        self.miles = parameters["miles"]
        self.base_url = f"https://www.indeed.com/jobs?q={self.what}&l={self.where}"
        
        # Set other attributes
        self.keywords = parameters["ordered_keywords"]
        self.title_keywords = parameters["title_keywords"]
        self.exclude_keywords = parameters["exclude_keywords"]
        self.pages = parameters["pages"]

        self.total_keywords = len(self.keywords) + len(self.title_keywords)

    def rate_job(self, j_title, j_soup):
        """Rate job based on input parameters."""

        description = j_soup.find(id="jobDescriptionText").get_text()

        # Reset the params
        keywords_present = []
        title_keywords_present = []
        rating = 0

        # Check for keyword, add value to rating depending on ranking
        for index, keyword in enumerate(self.keywords):
            if keyword in description:
                rating += len(keywords) - index
                keywords_present.append(keyword)

        # Check for title keywords
        for index, keyword in enumerate(self.title_keywords):
            if keyword in j_title:
                rating += total_keywords - index
                title_keywords_present.append(keyword)

        # Normalise rating
        rating = rating / sum(range(1, self.total_keywords + 1))

        # Check for excluded keywords
        for keyword in self.exclude_keywords:
            if keyword in j_title:
                rating = 0
                break

        return description, rating, keywords_present, title_keywords_present


    def get_job_details(self, job):
        """Obtain details of the job (company, title, description etc.)"""

        # Get link and title
        job_url = job.find(class_="title").a["href"]

        job_url = ( # Correct for truncated URLs
            "https://www.indeed.com" + job_url if (job_url.startswith("/")) else job_url
        )
        job_page = requests.get(job_url)
        job_soup = BeautifulSoup(job_page.content, "html.parser")

        # Give URL after redirect (ads/analytics etc.)
        job_url = job_page.url

        # Get job title and company name
        title = job.find(class_="title").a["title"]
        company = job_soup.find(class_="icl-u-lg-mr--sm").get_text()

        # Get description, rating and present keywords
        description, rating, keywords_present, title_keywords_present = self.rate_job(
            title, job_soup
        )

        return (
            title,
            company,
            job_url,
            description,
            rating,
            keywords_present,
            title_keywords_present,
        )

    def parallel_scrape(self, parameters, url, page_num):
        """Parallel scraping routine. Run through MapPool using Multiprocessing library"""
        
        # Get page
        current_page = requests.get(url, timeout=5)
        page_soup = BeautifulSoup(current_page.content, "html.parser")
        page_output = []

        # Parse every job in page
        for job in page_soup.select(".jobsearch-SerpJobCard"):

            (
                title,
                company,
                url,
                description,
                rating,
                keywords_present,
                title_keywords_present,
            ) = self.get_job_details(job)

            page_output.append(
                [
                    rating,
                    title,
                    company,
                    description,
                    url,
                    str(keywords_present),
                    str(title_keywords_present),
                    page_num,
                ]
            )

        return page_output

    def get_scrape(self):
        """Primary method for obtaining scraped jobs."""

        # Reset output and progress
        self.loading = True

        # Output list and frame
        output = []

        # Create pool of workers
        pool = Pool(min(self.pages, 5))

        # Dirty list comprehension to create argument list for pool workers
        pool_args = [
            (self.base_url + "&start=" + str(x * 10), x + 1)
            if (x != 0)
            else (self.base_url, x + 1)
            for x in range(0, self.pages)
        ]

        # Get output of pool workers
        output = pool.starmap(partial(self.parallel_scrape, self.parameters), pool_args)
        output = [x for sublist in output for x in sublist]

        # Create dataframe from list of jobs
        df_output_frame = (
            pd.DataFrame(
                output,
                columns=[
                    "Rating",
                    "Job Title",
                    "Company",
                    "Description",
                    "Job URL",
                    "Keywords Present",
                    "Title Keywords",
                    "Page Found",
                ],
            )
            .sort_values(by="Rating", ascending=False)
            .reset_index(drop=True)
        )

        # Sort df by rating
        df_output_frame["Rating"] = df_output_frame["Rating"].round(decimals=3)
        df_output_frame = df_output_frame.drop_duplicates(
            subset=["Rating", "Job Title", "Company"]
        )
        self.loading = False

        return df_output_frame

    # For outputting to csv locally
    def output_csv(self, df):
        df.to_csv("indeed_job_scraper.csv", index=False)

In [25]:
# ====== Class for scraping Indeed ====== #
# ======  Without multiprocessing  ====== #

class Scrape:
    def __init__(self, parameters):
        self.output_frame = None
        self.loading = False
        
        # Retain parameter dict
        self.parameters = parameters

        # Create base Indeed URL for all further scraping
        self.what = parameters["search_query"]
        self.where = parameters["location"]
        self.miles = parameters["miles"]
        self.base_url = f"https://www.indeed.com/jobs?q={self.what}&l={self.where}"
        
        # Set other attributes
        self.keywords = parameters["ordered_keywords"]
        self.title_keywords = parameters["title_keywords"]
        self.exclude_keywords = parameters["exclude_keywords"]
        self.pages = parameters["pages"]

        self.total_keywords = len(self.keywords) + len(self.title_keywords)

    def rate_job(self, j_title, j_soup):
        """Rate job based on input parameters."""

        description = j_soup.find(id="jobDescriptionText").get_text()

        # Reset the params
        keywords_present = []
        title_keywords_present = []
        rating = 0

        # Check for keyword, add value to rating depending on ranking
        for index, keyword in enumerate(self.keywords):
            if keyword in description:
                rating += len(self.keywords) - index
                keywords_present.append(keyword)

        # Check for title keywords
        for index, keyword in enumerate(self.title_keywords):
            if keyword in j_title:
                rating += self.total_keywords - index
                title_keywords_present.append(keyword)

        # Normalise rating
        rating = rating / sum(range(1, self.total_keywords + 1))

        # Check for excluded keywords
        for keyword in self.exclude_keywords:
            if keyword in j_title:
                rating = 0
                break

        return description, rating, keywords_present, title_keywords_present


    def get_job_details(self, job):
        """Obtain details of the job (company, title, description etc.)"""

        # Get link and title
        job_url = job.find(class_="title").a["href"]

        job_url = ( # Correct for truncated URLs
            "https://www.indeed.com" + job_url if (job_url.startswith("/")) else job_url
        )
        job_page = requests.get(job_url)
        job_soup = BeautifulSoup(job_page.content, "html.parser")

        # Give URL after redirect (ads/analytics etc.)
        job_url = job_page.url

        # Get job title and company name
        title = job.find(class_="title").a["title"]
        company = job_soup.find(class_="icl-u-lg-mr--sm").get_text()

        # Get description, rating and present keywords
        description, rating, keywords_present, title_keywords_present = self.rate_job(
            title, job_soup
        )

        return (
            title,
            company,
            job_url,
            description,
            rating,
            keywords_present,
            title_keywords_present,
        )

    def get_scrape(self):
        """Primary method for obtaining scraped jobs."""

        # Reset output and progress
        self.loading = True

        # Output list and frame
        output = []

        for x in range(0, self.pages):
            if x == 0:
                page_append = ""
            else:
                page_append = "&start=" + str(x * 10)

            # get page
            current_page = requests.get(self.base_url + page_append, timeout=5)
            page_soup = BeautifulSoup(current_page.content, "html.parser")

            for job in page_soup.select(".jobsearch-SerpJobCard"):
                (
                    title,
                    company,
                    url,
                    description,
                    rating,
                    keywords_present,
                    title_keywords_present,
                ) = self.get_job_details(job)
                output.append(
                    [
                        rating,
                        title,
                        company,
                        description,
                        url,
                        keywords_present,
                        title_keywords_present,
                        x + 1,
                    ]
                )

            print(f"Page {x+1} completed", end="\r")

        # Create dataframe from list of jobs
        df_output_frame = (
            pd.DataFrame(
                output,
                columns=[
                    "Rating",
                    "Job Title",
                    "Company",
                    "Description",
                    "Job URL",
                    "Keywords Present",
                    "Title Keywords",
                    "Page Found",
                ],
            )
            .sort_values(by="Rating", ascending=False)
            .reset_index(drop=True)
        )

        # Sort df by rating
        df_output_frame["Rating"] = df_output_frame["Rating"].round(decimals=3)
        df_output_frame = df_output_frame.drop_duplicates(
            subset=["Rating", "Job Title", "Company"]
        )
        self.loading = False

        return df_output_frame

    # For outputting to csv locally
    def output_csv(self, df):
        df.to_csv("indeed_job_scraper.csv", index=False)

## Time to let 'er rip!

In [26]:
# First, instantiate a scraper object with the parameters
scraper = Scrape(parameters)

In [27]:
# Then, use the `.get_scrape()` method to use the instance of the scraper
df1 = scraper.get_scrape()

Page 1 completed

In [33]:
print(df1.shape)
df1.head()

(16, 8)


Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,1.0,Data Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=790e2b4b32f9...,"[d, a, t, a]",[],1
1,1.0,Data Scientist,Aegis Premier Technologies,About Us We are a group of passionate technolo...,https://www.indeed.com/viewjob?cmp=Aegis-Premi...,"[d, a, t, a]",[],1
2,1.0,Data Warehouse Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=2246c6b11155...,"[d, a, t, a]",[],1
3,1.0,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, depl...",https://www.indeed.com/viewjob?jk=9c9be388d642...,"[d, a, t, a]",[],1
4,1.0,Data Scientist,Deloitte,Deloitte Services LP includes internal support...,https://www.indeed.com/viewjob?jk=1f37f96bae2b...,"[d, a, t, a]",[],1


In [36]:
# Save to csv
df1.to_csv("indeed_1pg_test.csv", index=False)

---

## The Full Monty (Python)

Now I'm going to really let 'er rip to get the full dataset. A little edit to show me what page is being processed.

In [51]:
# ====== Class for scraping Indeed ====== #
# ======  Without multiprocessing  ====== #

class Scrape:
    def __init__(self, parameters):
        self.output_frame = None
        self.loading = False
        
        # Retain parameter dict
        self.parameters = parameters

        # Create base Indeed URL for all further scraping
        self.what = parameters["search_query"]
        self.where = parameters["location"]
        self.miles = parameters["miles"]
        self.base_url = f"https://www.indeed.com/jobs?q={self.what}&l={self.where}"
        
        # Set other attributes
        self.keywords = parameters["ordered_keywords"]
        self.title_keywords = parameters["title_keywords"]
        self.exclude_keywords = parameters["exclude_keywords"]
        self.pages = parameters["pages"]

        self.total_keywords = len(self.keywords) + len(self.title_keywords)

    def rate_job(self, j_title, j_soup):
        """Rate job based on input parameters."""

        description = j_soup.find(id="jobDescriptionText").get_text()

        # Reset the params
        keywords_present = []
        title_keywords_present = []
        rating = 0

        # Check for keyword, add value to rating depending on ranking
        for index, keyword in enumerate(self.keywords):
            if keyword in description:
                rating += len(self.keywords) - index
                keywords_present.append(keyword)

        # Check for title keywords
        for index, keyword in enumerate(self.title_keywords):
            if keyword in j_title:
                rating += self.total_keywords - index
                title_keywords_present.append(keyword)

        # Normalise rating
        rating = rating / sum(range(1, self.total_keywords + 1))

        # Check for excluded keywords
        for keyword in self.exclude_keywords:
            if keyword in j_title:
                rating = 0
                break

        return description, rating, keywords_present, title_keywords_present


    def get_job_details(self, job):
        """Obtain details of the job (company, title, description etc.)"""

        # Get link and title
        job_url = job.find(class_="title").a["href"]

        job_url = ( # Correct for truncated URLs
            "https://www.indeed.com" + job_url if (job_url.startswith("/")) else job_url
        )
        job_page = requests.get(job_url)
        job_soup = BeautifulSoup(job_page.content, "html.parser")

        # Give URL after redirect (ads/analytics etc.)
        job_url = job_page.url

        # Get job title and company name
        try:
            title = job.find(class_="title").a["title"]
        except:
            title = "No title found"
        
        try:
            company = job_soup.find(class_="icl-u-lg-mr--sm").get_text()
        except:
            company = "No company found."
            

        # Get description, rating and present keywords
        description, rating, keywords_present, title_keywords_present = self.rate_job(
            title, job_soup
        )

        return (
            title,
            company,
            job_url,
            description,
            rating,
            keywords_present,
            title_keywords_present,
        )

    def get_scrape(self):
        """Primary method for obtaining scraped jobs."""

        # Reset output and progress
        self.loading = True

        # Output list and frame
        output = []

        for x in range(0, self.pages):
            print(f"Processing page {x + 1}...")
            
            if x == 0:
                page_append = ""
            else:
                page_append = "&start=" + str(x * 10)

            # get page
            current_page = requests.get(self.base_url + page_append, timeout=5)
            page_soup = BeautifulSoup(current_page.content, "html.parser")

            for job in page_soup.select(".jobsearch-SerpJobCard"):
                (
                    title,
                    company,
                    url,
                    description,
                    rating,
                    keywords_present,
                    title_keywords_present,
                ) = self.get_job_details(job)
                output.append(
                    [
                        rating,
                        title,
                        company,
                        description,
                        url,
                        keywords_present,
                        title_keywords_present,
                        x + 1,
                    ]
                )

            print(f"Page {x+1} completed", end="\r")

        # Create dataframe from list of jobs
        df_output_frame = (
            pd.DataFrame(
                output,
                columns=[
                    "Rating",
                    "Job Title",
                    "Company",
                    "Description",
                    "Job URL",
                    "Keywords Present",
                    "Title Keywords",
                    "Page Found",
                ],
            )
            .sort_values(by="Rating", ascending=False)
            .reset_index(drop=True)
        )

        # Sort df by rating
        df_output_frame["Rating"] = df_output_frame["Rating"].round(decimals=3)
        df_output_frame = df_output_frame.drop_duplicates(
            subset=["Rating", "Job Title", "Company"]
        )
        self.loading = False

        return df_output_frame

    # For outputting to csv locally
    def output_csv(self, df):
        df.to_csv("indeed_job_scraper.csv", index=False)

In [52]:
# ====== Scraping parameters ====== #
parameters2 = {
    "search_query": "data science",
    "location": "Denver, CO",
    "miles": 50,
    "ordered_keywords": ["data", "science"],
    "exclude_keywords": "",
    "title_keywords": "",
    "pages": 15,
}

In [53]:
# First, instantiate a scraper object with the parameters
scraper2 = Scrape(parameters2)

In [54]:
# Then, use the `.get_scrape()` method to use the instance of the scraper
df2 = scraper2.get_scrape()

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Page 15 completed

In [55]:
print(df2.shape)
df2.head()

(118, 8)


Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,1.0,Lead Geospatial Data Scientist,Cloud Agronomics,About Us:\nCloud Ag is an AgTech startup apply...,https://www.indeed.com/viewjob?jk=9fef1c79148f...,"[data, science]",[],14
1,1.0,Data Engineer,Valen Analytics,Join a high performing and rapidly growing tea...,https://www.indeed.com/viewjob?jk=1489facad7d0...,"[data, science]",[],6
2,1.0,US Contractor - Data Engineer (Big Data),GHX,We work to solve deep technical problems that ...,https://www.indeed.com/viewjob?jk=6f750b0761e3...,"[data, science]",[],15
3,1.0,Financial Analyst Intern,ViaSat,Bigger challenges. Bolder ideas. Global impact...,https://www.indeed.com/viewjob?jk=5f89afd9020e...,"[data, science]",[],12
4,1.0,Data Scientist - Behavioral Health,Denver Health,"Job Summary:\n\nUnder minimal supervision, sup...",https://www.indeed.com/viewjob?jk=508970a4e047...,"[data, science]",[],7


In [56]:
# Save to csv
df2.to_csv("19-12-03-indeed_jobs_pg_1-15.csv", index=False)

---

## The Full Monty (Python), Part 2

That gave me 118 records. I didn't want to accidentally overshoot the results and have to start back from page 1 after scraping all of that. So I'm going to set the next one to start scraping near where that one left off.

In [60]:
# ====== Class for scraping Indeed ====== #
# ======  Without multiprocessing  ====== #

# The update to this version of the class is to start it at page 16

class Scrape:
    def __init__(self, parameters):
        self.output_frame = None
        self.loading = False
        
        # Retain parameter dict
        self.parameters = parameters

        # Create base Indeed URL for all further scraping
        self.what = parameters["search_query"]
        self.where = parameters["location"]
        self.miles = parameters["miles"]
        self.base_url = f"https://www.indeed.com/jobs?q={self.what}&l={self.where}"
        
        # Set other attributes
        self.keywords = parameters["ordered_keywords"]
        self.title_keywords = parameters["title_keywords"]
        self.exclude_keywords = parameters["exclude_keywords"]
        self.pages = parameters["pages"]

        self.total_keywords = len(self.keywords) + len(self.title_keywords)

    def rate_job(self, j_title, j_soup):
        """Rate job based on input parameters."""

        description = j_soup.find(id="jobDescriptionText").get_text()

        # Reset the params
        keywords_present = []
        title_keywords_present = []
        rating = 0

        # Check for keyword, add value to rating depending on ranking
        for index, keyword in enumerate(self.keywords):
            if keyword in description:
                rating += len(self.keywords) - index
                keywords_present.append(keyword)

        # Check for title keywords
        for index, keyword in enumerate(self.title_keywords):
            if keyword in j_title:
                rating += self.total_keywords - index
                title_keywords_present.append(keyword)

        # Normalise rating
        rating = rating / sum(range(1, self.total_keywords + 1))

        # Check for excluded keywords
        for keyword in self.exclude_keywords:
            if keyword in j_title:
                rating = 0
                break

        return description, rating, keywords_present, title_keywords_present


    def get_job_details(self, job):
        """Obtain details of the job (company, title, description etc.)"""

        # Get link and title
        job_url = job.find(class_="title").a["href"]

        job_url = ( # Correct for truncated URLs
            "https://www.indeed.com" + job_url if (job_url.startswith("/")) else job_url
        )
        job_page = requests.get(job_url)
        job_soup = BeautifulSoup(job_page.content, "html.parser")

        # Give URL after redirect (ads/analytics etc.)
        job_url = job_page.url

        # Get job title and company name
        try:
            title = job.find(class_="title").a["title"]
        except:
            title = "No title found"
        
        try:
            company = job_soup.find(class_="icl-u-lg-mr--sm").get_text()
        except:
            company = "No company found."
            

        # Get description, rating and present keywords
        description, rating, keywords_present, title_keywords_present = self.rate_job(
            title, job_soup
        )

        return (
            title,
            company,
            job_url,
            description,
            rating,
            keywords_present,
            title_keywords_present,
        )

    def get_scrape(self):
        """Primary method for obtaining scraped jobs."""

        # Reset output and progress
        self.loading = True

        # Output list and frame
        output = []

        # This time, start at page 15
        for x in range(15, self.pages + 15):
            print(f"Processing page {x + 1}...")
            
            if x == 0:
                page_append = ""
            else:
                page_append = "&start=" + str(x * 10)

            # get page
            current_page = requests.get(self.base_url + page_append, timeout=5)
            page_soup = BeautifulSoup(current_page.content, "html.parser")

            for job in page_soup.select(".jobsearch-SerpJobCard"):
                (
                    title,
                    company,
                    url,
                    description,
                    rating,
                    keywords_present,
                    title_keywords_present,
                ) = self.get_job_details(job)
                output.append(
                    [
                        rating,
                        title,
                        company,
                        description,
                        url,
                        keywords_present,
                        title_keywords_present,
                        x + 1,
                    ]
                )

            print(f"Page {x+1} completed", end="\r")

        # Create dataframe from list of jobs
        df_output_frame = (
            pd.DataFrame(
                output,
                columns=[
                    "Rating",
                    "Job Title",
                    "Company",
                    "Description",
                    "Job URL",
                    "Keywords Present",
                    "Title Keywords",
                    "Page Found",
                ],
            )
            .sort_values(by="Rating", ascending=False)
            .reset_index(drop=True)
        )

        # Sort df by rating
        df_output_frame["Rating"] = df_output_frame["Rating"].round(decimals=3)
        df_output_frame = df_output_frame.drop_duplicates(
            subset=["Rating", "Job Title", "Company"]
        )
        self.loading = False

        return df_output_frame

    # For outputting to csv locally
    def output_csv(self, df):
        df.to_csv("indeed_job_scraper.csv", index=False)

In [61]:
# ====== Scraping parameters ====== #
parameters3 = {
    "search_query": "data science",
    "location": "Denver, CO",
    "miles": 50,
    "ordered_keywords": ["data", "science"],
    "exclude_keywords": "",
    "title_keywords": "",
    "pages": 10,
}

In [62]:
# First, instantiate a scraper object with the parameters
scraper3 = Scrape(parameters3)

In [63]:
# Then, use the `.get_scrape()` method to use the instance of the scraper
df3 = scraper3.get_scrape()

Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Page 25 completed

In [64]:
print(df3.shape)
df3.head()

(88, 8)


Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,1.0,Data Engineer,Valen Analytics,Join a high performing and rapidly growing tea...,https://www.indeed.com/viewjob?jk=1489facad7d0...,"[data, science]",[],24
1,1.0,Data Scientist,Aegis Premier Technologies,About Us We are a group of passionate technolo...,https://www.indeed.com/viewjob?cmp=Aegis-Premi...,"[data, science]",[],22
2,1.0,Data Engineer,Code42,WHAT YOU’LL BE DOING:\nCode42 is looking for a...,https://www.indeed.com/viewjob?jk=95e4b406998a...,"[data, science]",[],20
3,1.0,Data Scientist,Fluid Truck Share,Fluid is a peer-to-peer community truck sharin...,https://www.indeed.com/viewjob?jk=7c409ecd5a55...,"[data, science]",[],22
4,1.0,"Comcast Busines Analyst 3, Enterprise Data & A...",Comcast,Comcast Business offers technology solutions r...,https://www.indeed.com/viewjob?jk=b4048e3b393a...,"[data, science]",[],20


In [67]:
# For comparison
df2.head()

Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,1.0,Lead Geospatial Data Scientist,Cloud Agronomics,About Us:\nCloud Ag is an AgTech startup apply...,https://www.indeed.com/viewjob?jk=9fef1c79148f...,"[data, science]",[],14
1,1.0,Data Engineer,Valen Analytics,Join a high performing and rapidly growing tea...,https://www.indeed.com/viewjob?jk=1489facad7d0...,"[data, science]",[],6
2,1.0,US Contractor - Data Engineer (Big Data),GHX,We work to solve deep technical problems that ...,https://www.indeed.com/viewjob?jk=6f750b0761e3...,"[data, science]",[],15
3,1.0,Financial Analyst Intern,ViaSat,Bigger challenges. Bolder ideas. Global impact...,https://www.indeed.com/viewjob?jk=5f89afd9020e...,"[data, science]",[],12
4,1.0,Data Scientist - Behavioral Health,Denver Health,"Job Summary:\n\nUnder minimal supervision, sup...",https://www.indeed.com/viewjob?jk=508970a4e047...,"[data, science]",[],7


In [65]:
# Save to csv
df3.to_csv("19-12-03-indeed_jobs_pg_16-25.csv", index=False)

---

## Concatenation

It looks like there were some repeats. We shall see how many once I concatenate the dataframes and remove the duplicates.

In [68]:
# Might as well concatenate all three, in case they all have some uniques
# Put dfs into list and run concat on the list
frames = [df1, df2, df3]

df4 = pd.concat(frames)

In [69]:
# Look at the results
print(df4.shape)
df4.head()

(222, 8)


Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,1.0,Data Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=790e2b4b32f9...,"[d, a, t, a]",[],1
1,1.0,Data Scientist,Aegis Premier Technologies,About Us We are a group of passionate technolo...,https://www.indeed.com/viewjob?cmp=Aegis-Premi...,"[d, a, t, a]",[],1
2,1.0,Data Warehouse Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=2246c6b11155...,"[d, a, t, a]",[],1
3,1.0,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, depl...",https://www.indeed.com/viewjob?jk=9c9be388d642...,"[d, a, t, a]",[],1
4,1.0,Data Scientist,Deloitte,Deloitte Services LP includes internal support...,https://www.indeed.com/viewjob?jk=1f37f96bae2b...,"[d, a, t, a]",[],1


In [72]:
# Clean it up using pyjanitor
import janitor
df5 = (df4
       .clean_names()  # Fixes capitalization, whitespace, extra characters
       .remove_columns(column_names=["rating", "title_keywords"])  # Remove useless columns
       # I'll leave "keywords_present in for now"
      )

df5.head()

Unnamed: 0,job_title,company,description,job_url,keywords_present,page_found
0,Data Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=790e2b4b32f9...,"[d, a, t, a]",1
1,Data Scientist,Aegis Premier Technologies,About Us We are a group of passionate technolo...,https://www.indeed.com/viewjob?cmp=Aegis-Premi...,"[d, a, t, a]",1
2,Data Warehouse Engineer,Seen by Indeed,Seen by Indeed is a free service that connects...,https://www.indeed.com/viewjob?jk=2246c6b11155...,"[d, a, t, a]",1
3,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, depl...",https://www.indeed.com/viewjob?jk=9c9be388d642...,"[d, a, t, a]",1
4,Data Scientist,Deloitte,Deloitte Services LP includes internal support...,https://www.indeed.com/viewjob?jk=1f37f96bae2b...,"[d, a, t, a]",1


In [73]:
# Now let's find out how many duplicates there are
# If they are duplicates, the Description should be exactly the same
df6 = df5.drop_duplicates(subset=["description"])
df6.shape

(177, 6)

In [75]:
# Looks like there were 45 duplicates
df5.shape[0] - df6.shape[0]

45

In [77]:
# Save it again
df6.to_csv("19-12-03-indeed_jobs_pg_1-25.csv", index=False)

---

### "Seen by Indeed"

I noticed there are some where the company name didn't get scraped correctly, and the record says "Seen by Indeed".

Before I do anything about those, I just want to see how many of these records exist.

In [78]:
df6.select_dtypes(exclude="number").describe().T.sort_values(by="unique")

Unnamed: 0,count,unique,top,freq
keywords_present,177,5,"[data, science]",88
company,177,113,No company found.,9
job_title,177,123,Data Scientist,26
description,177,177,CLEARANCE IS REQUIRED!\n\nRelocation is not a ...,1
job_url,177,177,https://www.indeed.com/viewjob?jk=a4abb701f0d9...,1


In [79]:
df6["company"].value_counts()

No company found.                       9
Deloitte                                6
CenturyLink                             5
Booz Allen Hamilton                     5
xentity corporation                     4
LOCKHEED MARTIN CORPORATION             4
Ball Aerospace                          3
Aegon                                   3
Pearson                                 3
University of Colorado Boulder          3
CBRE                                    3
Comcast                                 3
University of Colorado                  2
Decentrix                               2
Fluid Truck Share                       2
ViaSat                                  2
Transamerica                            2
CitiusTech                              2
JumpCloud                               2
Dataiku                                 2
Cloud Agronomics                        2
Recurly                                 2
DISH                                    2
Horizontal                        

In [81]:
# Actually only 1 of them!
df6["company"].value_counts()["Seen by Indeed"]

1

> That is actually much better than I thought.

There are only 9 companies for which the name wasn't scraped correctly, and it looks like 1 that is "Seen by Indeed".

The incorrectly scraped names can be dropped, then the dataset will be ready to go for NLP!

In [123]:
df7 = df6[df6["company"] != "Seen by Indeed"]
df7.shape

(176, 6)

In [125]:
df7.dtypes

job_title            object
company              object
description          object
job_url              object
keywords_present    float64
page_found            int64
dtype: object

In [126]:
# Strip whitespace from all of the columns data
for col in df7.columns:
    if df7[col].dtype == "object":
        df7[col] = df7[col].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [127]:
pd.options.display.max_colwidth = 200
df6[df6["company"] == "No company found."]

Unnamed: 0,job_title,company,description,job_url,keywords_present,page_found
28,Associate Research Science Analyst,No company found.,"About GutCheck\n--------------\n\nAt GutCheck, we pioneered agile market research to provide our clients with actionable answers and insights, globally, at the speed of their business. We believe ...",https://www.indeed.com/viewjob?jk=9fcf95972fa1477c&from=serp&vjs=3,,6
83,Data Engineer (Mid and Senior),No company found.,"Data Engineer, Mid to Senior Level\nDatalere team members lead by example, focus on customer needs and have a thirst to learn all they can about data analytics. Successful candidates are self-star...",https://www.indeed.com/viewjob?jk=ab2f7b23069fe431&from=serp&vjs=3,,12
122,Data Scientist for Materials Science,No company found.,"Posting Title\nData Scientist for Materials Science\n.\nLocation\nCO - Golden\n.\nPosition Type\nLimited Term (Fixed Term)\n.\nHours Per Week\n40\n.\nJob Description\nThe Data, Analysis and Visual...",https://www.indeed.com/viewjob?jk=d827816a6c40638f&from=serp&vjs=3,,3
135,Analyst,No company found.,"Reach Your Peak at Vail Resorts. You're someone who pushes boundaries and challenges the status quo. You're brave, ambitious and passionate in everything you do. And we want you on our team. Pursu...",https://www.indeed.com/viewjob?jk=6d05ab5d579549e2&from=serp&vjs=3,,5
156,Data Scientist,No company found.,"(6605)\n\nData scientists use data and analytical ability to find and interpret rich data sources; manage large amounts of data despite hardware, software, and bandwidth constraints; merge data so...",https://www.indeed.com/viewjob?jk=6543e0fe2cc49c33&tk=1dr7il64533g4001&from=serp&vjs=3&advn=2509720981551603&adid=325531189&sjdu=i6xVERweJM_pVUvgf-Mzud2KHack-wE2Lb7KQz0L0LaB6A3rKzwVBeWWBkx9kBdQ,,12
169,Python Data Engineer,No company found.,"As a data engineer, your general responsibilities will include bolting together the core components of our platform that allow us to interact with utility company data stores, helping build system...",https://www.indeed.com/viewjob?jk=7d521d3b4c8d8915&from=serp&vjs=3,,13
55,Senior Data Engineer,No company found.,"Company Overview\n\nFanatics is the global leader in licensed sports merchandise and changing the way fans purchase their favorite team apparel and jerseys. Through an innovative, tech-infused app...",https://www.indeed.com/viewjob?jk=14d283ce293fc6eb&from=serp&vjs=3,,17
79,Data Scientist Snr Manager,No company found.,Data Scientist Snr Manager-19001924\n\n\nPreferred Qualifications\n\nJob Description:\n\nThe Oracle Data Cloud is on the cutting edge in development of measurement and targeting methodologies for ...,https://www.indeed.com/viewjob?jk=0dc435180e844868&from=serp&vjs=3,,18
108,Data Intelligence/Analytics Engineer,No company found.,"Job Title: Data Intelligence / Analytics Engineer Company Summary Logical Systems (LSI) LLC, is an established multi-disciplinary engineering company founded in Memphis, Tennessee in 1985. Today, ...",https://www.indeed.com/viewjob?cmp=Logical-Systems-Incorporated&t=Data+Intelligence+Analytic+Engineer&jk=8a02e5b9dbc466ef&vjs=3,,16


In [131]:
df7.iloc[28]["company"]

'No company found.'

In [132]:
# Replace the missing values by hand
df7.at[28, "company"] = "GutCheck"

In [133]:
df7.iloc[28]["company"]

'GutCheck'

In [155]:
def replace_value(df, index: int, column: str, value: str):
    """Replaces the value at index, column with value."""
    print(df.iloc[index][column])  # Before
    df.at[index, column] = value
    print(df.iloc[index][column])  # After

In [148]:
# The index numbers are off, so resetting them
df8 = df7.reset_index()
df8.head()

Unnamed: 0,index,job_title,company,description,job_url,keywords_present,page_found
0,1,Data Scientist,Aegis Premier Technologies,"About Us We are a group of passionate technology professionals who are committed to delivering best-in-class data, data science and software solutions for the non-profit space. We care deeply abou...",https://www.indeed.com/viewjob?cmp=Aegis-Premier-Technologies&t=Data+Scientist&jk=d8891916469e042c&sjdu=QwrRXKrqZ3CNX5W-O9jEveJgZ7DRbh_ySwPONsqRa9ZW4N-0leXNJ1nUuiJ-kR7vVSLM815TvZxPISFQtteDJw&adid=...,,1
1,3,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, deploy and maintain machine learning techniques for Merkle data products. The products include identify, feature generation, propensity models, look-alike...",https://www.indeed.com/viewjob?jk=9c9be388d6427692&from=serp&vjs=3,,1
2,4,Data Scientist,Deloitte,"Deloitte Services LP includes internal support areas such as Sales Excellence, Marketing and Communications, Human Resources/Talent, Information Technology, Facilities Management, and Financial Su...",https://www.indeed.com/viewjob?jk=1f37f96bae2be3f8&from=serp&vjs=3,,1
3,5,Data Scientist|| Machine Learning,Expedite LLC,"Job SummaryJob Title: Data Scientist|| Machine learningWork Location: Bloomfield, CODuration: 12+monthsMust have skills (1-5):1. Understanding of Statistics2. Statistical Programming3. Statistical...",https://www.indeed.com/viewjob?cmp=Expedite-LLC&t=Data+Scientist+Machine+Learning&jk=8c6aa71b57967d9a&vjs=3,,1
4,6,Data Scientist,Honey,"Honey is a fast-growing startup based in Los Angeles. Our online shopping platform offers users a smarter way to shop. Through a simple browser extension, we open up instant access to exclusive sa...",https://www.indeed.com/viewjob?jk=bad61334b2d1c88a&from=serp&vjs=3,,1


In [149]:
# Look again for the correct indices
df8[df8["company"] == "No company found."]

Unnamed: 0,index,job_title,company,description,job_url,keywords_present,page_found
54,83,Data Engineer (Mid and Senior),No company found.,"Data Engineer, Mid to Senior Level\nDatalere team members lead by example, focus on customer needs and have a thirst to learn all they can about data analytics. Successful candidates are self-star...",https://www.indeed.com/viewjob?jk=ab2f7b23069fe431&from=serp&vjs=3,,12
64,122,Data Scientist for Materials Science,No company found.,"Posting Title\nData Scientist for Materials Science\n.\nLocation\nCO - Golden\n.\nPosition Type\nLimited Term (Fixed Term)\n.\nHours Per Week\n40\n.\nJob Description\nThe Data, Analysis and Visual...",https://www.indeed.com/viewjob?jk=d827816a6c40638f&from=serp&vjs=3,,3
68,135,Analyst,No company found.,"Reach Your Peak at Vail Resorts. You're someone who pushes boundaries and challenges the status quo. You're brave, ambitious and passionate in everything you do. And we want you on our team. Pursu...",https://www.indeed.com/viewjob?jk=6d05ab5d579549e2&from=serp&vjs=3,,5
77,156,Data Scientist,No company found.,"(6605)\n\nData scientists use data and analytical ability to find and interpret rich data sources; manage large amounts of data despite hardware, software, and bandwidth constraints; merge data so...",https://www.indeed.com/viewjob?jk=6543e0fe2cc49c33&tk=1dr7il64533g4001&from=serp&vjs=3&advn=2509720981551603&adid=325531189&sjdu=i6xVERweJM_pVUvgf-Mzud2KHack-wE2Lb7KQz0L0LaB6A3rKzwVBeWWBkx9kBdQ,,12
85,169,Python Data Engineer,No company found.,"As a data engineer, your general responsibilities will include bolting together the core components of our platform that allow us to interact with utility company data stores, helping build system...",https://www.indeed.com/viewjob?jk=7d521d3b4c8d8915&from=serp&vjs=3,,13
135,55,Senior Data Engineer,No company found.,"Company Overview\n\nFanatics is the global leader in licensed sports merchandise and changing the way fans purchase their favorite team apparel and jerseys. Through an innovative, tech-infused app...",https://www.indeed.com/viewjob?jk=14d283ce293fc6eb&from=serp&vjs=3,,17
139,79,Data Scientist Snr Manager,No company found.,Data Scientist Snr Manager-19001924\n\n\nPreferred Qualifications\n\nJob Description:\n\nThe Oracle Data Cloud is on the cutting edge in development of measurement and targeting methodologies for ...,https://www.indeed.com/viewjob?jk=0dc435180e844868&from=serp&vjs=3,,18
155,108,Data Intelligence/Analytics Engineer,No company found.,"Job Title: Data Intelligence / Analytics Engineer Company Summary Logical Systems (LSI) LLC, is an established multi-disciplinary engineering company founded in Memphis, Tennessee in 1985. Today, ...",https://www.indeed.com/viewjob?cmp=Logical-Systems-Incorporated&t=Data+Intelligence+Analytic+Engineer&jk=8a02e5b9dbc466ef&vjs=3,,16


In [156]:
# Datalere
df8.iloc[85]["company"]

'No company found.'

In [157]:
replacements = {
    54: "Datalere",
    64: "National Renewable Energy Laboratory",
    68: "Vail Resorts",
    77: "Horizontal",
    85: "Horizontal",
    135: "Fanatics Inc",
    139: "Oracle",
    155: "Logical Systems Incorporated",
}

for co in replacements:
    replace_value(df8, co, "company", replacements[co])

No company found.
Datalere
No company found.
National Renewable Energy Laboratory
No company found.
Vail Resorts
No company found.
Horizontal
No company found.
Horizontal
No company found.
Fanatics Inc
No company found.
Oracle
No company found.
Logical Systems Incorporated


In [159]:
# Confirm it worked as expected
df8[df8["company"] == "No company found."]

Unnamed: 0,index,job_title,company,description,job_url,keywords_present,page_found


In [161]:
pd.options.display.max_rows = 200
df8["company"].value_counts()

Deloitte                                     6
Booz Allen Hamilton                          5
CenturyLink                                  5
xentity corporation                          4
Horizontal                                   4
LOCKHEED MARTIN CORPORATION                  4
Aegon                                        3
Vail Resorts                                 3
Ball Aerospace                               3
Comcast                                      3
CBRE                                         3
Pearson                                      3
University of Colorado Boulder               3
National Renewable Energy Laboratory         3
CitiusTech                                   2
Oracle                                       2
University of Colorado                       2
DISH                                         2
JumpCloud                                    2
Dataiku                                      2
Fluid Truck Share                            2
Cloud Agronom

In [162]:
df8.head()

Unnamed: 0,index,job_title,company,description,job_url,keywords_present,page_found
0,1,Data Scientist,Aegis Premier Technologies,"About Us We are a group of passionate technology professionals who are committed to delivering best-in-class data, data science and software solutions for the non-profit space. We care deeply abou...",https://www.indeed.com/viewjob?cmp=Aegis-Premier-Technologies&t=Data+Scientist&jk=d8891916469e042c&sjdu=QwrRXKrqZ3CNX5W-O9jEveJgZ7DRbh_ySwPONsqRa9ZW4N-0leXNJ1nUuiJ-kR7vVSLM815TvZxPISFQtteDJw&adid=...,,1
1,3,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, deploy and maintain machine learning techniques for Merkle data products. The products include identify, feature generation, propensity models, look-alike...",https://www.indeed.com/viewjob?jk=9c9be388d6427692&from=serp&vjs=3,,1
2,4,Data Scientist,Deloitte,"Deloitte Services LP includes internal support areas such as Sales Excellence, Marketing and Communications, Human Resources/Talent, Information Technology, Facilities Management, and Financial Su...",https://www.indeed.com/viewjob?jk=1f37f96bae2be3f8&from=serp&vjs=3,,1
3,5,Data Scientist|| Machine Learning,Expedite LLC,"Job SummaryJob Title: Data Scientist|| Machine learningWork Location: Bloomfield, CODuration: 12+monthsMust have skills (1-5):1. Understanding of Statistics2. Statistical Programming3. Statistical...",https://www.indeed.com/viewjob?cmp=Expedite-LLC&t=Data+Scientist+Machine+Learning&jk=8c6aa71b57967d9a&vjs=3,,1
4,6,Data Scientist,Honey,"Honey is a fast-growing startup based in Los Angeles. Our online shopping platform offers users a smarter way to shop. Through a simple browser extension, we open up instant access to exclusive sa...",https://www.indeed.com/viewjob?jk=bad61334b2d1c88a&from=serp&vjs=3,,1


In [163]:
df8 = df8.drop(columns=["index"])
df8.head()

Unnamed: 0,job_title,company,description,job_url,keywords_present,page_found
0,Data Scientist,Aegis Premier Technologies,"About Us We are a group of passionate technology professionals who are committed to delivering best-in-class data, data science and software solutions for the non-profit space. We care deeply abou...",https://www.indeed.com/viewjob?cmp=Aegis-Premier-Technologies&t=Data+Scientist&jk=d8891916469e042c&sjdu=QwrRXKrqZ3CNX5W-O9jEveJgZ7DRbh_ySwPONsqRa9ZW4N-0leXNJ1nUuiJ-kR7vVSLM815TvZxPISFQtteDJw&adid=...,,1
1,Data Scientist,Merkle Inc.,"Job Description\n\nDesign, develop, test, deploy and maintain machine learning techniques for Merkle data products. The products include identify, feature generation, propensity models, look-alike...",https://www.indeed.com/viewjob?jk=9c9be388d6427692&from=serp&vjs=3,,1
2,Data Scientist,Deloitte,"Deloitte Services LP includes internal support areas such as Sales Excellence, Marketing and Communications, Human Resources/Talent, Information Technology, Facilities Management, and Financial Su...",https://www.indeed.com/viewjob?jk=1f37f96bae2be3f8&from=serp&vjs=3,,1
3,Data Scientist|| Machine Learning,Expedite LLC,"Job SummaryJob Title: Data Scientist|| Machine learningWork Location: Bloomfield, CODuration: 12+monthsMust have skills (1-5):1. Understanding of Statistics2. Statistical Programming3. Statistical...",https://www.indeed.com/viewjob?cmp=Expedite-LLC&t=Data+Scientist+Machine+Learning&jk=8c6aa71b57967d9a&vjs=3,,1
4,Data Scientist,Honey,"Honey is a fast-growing startup based in Los Angeles. Our online shopping platform offers users a smarter way to shop. Through a simple browser extension, we open up instant access to exclusive sa...",https://www.indeed.com/viewjob?jk=bad61334b2d1c88a&from=serp&vjs=3,,1


In [165]:
df8.shape

(176, 6)

---

## The Final Export

Now the dataset is all ready to go. Last export for this notebook.

The final shape is `(176, 6)`.

Not too shabby!

In [164]:
# Final csv before NLP
df8.to_csv("19-12-03-indeed_jobs.csv", index=False)