## resources
1. https://dev.to/hhsm95/using-user-agent-to-scraping-data-lli
2. [How to Web Scrape Indeed with Python](https://www.youtube.com/watch?v=PPcgtx0sI2E) by John Watson Rooney
3. [Comprehensive Python Beautiful Soup Web Scraping Tutorial!](https://www.youtube.com/watch?v=GjKQ6V_ViQE&t=2205s) by Keith Galli
4. [How to scrape JOB posts from INDEED with PYTHON](https://www.youtube.com/watch?v=eN_3d4JrL_w&lc=Ugw9P4LYvEssGrIcNf94AaABAg.9FOng9tpc_Q9FOtU0NVkpR) by Izzy Analytics

## note to self
1. status_code == 200
    - an HTTP status code meaning "OK": the server has successfully answered our HTTP request.
    - more info: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
2. [f-strings in python](https://www.geeksforgeeks.org/formatted-string-literals-f-strings-python/)
3. So many duplicated job posts on indeed.com... soooooooo many
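4. card.find/find_all('span', 'some-class'): a string passed as the second positional argument is matched against the CSS class by default, so it is equivalent to writing `class_='some-class'`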

In [87]:
from datetime import datetime #to get the current date
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import numpy as np

In [98]:
def extract(job_title, location):
    """Build the Indeed search URL for a job title and location.

    Both values are URL-encoded with ``quote_plus``: spaces still become
    ``+``, and reserved characters such as ``+``, ``&`` or ``,`` are
    percent-escaped so they cannot corrupt the query string (a literal
    ``+`` in "C++" would otherwise be read back as a space).

    Args:
        job_title: Free-text search query, e.g. ``'data scientist'``.
        location: Free-text location, e.g. ``'united states'``.

    Returns:
        The search URL as a string, ending in ``&filter=0``.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    inquiry = 'https://www.indeed.com/jobs?q={}&l={}&filter=0'
    return inquiry.format(quote_plus(job_title), quote_plus(location))

In [99]:
def transform(card):
    """Flatten one search-result card into a dict of job fields.

    ``card`` must expose ``.h2.a`` and ``.find(...)``.  The rating and salary
    spans are optional and yield ``''`` when absent; the lookups for the
    other fields are not guarded.

    Returns:
        A dict with the keys ``title``, ``company``, ``rating``,
        ``location``, ``salary``, ``summary``, ``post_date``,
        ``record obtained`` (today's date as ``YYYY-MM-DD``) and ``job_url``.
    """
    anchor = card.h2.a
    title = anchor.get('title')
    # The href is appended directly to the site root.
    job_link = 'https://www.indeed.com' + anchor.get('href')
    company = card.find('span', class_='company').text.strip()

    # Optional field: not every card has a ratings span.
    rating_tag = card.find('span', class_='ratingsContent')
    rating = rating_tag.text.strip() if rating_tag is not None else ''

    location = card.find('div', class_='recJobLoc').get('data-rc-loc')

    # Optional field: not every card has a salary span.
    salary_tag = card.find('span', 'salaryText')
    salary = salary_tag.text.strip() if salary_tag is not None else ''

    summary = card.find('div', 'summary').text.strip()
    post_date = card.find('span', 'date').text
    today = f'{datetime.today():%Y-%m-%d}'

    return {
        'title': title,
        'company': company,
        'rating': rating,
        'location': location,
        'salary': salary,
        'summary': summary,
        'post_date': post_date,
        'record obtained': today,
        'job_url': job_link,
    }

In [100]:
def get_jobs(job_title, location):
    """Scrape every result page of a search and save the jobs to a CSV file.

    Starts from the URL returned by ``extract(job_title, location)``, turns
    each ``jobsearch-SerpJobCard`` div into a dict with ``transform``, and
    follows the ``aria-label="Next"`` link until a page has none.  The rows
    are written to ``<job_title>_jobs_indeed.csv``.

    Args:
        job_title: Search query; also used as the CSV file-name prefix.
        location: Search location.

    Returns:
        None.  Progress is printed and the result is written to disk.
    """
    # Loop-invariant, so built once instead of on every page.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    delays = [7, 4, 6, 2, 10, 19]

    joblist = []
    url = extract(job_title, location)

    while True:
        delay = np.random.choice(delays)
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, which would append the
        # User-Agent to the query string instead of sending it as a header.
        r = requests.get(url, headers=headers)
        # Random pause between page requests.
        time.sleep(delay)
        soup = bs(r.content, 'html.parser')
        cards = soup.find_all('div', class_='jobsearch-SerpJobCard')
        for index, card in enumerate(cards):
            joblist.append(transform(card))
            print('moving along', index)

        # No "Next" link means this was the last page.
        next_link = soup.find('a', {'aria-label': 'Next'})
        if next_link is None:
            break
        url = 'https://www.indeed.com' + next_link.get('href')

    data = pd.DataFrame(joblist)
    data.to_csv(job_title + '_jobs_indeed.csv')
    print('JOB FINISHED!')

In [101]:
# Run the scraper; get_jobs writes the result to 'bioinformatics_jobs_indeed.csv'.
get_jobs('bioinformatics', 'united states')

moving along 0
moving along 1
moving along 2
moving along 3
moving along 4
moving along 5
moving along 6
moving along 7
moving along 8
moving along 9
moving along 10
moving along 11
moving along 12
moving along 13
moving along 14
moving along 15
moving along 16
moving along 17
moving along 0
moving along 1
moving along 2
moving along 3
moving along 4
moving along 5
moving along 6
moving along 7
moving along 8
moving along 9
moving along 10
moving along 11
moving along 12
moving along 13
moving along 14
moving along 0
moving along 1
moving along 2
moving along 3
moving along 4
moving along 5
moving along 6
moving along 7
moving along 8
moving along 9
moving along 10
moving along 11
moving along 12
moving along 13
moving along 14
moving along 15
moving along 16
moving along 17
moving along 0
moving along 1
moving along 2
moving along 3
moving along 4
moving along 5
moving along 6
moving along 7
moving along 8
moving along 9
moving along 10
moving along 11
moving along 12
moving along 13


In [None]:
def transform(soup):
    """Parse every job card on a results page into a list of dicts.

    Args:
        soup: Parsed page exposing ``find_all``; each
            ``jobsearch-SerpJobCard`` div found on it is treated as one job.

    Returns:
        A list with one dict per card (keys: ``title``, ``company``,
        ``rating``, ``location``, ``salary``, ``summary``, ``post_date``,
        ``record obtained``, ``job_url``).  Empty when the page has no cards.
    """
    joblist = []
    # Same for every card on the page, so computed once.
    today = datetime.today().strftime('%Y-%m-%d')

    for card in soup.find_all('div', class_='jobsearch-SerpJobCard'):
        atag = card.h2.a
        title = atag.get('title')
        job_link = 'https://www.indeed.com' + atag.get('href')
        company = card.find('span', class_='company').text.strip()
        # Rating and salary are optional on a card; default to ''.
        try:
            rating = card.find('span', class_='ratingsContent').text.strip()
        except AttributeError:
            rating = ''
        location = card.find('div', class_='recJobLoc').get('data-rc-loc')
        try:
            salary = card.find('span', 'salaryText').text.strip()
        except AttributeError:
            salary = ''
        summary = card.find('div', 'summary').text.strip()
        post_date = card.find('span', 'date').text

        joblist.append({
            'title': title,
            'company': company,
            'rating': rating,
            'location': location,
            'salary': salary,
            'summary': summary,
            'post_date': post_date,
            'record obtained': today,
            'job_url': job_link,
        })

    # The original ended in a bare ``return``, discarding everything parsed.
    return joblist

In [318]:
def extract(page):
    """Download one Indeed results page for "data scientist" jobs and parse it.

    Args:
        page: Value placed in the ``start`` query parameter of the search URL.

    Returns:
        The response body parsed with BeautifulSoup's ``html.parser``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=data%20scientist&l=United%20States&start={page}'
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would append the User-Agent to
    # the query string instead of sending it as a request header.
    r = requests.get(url, headers=headers)
    # Debug checkpoints: inspect r.status_code or soup.prettify() here.
    soup = bs(r.content, 'html.parser')
    return soup
    
def transform(soup):
    """Parse each job card on a results page and append it to ``joblist``.

    Relies on a module-level list named ``joblist`` already existing; one
    dict (``title``, ``company``, ``rating``, ``location``, ``salary``,
    ``summary``) is appended to it per ``jobsearch-SerpJobCard`` div.

    Args:
        soup: Parsed results page exposing ``find_all``.

    Returns:
        None; the results are delivered through the ``joblist`` side effect.
    """
    location_class = 'location accessible-contrast-color-location'
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_="company").text.strip()

        # Only a missing element (``find`` returned None) is an expected
        # failure; a bare ``except`` would also swallow KeyboardInterrupt
        # and genuine bugs.
        try:
            rating = item.find('span', class_='ratingsContent').text.strip()
        except AttributeError:
            rating = ""

        # The location is looked up as a <span> first, then as a <div>.
        try:
            location = item.find('span', class_=location_class).text.strip()
        except AttributeError:
            location = item.find('div', class_=location_class).text.strip()

        summary = item.find('div', class_='summary').text.strip()

        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except AttributeError:
            salary = ''

        job = {
            'title': title,
            'company': company,
            'rating': rating,
            'location': location,
            'salary': salary,
            'summary': summary
        }
        joblist.append(job)
    return

In [None]:
# Search URL for "data scientist" in the United States without a ``start``
# offset.  Plain string: there are no placeholders, so no f-prefix is needed.
url2 = 'https://www.indeed.com/jobs?q=data%20scientist&l=United%20States'

In [954]:
# Module-level accumulator; transform() appends one dict per job card to it.
joblist = []

# Walk the values 5200, 5210, ..., 5290; each one is passed to extract(),
# which places it in the ``start`` query parameter of the search URL.
for i in range(5200,5300,10):
    print(f'Getting information from page, {i}')
    c = extract(i)  # parsed results page
    transform(c)  # appends this page's jobs to ``joblist``

Getting information from page, 5200
Getting information from page, 5210
Getting information from page, 5220
Getting information from page, 5230
Getting information from page, 5240
Getting information from page, 5250
Getting information from page, 5260
Getting information from page, 5270
Getting information from page, 5280
Getting information from page, 5290


In [955]:
# Turn the rows collected in ``joblist`` into a DataFrame.
# NOTE(review): df ... df102 are not defined in the visible cells — presumably
# they came from earlier runs of this cell with the name edited by hand; confirm.
df103 = pd.DataFrame(joblist)

In [956]:
# Stack all per-batch frames (df, df1 ... df103) into one table with a fresh
# 0..n-1 index (ignore_index=True).
# NOTE(review): only df103 is created in the visible cells; the other frames
# must already exist in the notebook session for this to run.
table = pd.concat([df, df1, df2, df3, df4, df5,
                  df6, df7, df8, df9, df10, df11,
                  df12, df13, df14, df15, df16,
                  df17, df18, df19, df20, df21,
                  df22, df23, df24, df25, df26,
                  df27, df28, df29, df30, df31,
                  df32, df33, df34, df35, df36,
                  df37, df38, df39, df40, df41,
                  df42, df43, df44, df45, df46,
                  df47, df48, df49, df50, df51,
                  df52, df53, df54, df55, df56,
                  df57, df58, df59, df60, df61,
                  df62, df63, df64, df65, df66,
                  df67, df68, df69, df70, df71,
                  df72, df73, df74, df75, df76,
                  df77, df78, df79, df80, df81,
                  df82, df83, df84, df85, df86,
                  df87, df88, df89, df90, df91,
                  df92, df93, df94, df95, df96,
                  df97, df98, df99, df100, df101,
                  df102, df103], ignore_index=True)

In [957]:
# Save the combined table as-is (duplicates included); with no ``index``
# argument, pandas also writes the row index as the first column.
table.to_csv('allJobs.csv')

In [958]:
# Display the last five rows of the combined table.
table.tail(5)

Unnamed: 0,title,company,rating,location,salary,summary
8087,Data Scientist,Money Mart Financial Services,2.4,"Malvern, PA 19355",,"Minimum 5 years of experience using SQL, R, Py..."
8088,Data Scientist,Randstad Professionals,3.7,"Cockeysville, MD 21030","$120,000 - $145,000 a year",At least 3 years of experience with data migra...
8089,Data Scientist Intern,Henkel,3.9,"Irvine, CA 92614 (Business District area)",,Data scientist intern position starting in Spr...
8090,Senior Data Scientist - Permanent Full Time,Veritas Partners,,"Cockeysville, MD 21030","$131,000 - $150,000 a year",Extract meaningful statistics and other data c...
8091,Data Scientist II - Permanent Full Time Opport...,Veritas Partners,,"Cockeysville, MD 21030","$100,000 - $135,000 a year","3+ years of data taxonomy, BI, data migration ..."


In [959]:
# (rows, columns) of the combined table, before de-duplication.
table.shape

(8092, 6)

In [960]:
# Drop rows that are identical across every column, then show the new shape.
table_uniq = table.drop_duplicates()
table_uniq.shape

(1053, 6)

In [966]:
# Save the de-duplicated table; index=False leaves the row index out of the file.
table_uniq.to_csv('jobs_dataScience.csv', index=False)

In [963]:
# Display the first rows of the de-duplicated table.
table_uniq.head()

Unnamed: 0,title,company,rating,location,salary,summary
0,,Leidos,3.7,"Pittsburgh, PA 15236",,Work individually and on local and geographica...
1,Data Scientist - Fully Remote,DataSource.ai,,"San Francisco, CA","$113,114 - $221,749 a year","As part of our team, you will leverage your de..."
2,Jr. Data Scientist,Dash Technologies Inc,,"Kansas City, KS","$76,029 - $105,000 a year",Work with developers to design algorithms and ...
3,Data Scientist,Epace Technology,,United States,,"Understanding data, data problem statements, d..."
4,Data Scientist Consultant,Zora Digital,3.7,Remote,Up to $75 an hour,Work closely with data scientists and backend ...


In [962]:
#duplicateRowsDF = table[table.duplicated()]

In [932]:
#table.loc[(table['title'] == 'Lead Data Scientist') & (table['company'] == 'PayPal')]

Unnamed: 0,title,company,rating,location,salary,summary
0,,Leidos,3.7,"Pittsburgh, PA 15236",,Work individually and on local and geographica...
34,Junior Data Scientist,Leidos,3.7,"Morgantown, WV 26501 (First Ward area)",,Work individually and on local and geographica...
212,Junior Data Scientist,Leidos,3.7,"Pittsburgh, PA 15236",,Work individually and on local and geographica...
759,,Leidos,3.7,"McLean, VA 22102",,Familiarity with relational data and quantitat...
1023,,Leidos,3.7,"Tucson, AZ 85711 (Rosemont West area)",,Requires PhD or current PhD program and 3 - 5 ...
1040,Image Data Scientist.,Leidos,3.7,"Tucson, AZ 85711 (Rosemont West area)",,Requires PhD or current PhD program and 3 - 5 ...
1471,Data Scientist,Leidos,3.7,"McLean, VA 22102",,Familiarity with relational data and quantitat...


In [None]:
#def get_url(job_title, location):
#   temp = 'https://www.indeed.com/jobs?q={}&l={}'
#    url = temp.format(job_title, location)
#   return url
#url = get_url('bioinformatics', 'united states')
#response = requests.get(url)
#response