In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from six.moves.urllib import parse
import re
import time
%matplotlib inline

In [3]:
# State name -> numeric id that get_totpg_linktemp substitutes into the
# `locId` query parameter of the Glassdoor search URL.
loc_dict = dict(
    California=2280,
    Alabama=105,
)

In [7]:
def get_totpg_linktemp(location, timeout=30):
    """Run the Glassdoor "data scientist" search for one state and read its header info.

    The request is sent with the module-level ``user_agent`` string, which must
    be defined before this function is called.

    Parameters
    ----------
    location : str
        A key of ``loc_dict``; the mapped id is used as the ``locId`` query
        parameter.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.  Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    totpg : int
        The first number found in the page's ``jobsCount`` element (thousands
        separators removed).  Despite the name, this is the number of search
        results, not a page count.
    linktemp : str
        The ``href`` of the link inside the "current page" pagination item; the
        caller derives per-page URLs from it.

    Raises
    ------
    KeyError
        If ``location`` is not in ``loc_dict``.
    ValueError
        If the response does not contain the job-count element or the
        pagination link (for example a block page or changed markup).
    """
    locid = loc_dict[location]
    init_url = (
        'https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn'
        '&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=S&locId={}&jobType='
    ).format(locid)
    response = requests.get(init_url, headers={'User-Agent': user_agent}, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # BeautifulSoup's find() returns None when nothing matches, so check each
    # lookup explicitly instead of failing later with an opaque AttributeError.
    count_tag = soup.find('p', 'jobsCount hidden')
    current_page = soup.find('li', {'class': 'page current'})
    page_anchor = current_page.find('a') if current_page is not None else None
    if count_tag is None or page_anchor is None:
        raise ValueError(
            "Could not find the job count / pagination link for {!r} "
            "(HTTP status {})".format(location, response.status_code)
        )

    totpg = int(count_tag.get_text().split()[0].replace(',', ''))
    print("Location: ", location, "; Total search result:", totpg)
    linktemp = page_anchor.get('href')
    print(linktemp)
    return totpg, linktemp

In [8]:
# Probe a single state to check the result count and the pagination link.
location = 'California'
# Browser-like User-Agent header; the scraping functions read this global.
user_agent = (
    'Mozilla/5.0 (X11; Linux x86_64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/36.0.1985.143 Safari/537.36'
)
totpg, linktemp = get_totpg_linktemp(location)

Location:  California ; Total search result: 4695
/Job/california-data-scientist-jobs-SRCH_IL.0,10_IS2280_KO11,25.htm?p=1


In [10]:
def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.get_text().strip() if tag is not None else None


def _parse_listing(li_result, state):
    """Turn one job-listing ``<li>`` element into a row dict for the output frame."""
    titles = li_result.find_all(name='a', attrs={'class': 'jobInfoItem jobTitle jobLink'})
    # The first matching anchor is stored as the company and the second as the
    # job title; any other number of anchors is treated as "unknown".
    if len(titles) in (1, 2):
        company = titles[0].get_text().strip()
        title = titles[1].get_text().strip() if len(titles) == 2 else None
    else:
        company = title = None

    return {
        'Company': company,
        'Title': title,
        'Location': _text_or_none(
            li_result.find('span', {'class': 'subtle loc css-nq3w9f pr-xxsm'})
        ),
        'Post_date': _text_or_none(li_result.find('div', {'data-test': 'job-age'})),
        'State': state,
    }


def scrape_each_state(state, df, results_per_page=30, timeout=30):
    """Scrape every result page for ``state`` and append the listings to ``df``.

    The request is sent with the module-level ``user_agent`` string, which must
    be defined before this function is called.

    Parameters
    ----------
    state : str
        Passed to ``get_totpg_linktemp`` and written to the ``State`` column of
        every scraped row.
    df : pandas.DataFrame
        Rows collected so far.  It is not modified; a new frame is returned.
    results_per_page : int, optional
        Page size used to turn the total result count into a number of pages.
    timeout : float, optional
        Seconds to wait for each page before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per listing found, with a fresh 0..n-1
        index.  If nothing was scraped, ``df`` itself is returned.  Fields that
        cannot be found in a listing are None.
    """
    totn, linktemp = get_totpg_linktemp(state)
    linktemp = linktemp.replace('.htm?p=1', '')
    # Deliberately generous: when totn is an exact multiple of the page size
    # this requests one extra page, which is skipped below if it has no grid.
    tot_pages = totn // results_per_page + 1

    # Collect plain dicts and build the frame once at the end.  DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for page_num in range(1, tot_pages + 1):
        link = 'https://www.glassdoor.com' + linktemp + '_IP{}.htm'.format(page_num)
        print(link)
        website_raw = requests.get(
            link, headers={'User-Agent': user_agent}, timeout=timeout
        ).text
        soup = BeautifulSoup(website_raw, 'html.parser')
        results = soup.find(name='ul', attrs={'class': 'jlGrid'})
        if results is None:
            # No listing grid on this page; move on to the next one.
            continue
        listings = results.find_all(name='li', attrs={'class': 'jl react-job-listing gdGrid'})
        rows.extend(_parse_listing(li_result, state) for li_result in listings)

    if not rows:
        return df

    scraped = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to concatenate with: keep df's column order, then any extra
        # scraped columns, without involving an empty frame in the concat.
        columns = list(df.columns) + [c for c in scraped.columns if c not in df.columns]
        return scraped.reindex(columns=columns)
    return pd.concat([df, scraped], ignore_index=True)

In [11]:
# Scrape every result page for one state into a fresh DataFrame.
state = 'Alabama'
# Browser-like User-Agent header; the scraping functions read this global.
user_agent = (
    'Mozilla/5.0 (X11; Linux x86_64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/36.0.1985.143 Safari/537.36'
)
df = scrape_each_state(
    state,
    pd.DataFrame(columns=["Title", "Company", "State", "Location", "Post_date"]),
)

Location:  Alabama ; Total search result: 141
/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22.htm?p=1
https://www.glassdoor.com/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22_IP1.htm
https://www.glassdoor.com/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22_IP2.htm
https://www.glassdoor.com/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22_IP3.htm
https://www.glassdoor.com/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22_IP4.htm
https://www.glassdoor.com/Job/alabama-data-scientist-jobs-SRCH_IL.0,7_IS105_KO8,22_IP5.htm


In [12]:
# Display the frame returned by scrape_each_state (still includes duplicate listings).
df

Unnamed: 0,Title,Company,State,Location,Post_date
0,Data Scientist,Redstone Federal Credit Union,Alabama,"Huntsville, AL",10d
1,Data Analyst - Digital Transformation,LG Electronics Alabama Inc. (LGEAI) - Service,Alabama,"Huntsville, AL",16d
2,Data Scientist,Better Hire,Alabama,"Birmingham, AL",21d
3,Data Scientist 2,Humana,Alabama,"Birmingham, AL",30d+
4,Data Scientist,Regions Financial,Alabama,"Hoover, AL",7d
...,...,...,...,...,...
140,Senior Pharmaceutical Development Scientist,PCI Pharma Services,Alabama,"Tredegar, AL",21d
141,ICE HSI Senior Operations and Data Analyst,E3 Federal Solutions,Alabama,"Huntsville, AL",7d
142,Data Engineer with Security Clearance,"Davidson Technologies, Inc.",Alabama,"Huntsville, AL",30d+
143,Data Analyst â Junior,"Staffigo Technical Services, LLC",Alabama,"Birmingham, AL",16d


In [14]:
# Listings that share title, company and location count as one posting:
# keep the first occurrence of each and renumber the index from 0.
df = (
    df
    .drop_duplicates(subset=['Title', 'Company', 'Location'])
    .reset_index(drop=True)
)
df

Unnamed: 0,Title,Company,State,Location,Post_date
0,Data Scientist,Redstone Federal Credit Union,Alabama,"Huntsville, AL",10d
1,Data Analyst - Digital Transformation,LG Electronics Alabama Inc. (LGEAI) - Service,Alabama,"Huntsville, AL",16d
2,Data Scientist,Better Hire,Alabama,"Birmingham, AL",21d
3,Data Scientist 2,Humana,Alabama,"Birmingham, AL",30d+
4,Data Scientist,Regions Financial,Alabama,"Hoover, AL",7d
...,...,...,...,...,...
139,Senior Pharmaceutical Development Scientist,PCI Pharma Services,Alabama,"Tredegar, AL",21d
140,ICE HSI Senior Operations and Data Analyst,E3 Federal Solutions,Alabama,"Huntsville, AL",7d
141,Data Engineer with Security Clearance,"Davidson Technologies, Inc.",Alabama,"Huntsville, AL",30d+
142,Data Analyst â Junior,"Staffigo Technical Services, LLC",Alabama,"Birmingham, AL",16d
