In [53]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
from fuzzywuzzy import fuzz

In [2]:
# Regex alternations -> experience level. Callers wrap each key in \b(...)\b and
# match it case-insensitively against the job title.
# Raw strings keep `\.` from being parsed as an (invalid) string escape sequence.
# NOTE(review): 'II' maps to 'Intermediate' while get_exp_lvl() falls back to
# 'Mid Level' -- confirm whether these are meant to be two distinct categories.
dct_Exp = {r'Director|Dir\.?|VP|Principal|IV': 'Executive',
           r'Senior|Sr\.?|Manager|Lead|Gestionnaire|III': 'Senior',
           'II': 'Intermediate',
           r'Entry Level|Associate|Student|Junior|Jr\.?|Stagiaire': 'Entry Level'}

# Regex alternations -> employment type label.
dct_Emp = {'Full-?time': 'Full-time', 'Part-?time': 'Part-time', 'Contract': 'Contract',
           'Freelance': 'Freelance', 'Co-?op|Intern|Stagiaire': 'Internship'}

# HTML tag names probed, in this order, by try_tag().
tags = ['li', 'div', 'p']

In [53]:
def close_popup(soup):
    """Dismiss the sign-up modal and wait out a verification challenge, if present.

    Args:
        soup: parsed snapshot of the current page; only its ``find`` method is used.

    Always sleeps 2 seconds first. If the snapshot contains ``div#LoginModal``
    holding the close button, that button is clicked through the module-level
    ``driver`` and another 2 seconds are waited. If the snapshot contains
    ``form#challenge-form``, the function then sleeps for 120 seconds
    (presumably so the challenge can be solved by hand -- confirm).
    """
    time.sleep(2)

    login_modal = soup.find('div', {'id':'LoginModal'})
    close_button = None
    if login_modal:
        close_button = login_modal.find('button', class_='e1jbctw80 ei0fd8p1 css-1n14mz9 e1q8sty40')
    if close_button:
        # The click goes through Selenium by absolute XPath; the soup is only a snapshot.
        modal_button_xpath = '/html/body/div[@id="LoginModal"]/div/div/div/div[2]/button'
        driver.find_element(By.XPATH, modal_button_xpath).click()
        time.sleep(2)

    challenge_form = soup.find('form', id='challenge-form')
    if challenge_form:
        time.sleep(120)
        
        
def try_tag(data, pattern):
    """Return the first element under ``data`` whose text matches ``pattern``.

    Each tag name in the module-level ``tags`` list is tried in order with
    ``data.find(tag, text=pattern)``; the first truthy result wins.

    Args:
        data: object exposing ``find(name, text=...)`` (a parsed HTML node), or
            None when the caller could not locate the container.
        pattern: value forwarded as the ``text`` filter (callers pass compiled regexes).

    Returns:
        The matching element, or None when nothing matches or ``data`` is None.
    """
    # A missing container means "nothing found", not an AttributeError.
    if data is None:
        return None

    for tag in tags:
        result = data.find(tag, text=pattern)
        if result:
            return result
    return None

In [4]:
def get_emp_type(data, title):
    """Classify a posting's employment type.

    Lookup order:
      1. the job title, against every pattern in ``dct_Emp``;
      2. the text of the element mentioning 'Job Type', against every pattern;
      3. only if a 'Job Type' element exists but names no known type: any
         element under ``data`` matching a pattern (first pattern in
         ``dct_Emp`` order wins).

    Step 2 is completed for all patterns before step 3 starts, so an explicit
    "Job Type: Contract" is never overridden by an unrelated "Full-time"
    mention elsewhere in the posting.

    Args:
        data: parsed job-detail node handed to ``try_tag``.
        title: job title string.

    Returns:
        One of the ``dct_Emp`` labels; 'Full-time' when nothing matches.
    """
    def match_label(text):
        # Word-bounded, case-insensitive match of each pattern against ``text``.
        for key, label in dct_Emp.items():
            if re.search(r'\b(' + key + r')\b', text, re.IGNORECASE):
                return label
        return None

    label = match_label(title)
    if label:
        return label

    emp_detail = try_tag(data, re.compile('Job Type'))
    if emp_detail:
        label = match_label(emp_detail.text)
        if label:
            return label
        # The value may sit in a sibling element; fall back to a page-wide search.
        for key, label in dct_Emp.items():
            if try_tag(data, re.compile(key)):
                return label

    return 'Full-time'


def get_exp_lvl(desc, title, emp_type):
    """Estimate the experience level of a posting.

    Args:
        desc: parsed job-description node (may be None when the description
            container was not found).
        title: job title string.
        emp_type: employment type label; 'Internship' short-circuits to 'Entry Level'.

    Returns:
        A ``dct_Exp`` label when the title matches one of its patterns.
        Otherwise the first one- or two-digit number in a description element
        mentioning years decides: < 3 -> 'Entry Level', < 6 -> 'Mid Level',
        else 'Senior'. 'Mid Level' is the default when nothing usable is found.
    """
    if emp_type == 'Internship':
        return 'Entry Level'

    for key, level in dct_Exp.items():
        if re.search(r'\b(' + key + r')\b', title, re.IGNORECASE):
            return level

    # No description to inspect: fall back to the default instead of crashing.
    if desc is None:
        return 'Mid Level'

    pattern = re.compile(r'\d+.*year.*experience|experience.*\d+.*year|.*\d+.*year(?!.*contract)', re.IGNORECASE)
    exp_lvl = try_tag(desc, pattern)
    years = re.search('[0-9][0-9]|[0-9]', exp_lvl.text) if exp_lvl else None
    if years is None:
        return 'Mid Level'

    exp_year = int(years.group())
    if exp_year < 3:
        return 'Entry Level'
    if exp_year < 6:
        return 'Mid Level'
    return 'Senior'

    
def get_company_info(company_info):
    """Extract size, revenue, type and sector from the company overview node.

    Label spans (class 'css-1taruhi e1pvx6aw1') are paired positionally with
    value spans (class 'css-i9gxme e1pvx6aw2'). Pairing uses ``zip`` so a label
    without a matching value is ignored instead of raising IndexError.

    Args:
        company_info: parsed node exposing ``find_all(name, class_=...)``.

    Returns:
        Tuple ``(company_size, company_revenue, company_type, industry)``;
        each entry is 'Unknown' when its label ('Size', 'Revenue', 'Type',
        'Sector') is absent.
    """
    labels = company_info.find_all('span', class_='css-1taruhi e1pvx6aw1')
    values = company_info.find_all('span', class_='css-i9gxme e1pvx6aw2')

    info = {'Size': 'Unknown', 'Revenue': 'Unknown', 'Type': 'Unknown', 'Sector': 'Unknown'}
    for label, value in zip(labels, values):
        # Labels other than the four tracked ones are skipped.
        if label.text in info:
            info[label.text] = value.text

    return info['Size'], info['Revenue'], info['Type'], info['Sector']


In [5]:
def scrape_data(data, dct):
    """Extract one posting from its detail panel and append it to ``dct``.

    Args:
        data: parsed job-detail node (the caller passes the ``JDCol`` container).
        dct: dict of column name -> list; one value is appended to each of the
            ten columns, and only after every field has been extracted.

    The employer text loses its last 3 characters and the salary text its last
    6 (presumably a rating and an "(est.)"-style suffix -- confirm against the
    live markup).
    """
    title = data.find('div', {'data-test':'jobTitle'}).text
    emp_type = get_emp_type(data, title)
    description = data.find('div', class_='jobDescriptionContent desc')
    exp_lvl = get_exp_lvl(description, title, emp_type)
    company = data.find('div', {'data-test':'employerName'}).text[:-3]
    location = data.find('div', {'data-test':'location'}).text
    salary = data.find('div', class_='css-1bluz6i e2u4hf13').text[:-6]

    company_node = data.find('div', id='CompanyContainer')
    if company_node:
        size, revenue, kind, sector = get_company_info(company_node)
    else:
        size, revenue, kind, sector = 'Unknown', 'Unknown', 'Unknown', 'Unknown'

    record = {
        'title': title,
        'employment_type': emp_type,
        'experience_lvl': exp_lvl,
        'company': company,
        'location': location,
        'company_size': size,
        'company_revenue': revenue,
        'company_type': kind,
        'industry': sector,
        'salary': salary,
    }
    for column, value in record.items():
        dct[column].append(value)
    

In [54]:
def _find_job_posts():
    """Return the <li> elements of the first <ul> in the first <article> of the current page."""
    return driver.find_element(By.TAG_NAME, 'article').find_element(By.TAG_NAME, 'ul')\
        .find_elements(By.TAG_NAME, 'li')


def scrape_jobs(url, next_page_button, max_retries=3):
    """Walk the paginated job list at ``url`` and collect postings into a DataFrame.

    Uses the module-level Selenium ``driver``. For every list item on a page the
    item is clicked, the detail panel (``div#JDCol``) is parsed, and postings
    that show a salary tab are passed to ``scrape_data``.

    Args:
        url: first results page to load.
        next_page_button: XPath of the "next page" button.
        max_retries: how many times a single posting is refreshed and retried
            when its panel is missing or shows "Error Loading". After that the
            posting is skipped, so a permanently broken posting can no longer
            stall the scrape forever.

    Returns:
        pandas.DataFrame with one row per scraped posting and the ten columns
        filled by ``scrape_data``.
    """
    driver.get(url)
    dct = {'title':[], 'employment_type':[], 'experience_lvl':[], 'company':[], 'location':[],
           'company_size':[], 'company_revenue':[], 'company_type':[], 'industry':[], 'salary':[]}

    while True:
        post = _find_job_posts()

        i = 0
        retries = 0
        while i < len(post):
            close_popup(BeautifulSoup(driver.page_source, 'html.parser'))
            post[i].click()
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            data = soup.find('div', {'id':'JDCol'})
            # A missing panel is treated like an explicit load error.
            load_failed = data is None or bool(data.find('h3', text=re.compile('Error Loading')))

            if load_failed and retries < max_retries:
                retries += 1
                driver.refresh()
                # The refresh invalidates the old element handles; look them up again.
                post = _find_job_posts()
                time.sleep(2)
                continue

            if not load_failed and data.find('div', class_='salaryTab tabSection p-std'):
                scrape_data(data, dct)
            i += 1
            retries = 0

        current_page_url = driver.current_url
        close_popup(BeautifulSoup(driver.page_source, 'html.parser'))
        driver.find_element(By.XPATH, next_page_button).click()
        time.sleep(2)
        # An unchanged URL after clicking "next" means the last page was reached.
        if current_page_url == driver.current_url:
            break

    return pd.DataFrame(dct, columns=list(dct))

In [38]:
# Close a browser window left over from an earlier run. On a fresh kernel no
# `driver` exists yet (it is created further down), so skip instead of raising NameError.
if 'driver' in globals():
    driver.close()

In [None]:
# Scrape the Glassdoor data-science listings and keep the de-duplicated result in `df`.
url = 'https://www.glassdoor.ca/Job/data-science-jobs-SRCH_KO0,12.htm'
# Alternative listing on the .com site:
#url = 'https://www.glassdoor.com/Job/data-science-jobs-SRCH_KO0,12.htm'
next_page_button = '/html/body/div[2]/div/div/div/div/div[@id="JobResults"]/section/article/div[2]/div/div[1]/button[7]'
driver = webdriver.Chrome()
try:
    driver.get(url)
    df = scrape_jobs(url, next_page_button)
finally:
    # Always shut the browser session down, even when scraping raises part-way through.
    driver.quit()
df = df.drop_duplicates().reset_index(drop=True)

In [105]:
# Lookup tables used by generalize_title(). Every key is a regex alternation
# matched case-insensitively against the job title; dict order matters because
# the first matching key wins for `field`, `pfsn1` and `pfsn2`.
# Keys containing backslashes are raw strings so `\.` / `\W` are not parsed as
# (invalid) string escape sequences.

# Primary field of the role (first match wins).
field = {'Big Data':'Big Data',
         'Computer Vision':'Computer Vision',
         'BI|(Business|Marketing) Intelligence':'BI',
         'ML|Machine Learning|Apprentissage Machine|DL|Deep Learning':'ML',
         r'A\.?I|Artificial Intelligence|Intelligence Artificielle|NLP|Natural Language Processing':'AI',
         '(Business|Marketing|Portfolio|People) Analy(st|tics)':'Business Analytics',
         'Data Architect(|ure)':'Data Architecture',
         'Mine Data|Data (Min(er|ing)|Extraction)':'Data Mine',
         'Data Visualization':'Data Visualization',
         'Data (Manage(r|ment)|Governance|Planning)':'Data Management',
         'Data Engineer(|ing)|Ingénierie des données':'Data Engineering',
         r'(?<!Big )Data\W+Analy(st|tics)|Analy(ste|tique) de données':'Data Analytics',
         '(?<!Big )Data Scien(ce|tist)|Scien(tifique|ce) des données':'Data Science',
         'Statistic(s|al|ian)|Statistiques':'Statistical',
         'Mathematic(s|al|ian)':'Mathematical'}


# Secondary qualifiers; every matching key contributes its label.
field2= {r'(?<!Big )Data(?!\W+(Architect(|ure)|Mine(|r)|Extraction|Management|Governance|Planning|Engineer(|ing)|'
         r'Analy(st|tics)|Scien(ce|tist)))':'Data',
         'Software|logiciel(|s)':'Software',
         '(?<!Business )(?<!(?<!Big )Data )(?<!& )Analy(st|tics)|Analysis|Analy(ste|tique)(?! de Données)':'Analytics',
         '(?<!Data )Architect(|ure)':'Architecture',
         r'Business|Marketing(?!\W+(Intelligence|Analy(st|tics)))':'Business',
         'Develop(er|ment)|développeur':'Development',
         '(?<!Data )Eng(|ineer|ing)':'Engineering',
         'Modeling':'Modeling',
         'Programm(er|ing)':'Programming',
         'Research(|er)':'Research',
         '(?<!Data )Scien(ce|tist|tifique)(?! des)':'Science',
         'Consult(ing|ant)|Advisory':'Consulting'}


# Profession named explicitly in the title (first match wins).
pfsn1 = {'Architect(|e)':'Architect',
         'Designer':'Designer',
         'Miner':'Miner',
         'Statistician|Mathematician':'Statistician/Mathematician',
         'Advisor|Consultant|Conseill(er|ère)':'Consultant',
         'Specialist|Spécialiste|Expert(|e)|Representative':'Specialist',
         'Eng(|ineer)|Technologist':'Engineer',
         'Programmer|Programmeu(r|se)':'Programmer',
         'Developer|Développeur':'Developer',
         'Analyst(|e)':'Analyst',
         'Scientist|Scientifique':'Scientist',
         'Researcher|Chercheu(r|se)|Investigator':'Researcher',
         'Co-?op|Intern|Student|Fellow(|ship)|Stagiaire':'Student',
         'Educator|Instructor|Lecturer|Train(er|ing)|Sessional Faculty|Chargé de cours|'
         'Professor|Professeur':'Instructor',
         '(?<=Data )Manager':'Manager'}


# Profession inferred from an activity word; consulted only when pfsn1 finds nothing.
pfsn2 = {'Architecture':'Architect',
         'Min(e|ing)|Extraction':'Miner',
         'Mathematic(al|s)|Statistic(al|s)|Statistique(|s)':'Statistician/Mathematician',
         'Engineering':'Engineer',
         'Programming':'Programmer',
         'Analytics|Analysis|Analytique':'Analyst',
         'Research':'Researcher',
         'Science':'Scientist',
         'Consulting|Advisory':'Consultant',
         'Manager|Gestionnaire|(Project |)(Lead|Management)|Governance|Planning|'
         'Coordinator|Coordonnat(rice|eur)|Facilitator|Planner':'Manager'}


In [62]:
def convert_size(size):
    """Bucket a company-size label into 'Small', 'Medium' or 'Large'.

    'Unknown' is passed through unchanged; '1 to 50 Employees' is Small;
    '51 to 200 Employees' and '201 to 500 Employees' are Medium; every other
    value is Large.
    """
    if size == 'Unknown':
        return size
    if size == '1 to 50 Employees':
        return 'Small'
    if size in ('51 to 200 Employees', '201 to 500 Employees'):
        return 'Medium'
    return 'Large'
    
    
def convert_revenue(rev):
    """Normalise a revenue label.

    Any label containing 'unknown' (case-insensitive) collapses to 'Unknown'.
    Otherwise the standalone word 'to' is replaced by '-', turning
    '$1 to $5 million' into '$1 - $5 million'. The pattern is word-bounded so
    the letters 'to' inside other words are left alone.
    """
    if re.search('Unknown', rev, re.IGNORECASE):
        return 'Unknown'
    return re.sub(r'\bto\b', '-', rev)
    
    
def convert_salary(sal):
    """Convert a scraped salary string to a yearly integer amount.

    The pay period is read from ``sal[-3:-1]``, i.e. the two characters just
    before the final character:
      * 'hr' -- the digit groups are joined with '.' and read as an hourly
        rate, multiplied by 40 * 52 (presumably hours/week and weeks/year) and
        rounded;
      * 'mo' -- all digits are concatenated into a monthly amount, times 12;
      * anything else -- all digits are concatenated and returned as-is.

    Raises:
        ValueError: if ``sal`` contains no digits.
    """
    period = sal[-3:-1]
    digit_groups = re.findall(r'\d+', sal)

    if period == 'hr':
        # NOTE(review): an hourly rate with a thousands separator ("1,025.50")
        # would become "1.025.50" here and fail to parse.
        return round(float('.'.join(digit_groups))*40*52)

    amount = int(''.join(digit_groups))
    if period == 'mo':
        return amount*12
    return amount
        
    
def generalize_title(title):
    """Reduce a free-form job title to a normalised (job_title, field, profession).

    Steps:
      1. ``field``: first pattern matching the title gives the primary field;
         'Applied ' is prefixed when the title says Applied/appliquée.
      2. ``field2``: every matching pattern contributes a qualifier word.
      3. ``pfsn1`` (then ``pfsn2`` as fallback): first match gives the profession.
      4. Field/qualifier words too similar to the profession
         (``fuzz.partial_ratio`` > 70) are dropped to avoid titles such as
         'Engineering Engineer'.

    Args:
        title: raw job title.

    Returns:
        Tuple ``(job_title, field, profession)``. ``field`` and ``profession``
        are '' when nothing matched. ``job_title`` carries no trailing
        whitespace, even when no profession was found.
    """
    def first_label(table, prefix=r'\b'):
        # Label of the first pattern in ``table`` that matches ``title``, else ''.
        for key, label in table.items():
            if re.search(prefix + '(' + key + r')\b', title, re.IGNORECASE):
                return label
        return ''

    ttl1 = first_label(field)
    if ttl1:
        ttl1 += ' '
    if re.search(r'\b(Applied|appliquée)\b', title, re.IGNORECASE):
        ttl1 = 'Applied ' + ttl1

    ttl2 = ''.join(label + ' ' for key, label in field2.items()
                   if re.search(r'\b(' + key + r')\b', title, re.IGNORECASE))

    # pfsn1 patterns are matched without a leading word boundary, as before.
    ttl3 = first_label(pfsn1, prefix='') or first_label(pfsn2)

    words = (ttl1 + ttl2).split()
    kept = [word for word in words if fuzz.partial_ratio(ttl3, word) <= 70]

    # strip() removes the separator left dangling when ttl3 is empty.
    job_title = ' '.join(kept + [ttl3]).strip()
    return job_title, ttl1[:-1], ttl3
        

In [131]:
# Column / dtype summary of `df`.
# NOTE(review): the output recorded below already lists the derived columns
# (full_title, job_title, field, profession_title) and an int64 salary, so it
# reflects `df` after the clean-up cell that follows -- confirm cell order.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 0 to 491
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   full_title        492 non-null    object
 1   job_title         492 non-null    object
 2   field             492 non-null    object
 3   profession_title  492 non-null    object
 4   employment_type   492 non-null    object
 5   experience_lvl    492 non-null    object
 6   company           492 non-null    object
 7   location          492 non-null    object
 8   company_size      492 non-null    object
 9   company_revenue   492 non-null    object
 10  company_type      492 non-null    object
 11  industry          492 non-null    object
 12  salary            492 non-null    int64 
dtypes: int64(1), object(12)
memory usage: 53.8+ KB


In [132]:
# Clean the scraped frame: bucket company size, normalise revenue, convert the
# salary to a yearly integer, and derive job_title / field / profession_title
# from the raw title.
# NOTE: not idempotent -- convert_salary() slices the raw salary strings, so
# re-running this cell on an already converted frame fails.
df['company_size'] = df.company_size.apply(convert_size)
df['company_revenue'] = df.company_revenue.apply(convert_revenue)
df['salary'] = df.salary.apply(convert_salary)
df['location'] = df.location.fillna('Remote')
df = df.rename(columns={'title':'full_title'})

# generalize_title() returns (job_title, field, profession) per row.
gen_titles = df.full_title.apply(generalize_title)
df['job_title'] = gen_titles.apply(lambda x: x[0])
df['field'] = gen_titles.apply(lambda x: x[1] if x[1] else 'None')
# 'Statistical' -> 'Statistics', 'Mathematical' -> 'Mathematics'. The flag must be
# passed by keyword: the fourth positional argument of re.sub is `count`.
df['field'] = df.field.apply(lambda x: re.sub(r'cal\b', 'cs', x, flags=re.IGNORECASE))
df.loc[df.field.str.contains(pat=r'Applied$', regex=True), 'field'] = 'Applied Science'
df['profession_title'] = gen_titles.apply(lambda x: x[2])

df = df.loc[:,['full_title','job_title', 'field', 'profession_title', 'employment_type', 'experience_lvl',
               'company', 'location', 'company_size', 'company_revenue', 'company_type', 'industry', 'salary']]
df.info()

In [126]:
# Write the cleaned table to CSV (index included).
# newline='' lets pandas control line endings (avoids doubled line breaks on
# Windows); the explicit encoding keeps accented titles intact regardless of
# the platform default.
# NOTE(review): absolute, machine-specific path -- consider a configurable output directory.
with open('/Users/threnylaird/Documents/personal project/glassdoor/glassdoor_salary.csv', 'w',
          newline='', encoding='utf-8') as csv_file:
    df.to_csv(path_or_buf=csv_file, index=True)