In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Indeed

#### Get the URLs of the Indeed search-result pages

In [25]:
# 20 job postings per page
def get_urls(url, nums=1000):
    """Build the list of search-result page URLs for an Indeed query.

    The first search page is downloaded and the first link inside its
    ``pagination`` block is used as a template for all further pages.

    Parameters
    ----------
    url : str
        URL of the first search-result page.
    nums : int, optional
        Exclusive upper bound of the result offsets; offsets
        ``10, 20, ..., < nums`` are appended to the template. Defaults to
        1000, the value that used to be hard-coded.

    Returns
    -------
    tuple[list[str], int]
        The page URLs (template first, then one URL per offset) and ``nums``.

    Raises
    ------
    ValueError
        If the page contains no pagination link to derive the template from.
    requests.RequestException
        If the search page cannot be downloaded within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # BeautifulSoup's find() returns None when nothing matches, so check each
    # step instead of failing later with an opaque AttributeError.
    pagination = soup.find('div', {'class': 'pagination'})
    first_link = pagination.find('a') if pagination is not None else None
    href = first_link.get('href') if first_link is not None else None
    if not href:
        raise ValueError(
            "No pagination link found on {!r}; cannot build page URLs".format(url)
        )

    # Drop the last two characters of the link to obtain the part shared by
    # all search pages.
    # NOTE(review): this assumes the first pagination link ends in a
    # two-character offset such as "10" — confirm against the live markup.
    base_url = "https://www.indeed.ca" + href[:-2]

    urls = [base_url]
    urls.extend(base_url + str(offset) for offset in range(10, nums, 10))

    return urls, nums

In [26]:
# First search-result page for the query "data manager" with location "Canada".
url = 'https://ca.indeed.com/jobs?q=data+manager&l=Canada'
# `urls` is the list of search-result page URLs built from that page;
# `nums` is the offset limit that get_urls used to build the list.
urls, nums = get_urls(url)

#### Get the link for each job posting and its HTML text

In [10]:
def get_job_links_info(urls):
    """Download the raw HTML of every job posting linked from the search pages.

    Parameters
    ----------
    urls : iterable of str
        URLs of the search-result pages to crawl.

    Returns
    -------
    dict[str, str]
        Maps each job-posting link to the HTML text of that posting.

    Notes
    -----
    Pages or postings that cannot be downloaded are skipped with a warning,
    so a single network error does not discard everything already scraped.
    """
    import warnings

    dic = {}

    # One session reuses the underlying connections across the many requests.
    with requests.Session() as session:
        # loop over all page-urls
        for page_url in urls:

            # get the HTML of the search results page
            try:
                page = session.get(page_url, timeout=30)
            except requests.RequestException as exc:
                warnings.warn("Skipping search page {}: {}".format(page_url, exc))
                continue
            soup = BeautifulSoup(page.text, 'lxml')

            # every <div class="title"> is expected to wrap the link to one job posting
            for job in soup.find_all('div', {'class': 'title'}):

                # find() returns None when the div has no <a>; skip such divs
                # instead of crashing on None['href'].
                anchor = job.find('a')
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    continue

                job_link = "https://www.indeed.ca" + href

                # The same link may show up on several search pages; fetch it once.
                if job_link in dic:
                    continue

                try:
                    job_page = session.get(job_link, timeout=30)
                except requests.RequestException as exc:
                    warnings.warn("Skipping job posting {}: {}".format(job_link, exc))
                    continue

                dic[job_link] = job_page.text
    return dic

In [27]:
# Scrape all postings: one HTTP request per search page plus one per job link.
# `info_dict` maps each job link to the HTML text of that posting.
info_dict = get_job_links_info(urls)

#### Get job_title, company_name, skills from info_dict

In [28]:
def get_skills_from_job(jobs_dict, skills_dict):
    """Extract title, company and skill flags from scraped job postings.

    Parameters
    ----------
    jobs_dict : dict[str, str]
        Maps a job link to the HTML text of that posting.
    skills_dict : dict[str, list[str]]
        Maps a skill category to the keywords that indicate it.

    Returns
    -------
    dict[str, dict]
        One entry per job link holding ``'job_title'``, ``'company_name'``
        (``'Not found'`` when the element is missing) and, for every skill
        category, ``1`` if any of its keywords occurs in the posting text,
        else ``0``. Keyword matching is a case-sensitive substring search.
    """

    def _text_or_not_found(tag):
        # BeautifulSoup's find() returns None (it does not raise) when no
        # element matches, so the fallback has to be an explicit None check.
        return tag.text if tag is not None else 'Not found'

    results_dict = {}

    # loop over all key(link)-value(HTML code) pairs in scraping results
    for link, job_html_text in jobs_dict.items():

        soup_job = BeautifulSoup(job_html_text, 'lxml')

        job_info = {
            # extract job title
            'job_title': _text_or_not_found(soup_job.find(
                'h3',
                {'class': "icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"})),
            # extract company name
            'company_name': _text_or_not_found(soup_job.find(
                'div',
                {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'})),
        }

        # search for the skills in the full text of the page
        job_text = soup_job.text
        for skill_category, skills in skills_dict.items():
            # any() stops at the first keyword that is present
            job_info[skill_category] = int(any(skill in job_text for skill in skills))

        results_dict[link] = job_info

    return results_dict

In [30]:
# Dictionary with skills used to parse job descriptions. Each key is a skill
# category and becomes a column in the resulting DataFrame; each value lists
# the keywords whose presence in a posting flags that category.
# Keywords are matched as case-sensitive substrings, so very short keywords
# (e.g. 'R ', 'ML', 'DM') can also match inside longer words or acronyms.
skills_keywords_dict = {
        # tech skills
        'Excel': ['Excel'],
        'Python': ['Python'],
        'R': ['R ', ' R ', 'R,', 'R/'], # 'R' surrounded by spaces and signs
        'Java': ['Java', 'JVM'],
        'Scala': ['Scala'],
        'C/C++': ['C/C++', 'C++', ' C '],   # 'C' surrounded by spaces and signs
        'MATLAB': ['MATLAB'],
        'SAS': ['SAS'],
        'SQL/databases': ['SQL', 'databases'],
        'Oracle':['Oracle'],
        'SPSS': ['SPSS'],
        'Stata': ['Stata'],
        'Machine Learning': ['Machine Learning', 'ML'],
        'Data Mining/Analytics': ['Data Mining', 'DM', 'Analytics'],
        'NLP': ['Natural Language Processing', 'NLP'],
        'Visualisation': ['Visualisation', 'Visualization'],
        'Big Data': ['Big Data', 'Spark', 'kafka', 'Hive',
                     'beam', 'Hadoop', 'MapReduce', 'Hbase',
                     'Cloudera', 'Hortonworks'],   # was misspelled 'Coudera'
        'AWS Cloud': ['AWS'],
        'Probability': ['probability', 'probability theory'],   # was misspelled 'probablity'
        'Support Vector Machines': ['SVM', 'Support vector machines'],
        'Neural Networks': ['Neural Networks', 'ANN', 'MLP', 'CNN', 'Tensorflow', 'Keras', 'Theano'],
        'GCP': ['GCP'],
        # The category key is kept as-is so existing column names do not change;
        # 'JSON' is presumably the keyword that was meant — TODO confirm.
        'Jason': ['JSON', 'Jason'],
        'xml': ['xml'],
        'Azure': ['Azure'],
        'Google Cloud': ['Google Cloud'],
        'Mathematics': ['Mathematics'],
        'IBM': ['IBM'],
        'Algebra': ['Algebra'],
        'Statistics' :  ['Statistics'],
        'Operations research': ['Operations research'],
        'DevOps': ['DevOps', 'TDD', 'test-driven'],
        'Git':['GitHub', 'Git', 'version control'],
        # soft skills
        'presentation' : ['communication', 'presentation'],
        'management' : ['management', 'Data management'],
        'agile' : ['agile'],
        'SDLC' : ['SDLC', 'sdlc', 'software development', 'lifecycle'],
        'decision making' : ['decision making', 'decision analysis'],
        'problem solving': ['problem solving'],
        'Team building': ['Team leadership', 'team building'],
        'project_management': ['project management'],
        'leadership': ['leadership'],
        'consulting': ['consulting', 'consultant']
    }

In [33]:
# Flag, for every scraped posting, which skill categories are mentioned.
results_dict = get_skills_from_job(info_dict, skills_keywords_dict)
# The nested dict is keyed by job link, so transpose to get one row per posting;
# reset_index() then moves the link into a regular column named 'index'.
df_mng = pd.DataFrame(results_dict).T.reset_index()

In [34]:
# Preview the resulting table: one row per job posting, one 0/1 column per skill category.
df_mng

Unnamed: 0,index,job_title,company_name,Excel,Python,R,Java,Scala,C/C++,MATLAB,...,presentation,management,agile,SDLC,decision making,problem solving,Team building,project_management,leadership,consulting
0,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Lead Data Manager (Canada),PRA Health Sciences,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Facility Operation and Maintenance Manager (NO...,Vale Canada Ltd,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,YouTube Channel Manager,Viral Nation Inc.,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,"Retention Manager, Digital Product Management",OLG,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
4,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Fleet Manager,Pineapple Express Delivery,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1710,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,"Data Industrialization, Processing & Methods A...",Gemalto Canada Inc.,1,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1711,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Telesales Manager,Merchant 1 Payments,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1712,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Channel Account Manager- Remote,Johnson Controls,1,0,0,0,0,0,0,...,1,1,0,0,0,1,0,0,1,0
1713,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Facility Operation and Maintenance Manager (NO...,Vale Canada Ltd,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [35]:
# Persist the table to the working directory. With to_csv's default index=True
# the row index is written too, as an unnamed first column.
df_mng.to_csv('management_jobs.csv')