In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Indeed

#### Create Skills Dictionary

In [2]:
# Skill categories used to parse job descriptions. Each key becomes a 0/1 column
# in the resulting DataFrame; each value lists the keywords that flag the category.
# Keywords are matched as case-sensitive substrings of the posting text, which is
# why some entries carry surrounding spaces/punctuation or several spellings.
skills_keywords_dict = {
    # tech skills
    'Excel': ['Excel'],
    'Python': ['Python'],
    'R': ['R ', ' R ', 'R,', 'R/'],  # 'R' surrounded by spaces and signs
    'Java': ['Java', 'JVM'],
    'Scala': ['Scala'],
    'C/C++': ['C/C++', 'C++', ' C '],  # 'C' surrounded by spaces and signs
    'MATLAB': ['MATLAB'],
    'SAS': ['SAS'],
    'SQL/databases': ['SQL', 'databases'],
    'Oracle': ['Oracle'],
    'SPSS': ['SPSS'],
    'Stata': ['Stata'],
    'Machine Learning': ['Machine Learning', 'ML'],
    'Data Mining/Analytics': ['Data Mining', 'DM', 'Analytics'],
    'NLP': ['Natural Language Processing', 'NLP'],
    'Visualisation': ['Visualisation', 'Visualization'],
    'Big Data': ['Big Data', 'Spark', 'kafka', 'Hive',
                 'beam', 'Hadoop', 'MapReduce', 'Hbase',
                 'Cloudera', 'Hortonworks'],  # was misspelled 'Coudera', which could never match
    'AWS Cloud': ['AWS'],
    # was misspelled 'probablity'; plain 'probability' also covers 'probability theory'
    'Probability': ['probability', 'probability theory'],
    'Support Vector Machines': ['SVM', 'Support vector machines'],
    'Neural Networks': ['Neural Networks', 'ANN', 'MLP', 'CNN', 'Tensorflow', 'Keras', 'Theano'],
    'GCP': ['GCP'],
    # NOTE(review): category presumably means JSON. The key is kept so the output
    # column name does not change; the literal 'JSON' keyword is added so it can match.
    'Jason': ['JSON', 'Jason'],
    'xml': ['xml'],
    'Azure': ['Azure'],
    'Google Cloud': ['Google Cloud'],
    'Mathematics': ['Mathematics'],
    'IBM': ['IBM'],
    'Algebra': ['Algebra'],
    'Statistics': ['Statistics'],
    'Operations research': ['Operations research'],
    'DevOps': ['DevOps', 'TDD', 'test-driven'],
    'Git': ['GitHub', 'Git', 'version control'],
    # soft skills
    'presentation': ['communication', 'presentation'],
    'management': ['management', 'Data management'],
    'agile': ['agile'],
    'SDLC': ['SDLC', 'sdlc', 'software development', 'lifecycle'],
    'decision making': ['decision making', 'decision analysis'],
    'problem solving': ['problem solving'],
    'Team building': ['Team leadership', 'team building'],
    'project_management': ['project management'],
    'leadership': ['leadership'],
    'consulting': ['consulting', 'consultant'],
}

#### Get urls from Indeed pages

In [4]:
# 20 job postings per page
def get_urls(url):
    """Build the list of Indeed search-result page URLs for one query.

    Args:
        url: URL of the first search-results page.

    Returns:
        A ``(urls, nums)`` tuple. ``urls`` is a list of result-page URLs: the
        pagination base URL followed by one URL per offset 20, 40, ... below
        ``nums``. ``nums`` is the total number of results reported by the page.

    Raises:
        requests.HTTPError: if the search page answers with an error status.
        ValueError: if the result counter or the pagination link cannot be
            found or parsed.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # extract the number of search results
    count_tag = soup.find('div', {'id': 'searchCount'})
    if count_tag is None:
        raise ValueError('search result counter (div#searchCount) not found on ' + url)
    tokens = count_tag.text.split()
    try:
        # The 4th token is taken as the total; thousands separators ("1,234")
        # are removed because int() rejects them.
        nums = int(tokens[3].replace(',', ''))
    except (IndexError, ValueError) as err:
        raise ValueError('unexpected search result counter text: %r' % count_tag.text) from err

    # common part shared by all search pages, taken from the first pagination link
    pagination = soup.find('div', {'class': 'pagination'})
    first_link = pagination.find('a') if pagination is not None else None
    href = first_link.get('href') if first_link is not None else None
    if not href:
        raise ValueError('pagination link not found on ' + url)
    # NOTE(review): dropping the last two characters assumes the first pagination
    # link ends in a two-digit offset — confirm against the live markup.
    base_url = "https://www.indeed.ca" + href[:-2]

    urls = [base_url]
    urls.extend(base_url + str(offset) for offset in range(20, nums, 20))

    return urls, nums

In [5]:
# First results page of the Indeed search this section scrapes.
# NOTE: `url`, `urls` and `nums` are assigned again in the LinkedIn section further
# down, so re-run this cell before re-running the Indeed scraping cells.
url = 'https://ca.indeed.com/jobs?q=Data+scientist&l=Canada'
# urls: list of result-page URLs; nums: total number of results reported by the page.
urls, nums = get_urls(url)

#### Get link for each job and its html text

In [6]:
def get_job_links_info(urls):
    """Download the raw HTML of every job posting linked from Indeed result pages.

    Args:
        urls: iterable of search-result page URLs.

    Returns:
        dict mapping each job-posting URL to the HTML text of that posting.
        A link that shows up more than once is downloaded only the first time.
    """
    dic = {}

    # One session for the whole crawl so connections are reused across the
    # (one per posting) requests; the timeout prevents an indefinite hang.
    with requests.Session() as session:
        # loop over all page-urls
        for page_url in urls:

            # get the HTML of the search results page
            page = session.get(page_url, timeout=30)
            soup = BeautifulSoup(page.text, 'lxml')

            # each <div class="title"> wraps the link to one job posting
            for job in soup.find_all('div', {'class': 'title'}):
                anchor = job.find('a')
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    # title block without a usable link: nothing to download
                    continue

                job_link = "https://www.indeed.ca" + href
                if job_link in dic:
                    # already fetched from an earlier page; skip the extra request
                    continue

                dic[job_link] = session.get(job_link, timeout=30).text

    return dic

In [7]:
# Scrape Indeed: maps each job link to the HTML text of its posting.
# Slow: issues one HTTP request per result page plus one per job posting.
info_dict = get_job_links_info(urls)

#### Get job_title, company_name, skills from info_dict

In [8]:
def get_skills_from_job(jobs_dict, skills_dict):
    """Extract job title, company name and skill flags from Indeed postings.

    Args:
        jobs_dict: dict mapping job link -> HTML text of the posting.
        skills_dict: dict mapping skill category -> list of keyword strings.

    Returns:
        dict mapping job link -> dict with the keys 'job_title' and
        'company_name' (the string 'Not found' when the tag is missing) and one
        entry per skill category set to 1 if any of its keywords occurs in the
        posting text, else 0.

    Keywords are matched as case-sensitive substrings, so a keyword such as
    'Java' also matches text containing 'JavaScript'.
    """
    results_dict = {}

    # loop over all key(link)-value(HTML code) pairs in scraping results
    for link, job_html_text in jobs_dict.items():
        soup_job = BeautifulSoup(job_html_text, 'lxml')

        # find() returns None when nothing matches, so the miss has to be tested
        # explicitly: reading .text on None raises AttributeError, which an
        # `except IndexError` clause would never catch.
        title_tag = soup_job.find(
            'h3', {'class': "icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"})
        company_tag = soup_job.find('div', {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'})

        record = {
            'job_title': title_tag.text if title_tag is not None else 'Not found',
            'company_name': company_tag.text if company_tag is not None else 'Not found',
        }

        # search for the skills in the visible text of the whole page
        job_text = soup_job.text
        for skill_category, skills in skills_dict.items():
            record[skill_category] = int(any(skill in job_text for skill in skills))

        results_dict[link] = record

    return results_dict

In [9]:
# Parse every scraped Indeed posting into {link: {field: value}} ...
results_dict = get_skills_from_job(jobs_dict=info_dict, skills_dict=skills_keywords_dict)
# ... then turn it into one row per posting: the links start out as columns, so
# transpose, and move the link from the index into a regular 'index' column.
df_indeed = (
    pd.DataFrame(results_dict)
    .transpose()
    .reset_index()
)

In [47]:
# Persist the Indeed results. With to_csv's default index=True the integer row
# index is written as well, as an extra unnamed first column.
df_indeed.to_csv('tech_indeed.csv')

# LinkedIn

In [17]:
# Result pages are enumerated in steps of 25 (see the range() call below).
def get_urls_linkedin(url):
    """Build the list of LinkedIn search-result page URLs.

    Args:
        url: URL of the LinkedIn job-search page to read the result count from.

    Returns:
        A ``(urls, nums)`` tuple. ``urls`` holds one page URL per value
        0, 25, 50, ... below ``nums``; ``nums`` is the number of results parsed
        from the page heading.

    Raises:
        requests.HTTPError: if the search page answers with an error status.
        ValueError: if the result count cannot be found or parsed.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # extract the number of search results from the <span> inside the first <h1>
    heading = soup.find('h1')
    count_tag = heading.find('span') if heading is not None else None
    if count_tag is None:
        raise ValueError('result count (<h1><span>) not found on ' + url)
    # Accept grouped / open-ended counts such as "1,000+", which int() rejects as-is.
    count_text = count_tag.text.strip().replace(',', '').rstrip('+')
    try:
        nums = int(count_text)
    except ValueError as err:
        raise ValueError('unexpected result count text: %r' % count_tag.text) from err

    # common part shared by all search pages
    # NOTE(review): hard-coded to the data-scientist search regardless of `url`, and
    # pageNum receives 0, 25, 50, ... — confirm that is what the site expects.
    base_url = 'https://ca.linkedin.com/jobs/data-scientist-jobs?position=1&pageNum='

    urls = [base_url + str(offset) for offset in range(0, nums, 25)]

    return urls, nums

In [30]:
# LinkedIn search page used to read the result count.
# NOTE: this reassigns `url`, `urls` and `nums`, replacing the values the Indeed
# section stored under the same names.
url = "https://ca.linkedin.com/jobs/data-scientist-jobs"
# urls: list of result-page URLs; nums: number of results parsed from the page.
urls,nums = get_urls_linkedin(url)

In [31]:
def get_job_links_info_linkedin(urls):
    """Download the raw HTML of every job posting linked from LinkedIn result pages.

    Args:
        urls: iterable of search-result page URLs.

    Returns:
        dict mapping each job-posting URL to the HTML text of that posting.
        A link that shows up more than once is downloaded only the first time.
    """
    dic = {}

    # One session for the whole crawl so connections are reused across the
    # (one per posting) requests; the timeout prevents an indefinite hang.
    with requests.Session() as session:
        # loop over all page-urls
        for page_url in urls:

            # get the HTML of the search results page
            page = session.get(page_url, timeout=30)
            soup = BeautifulSoup(page.text, 'lxml')

            # each result card exposes its posting through an
            # <a class="result-card__full-card-link"> whose href is used as-is
            for job in soup.find_all('a', {'class': "result-card__full-card-link"}):
                job_link = job.get('href')
                if not job_link:
                    # anchor without an href: nothing to download
                    continue
                if job_link in dic:
                    # already fetched from an earlier page; skip the extra request
                    continue

                dic[job_link] = session.get(job_link, timeout=30).text

    return dic

In [32]:
# Scrape LinkedIn: maps each job link to the HTML text of its posting.
# Slow: issues one HTTP request per result page plus one per job posting.
linkedin_dict = get_job_links_info_linkedin(urls)

In [38]:
def get_skills_from_job_linkedin(jobs_dict, skills_dict):
    """Extract job title, company name and skill flags from LinkedIn postings.

    Args:
        jobs_dict: dict mapping job link -> HTML text of the posting.
        skills_dict: dict mapping skill category -> list of keyword strings.

    Returns:
        dict mapping job link -> dict with the keys 'job_title' and
        'company_name' (the string 'Not found' when the tag is missing) and one
        entry per skill category set to 1 if any of its keywords occurs in the
        posting text, else 0.

    Keywords are matched as case-sensitive substrings, so a keyword such as
    'Java' also matches text containing 'JavaScript'.
    """
    results_dict = {}

    # loop over all key(link)-value(HTML code) pairs in scraping results
    for link, job_html_text in jobs_dict.items():
        soup_job = BeautifulSoup(job_html_text, 'lxml')

        # find() returns None when nothing matches. Testing for that explicitly
        # replaces a bare `except:`, which would also hide unrelated failures
        # (including KeyboardInterrupt) behind 'Not found'.
        title_tag = soup_job.find('h1', {'class': 'topcard__title'})
        company_tag = soup_job.find(
            'a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})

        record = {
            'job_title': title_tag.text if title_tag is not None else 'Not found',
            'company_name': company_tag.text if company_tag is not None else 'Not found',
        }

        # search for the skills in the visible text of the whole page
        job_text = soup_job.text
        for skill_category, skills in skills_dict.items():
            record[skill_category] = int(any(skill in job_text for skill in skills))

        results_dict[link] = record

    return results_dict

In [39]:
# Parse every scraped LinkedIn posting into {link: {field: value}} ...
results_dict = get_skills_from_job_linkedin(jobs_dict=linkedin_dict, skills_dict=skills_keywords_dict)
# ... then turn it into one row per posting: the links start out as columns, so
# transpose, and move the link from the index into a regular 'index' column.
df_linkedin = (
    pd.DataFrame(results_dict)
    .transpose()
    .reset_index()
)

In [48]:
# Persist the LinkedIn results. With to_csv's default index=True the integer row
# index is written as well, as an extra unnamed first column.
df_linkedin.to_csv('tech_linkedin.csv')

In [41]:
# Inspect the Indeed results (the rendered output below is truncated with "...").
df_indeed

Unnamed: 0,index,job_title,company_name,Excel,Python,R,Java,Scala,C/C++,MATLAB,...,presentation,management,agile,SDLC,decision making,problem solving,Team building,project_management,leadership,consulting
0,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Senior Data Scientist,BMO Financial Group,1,1,1,0,0,0,0,...,1,1,0,0,0,1,0,0,0,0
1,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Data Engineer,Prodigy Game,0,1,0,1,1,0,0,...,1,1,0,0,0,0,0,1,0,0
2,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,"Business Development, Data Science and Analytics",Blackline Safety,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
3,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,DATA SCIENTIST,Allstate Canada,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Big Data Instructor,"Cestar College of Business, Health & Technology",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Big Data Instructor,"Cestar College of Business, Health & Technology",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
673,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Business Intelligence Analyst (Power BI),TCU Financial Group,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
674,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Data Engineer,Prodigy Game,0,1,0,1,1,0,0,...,1,1,0,0,0,0,0,1,0,0
675,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,DATA SCIENTIST,Allstate Canada,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [44]:
# Stack the Indeed and LinkedIn results into one frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, so pd.concat is used instead;
# ignore_index=True renumbers the combined rows from 0.
df_tech = pd.concat([df_indeed, df_linkedin], ignore_index=True)

In [45]:
# Inspect the combined results (the rendered output below is truncated with "...").
df_tech

Unnamed: 0,index,job_title,company_name,Excel,Python,R,Java,Scala,C/C++,MATLAB,...,presentation,management,agile,SDLC,decision making,problem solving,Team building,project_management,leadership,consulting
0,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Senior Data Scientist,BMO Financial Group,1,1,1,0,0,0,0,...,1,1,0,0,0,1,0,0,0,0
1,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Data Engineer,Prodigy Game,0,1,0,1,1,0,0,...,1,1,0,0,0,0,0,1,0,0
2,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,"Business Development, Data Science and Analytics",Blackline Safety,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
3,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,DATA SCIENTIST,Allstate Canada,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,https://www.indeed.ca/pagead/clk?mo=r&ad=-6NYl...,Big Data Instructor,"Cestar College of Business, Health & Technology",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,https://ca.linkedin.com/jobs/view/senior-data-...,Senior Data Scientist - Multiple Roles,Shopify,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1466,https://ca.linkedin.com/jobs/view/data-scienti...,Data Scientist,CGI,0,1,1,1,1,0,0,...,1,0,0,1,0,0,0,0,0,0
1467,https://ca.linkedin.com/jobs/view/data-analyst...,Data Analyst,OMERS,0,0,0,0,0,0,0,...,1,1,0,1,1,0,0,0,0,0
1468,https://ca.linkedin.com/jobs/view/data-scienti...,Data Scientist | VC-backed | Fast Growing SaaS...,Process Street,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [46]:
# Persist the combined Indeed + LinkedIn results. With to_csv's default
# index=True the integer row index is written as an extra unnamed first column.
df_tech.to_csv('technical_job.csv')