<b>

<p>
<center>
<font size="5">
Natural Language Processing (Spring 2020)
</font>
</center>
</p>

<p>
<center>
<font size="4">
PROJECT: AUTOMATIC TERMINOLOGY ANALYSIS FOR DATA SCIENCE JOB DESCRIPTIONS 
</font>
</center>
</p>

<p>
<center>
<font size="3">
Authors: Anwesha Tomar, Marta Matosas Fonolleda, Sandra Valdes Salas
</font> 
</center>
</p>

<p>
<center>
<font size="3">
Part 1: Web scraping code
</font> 
</center>
</p>



### Setup Google Colab

In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [0]:
#%cd /content/drive/My\ Drive/Colab\ Notebooks/NLP

# 1) Import libraries

In [0]:
import json
import re
from random import randint
from time import sleep
from time import time
from urllib.parse import quote_plus
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core.display import clear_output

# 2) Build functions for web scraping 

## 2.1 Convert URL based on query (position and city)

In [0]:
def convert_url(position, city):
    '''
    Build the Indeed search URL for a position / city query.

    Both values are encoded for use inside a query string: spaces become
    '+', and reserved characters such as '+', '&' or '#' are
    percent-escaped so they cannot corrupt the URL
    (e.g. 'c++ developer' -> 'c%2B%2B+developer').

    Parameters:
        position (str): job title or keywords, e.g. 'data science'
        city (str): location, e.g. 'washington dc'
    Returns: string (search url, results sorted by date)
    '''
    search_url = "https://www.indeed.com/jobs?q={}&l={}&sort=date"
    return search_url.format(quote_plus(position), quote_plus(city))

In [0]:
# Sample: build the search url for the query 'data science' in 'washington dc'
convert_url('data science', 'washington dc')

'https://www.indeed.com/jobs?q=data+science&l=washington+dc&sort=date'

## 2.2 Get total pages of query

In [0]:
def get_max_pages(search_url):
    '''
    Work out the upper "start" offset for paging through a search.

    Downloads the first results page, reads the total job count from the
    'searchCountPages' element inside 'resultsCol', rounds it down to a
    multiple of 10 and caps the result at 990.

    NOTE: because the count is rounded down, a loop such as
    range(0, max_pages, 10) does not visit a trailing partial page.

    Parameters:
        search_url (str): search url without the "&start=" parameter
    Returns: integer (multiple of 10, at most 990)
    Raises: ValueError if the job count cannot be found or parsed
    '''
    url = search_url + "&start=00"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')

    job_link_area = soup.find(id='resultsCol')
    count_tag = None
    if job_link_area is not None:
        count_tag = job_link_area.find("div", {"id": "searchCountPages"})
    if count_tag is None:
        raise ValueError("Job count not found on page: {}".format(url))

    # The total is the 4th whitespace-separated token, possibly with
    # thousands separators (presumably text like "Page 1 of 1,234 jobs").
    tokens = count_tag.text.split()
    try:
        total_jobs = int(tokens[3].replace(',', ''))
    except (IndexError, ValueError):
        raise ValueError("Unexpected job count text: {!r}".format(count_tag.text))

    max_pages = (total_jobs // 10) * 10
    # Cap at 990 for every large count (previously exactly 1000 slipped through).
    return min(max_pages, 990)

In [0]:
# Sample: compute the page limit for a search url (this performs a live HTTP request)
get_max_pages('https://www.indeed.com/jobs?q=data+science&l=washington+dc&sort=date')

990

In [0]:
# Sample: address a specific results page by appending a "start" offset to the search url
search_url = 'https://www.indeed.com/jobs?q=data+science&l=washington+dc&sort=date'
page_url = "{}&start={}".format(search_url, 10)
page_url

'https://www.indeed.com/jobs?q=data+science&l=washington+dc&sort=date&start=10'

## 2.3 Set crawler: get all job URLs displayed on a page

In [0]:
def get_job_urls(soup_obj, base_url='https://www.indeed.com'):
    '''
    Collect the link of every job listed on a parsed results page.

    Looks inside the element with id 'resultsCol' for <h2> tags whose
    class attribute is exactly ['title'] and takes the href of the first
    <a> in each one.

    Parameters:
        soup_obj: parsed results page (BeautifulSoup object)
        base_url (str): prefix prepended to every href
    Returns: list of url (string); empty if the page has no results
             column. Headings without a usable link are skipped.
    '''
    job_link_area = soup_obj.find(id='resultsCol')
    if job_link_area is None:
        # No results column on this page: nothing to collect.
        return []

    job_urls = []
    for heading in job_link_area.find_all('h2'):
        if heading.get('class') != ['title']:
            continue
        anchor = heading.find('a')
        href = anchor.get('href') if anchor is not None else None
        # Skip headings with no anchor or no href instead of raising.
        if href:
            job_urls.append(base_url + href)
    return job_urls

In [0]:
# Sample: download one results page (start=20), collect its job links and show the first one.
# This performs a live HTTP request.
base_url_page = 'https://www.indeed.com'  # not used further in this cell
url_page = 'https://www.indeed.com/jobs?q=data+science&l=washington+dc&sort=date&start=20'
r = requests.get(url_page)
soup = BeautifulSoup(r.content, 'lxml')
jobs_urls = get_job_urls(soup)
jobs_urls[0]

'https://www.indeed.com/rc/clk?jk=bb6228270244aeae&fccid=05d6cb8b919478a9&vjs=3'

## 2.4 Extract job information

### 2.4.1 Extract business sector for each job posting

In [0]:
#extract business sector 
def fetch_sector(soup_obj):
    '''
    Look up the business sector of the company behind a job posting.

    Follows the company link found in the parsed job page, downloads
    that page and reads the text of its 'cmp-AboutMetadata-plainLink'
    anchor.

    Parameters:
        soup_obj: parsed job page (BeautifulSoup object)
    Returns: string, or None when the link or the sector is missing or
             the company page cannot be downloaded
    '''
    try:
        company_link = soup_obj.find('div', class_="icl-u-lg-mr--sm icl-u-xs-mr--xs").find('a').get('href')
        # Close the HTTP response as soon as it has been parsed, and name
        # the parser explicitly, as the other scraping functions do.
        with urlopen(company_link) as page:
            soup = BeautifulSoup(page, 'lxml')
        return soup.find('a', class_="cmp-AboutMetadata-plainLink").text
    except Exception:
        # Best effort: the sector is optional. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt stop a long scrape.
        return None

### 2.4.2 Create dictionary with url, position, company, description, sector for each job posting

In [0]:
def extract_job_information(job_url):
    '''
    Download a job posting and extract its main fields.

    Parameters:
        job_url (str): url of the job posting
    Returns: dictionary with keys 'url', 'position', 'company',
             'description' and 'sector'. A field whose element is
             missing from the page is set to None.
    '''
    # Close the HTTP response once parsed, and name the parser explicitly,
    # as the other scraping functions do.
    with urlopen(job_url) as page:
        soup = BeautifulSoup(page, 'lxml')

    def text_of(css_class):
        # Text of the first <div> with this class, or None when absent.
        tag = soup.find('div', class_=css_class)
        return tag.text if tag is not None else None

    job = {}
    job['url'] = job_url
    job['position'] = text_of('jobsearch-JobInfoHeader-title-container')
    job['company'] = text_of('icl-u-lg-mr--sm icl-u-xs-mr--xs')
    job['description'] = text_of('jobsearch-jobDescriptionText')
    job['sector'] = fetch_sector(soup)
    return job


## 2.5 Function for web scraping

In [0]:
def job_scraper(position, city, max_pages=10):
    '''
    Scrape Indeed search results for a position / city query and extract
    the information of every job posting found.

    Result pages are requested with "&start=" offsets 0, 10, 20, ...
    below max_pages. Scraping stops at the first request whose status
    code is not 200; the jobs collected up to that point are returned.

    Parameters:
        position (str): job title or keywords
        city (str): location
        max_pages (int or None): upper bound (exclusive) for the "start"
            offset. The default of 10 scrapes only the first page
            (sample run). Pass None to derive it with get_max_pages.
    Returns: list of dictionaries (one per job, see extract_job_information)
    '''
    # Prepare url
    url_search = convert_url(position, city)

    # Get max pages
    if max_pages is None:
        max_pages = get_max_pages(url_search)

    # set counters
    req = 0  # requests counter
    jobs_extracted = 0  # job urls counter

    data = []

    # iterate through each page
    for num in range(0, max_pages, 10):
        url_current_page = url_search + "&start=" + str(num)
        r = requests.get(url_current_page)
        req += 1

        # Check the status before parsing: a failed response is not a results page.
        if r.status_code != 200:
            print('Request: {} | Status code: {}'.format(req, r.status_code))
            break

        soup = BeautifulSoup(r.content, 'lxml')

        # Get list of url's of jobs
        job_urls = get_job_urls(soup, base_url='https://www.indeed.com')
        jobs_extracted += len(job_urls)

        # For each url-job, extract its information and append it to the results
        for url in job_urls:
            data.append(extract_job_information(url))

        # Report total requests and total job url's retrieved. With wait=True
        # the line stays visible until the next output replaces it.
        print("Request: {} | Total jobs extracted: {}".format(req, jobs_extracted))
        clear_output(wait=True)

    return data

# 3) Scrape data

In [0]:
# Run this cell to scrape the data (performs live HTTP requests).
# Only one of the two queries below should be active at a time, since both assign to `data`.

data = job_scraper("data scientist", "washington dc")
#data = job_scraper("software engineer", "washington dc")

Request: 1 | Total jobs extracted: 15


In [0]:
# Inspect the first scraped job record
data[0]

{'company': 'Guzman & Griffin Technologies Inc',
 'description': "Please Apply using this link: https://app.smartsheet.com/b/form/2cb8018ed6a041b0870e3cd056c286abOur FocusIn aviation safety, we seek to minimize the potential for harm to the flying public. To support thiseffort, we collect vast amounts of operational and simulation data. How can we use statisticalanalyses and data science to analyze this data and effect meaningful change to aviation safety? Ifthis sounds interesting to you, GGTI seeks a Data Scientist to join its Data Science & Analytics Team.What’s the Job?· Preprocessing, cleansing, and verifying integrity of data that can be provided as input foradvanced analytics· Ad-hoc analysis and presentations of results for technical and non-technical audiences· Feature selection, model building and optimization using machine learning techniques· Building regression, classification, association models· Building anomaly detection systemsAbout YouYou are an experienced Data Scien

# 4) Store data in json file

In [0]:
def store_data(filename, dictionary):
    '''
    Write the scraped data to a JSON file, indented by 4 spaces.

    Parameters:
        filename (str): path of the file to create or overwrite
        dictionary: any JSON-serialisable object, e.g. the list of job
            dictionaries returned by job_scraper
    Returns: None
    '''
    # json.dump writes to the file and returns None, so there is no value
    # to hand back. The encoding is fixed so the file does not depend on
    # the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, indent=4)

In [0]:
#Implement this to store json file in working directory 

#store_data('dc_datascience.json', data) 
#store_data('dc_software-engineer.json', data)