In [None]:
import random, json
import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time, os


# Upper bound on the number of job results collected for one city.
max_results_per_city = 500
# Number of jobs shown on each result page.
page_record_limit = 50
# Number of result pages to walk through to reach the per-city maximum.
num_pages = max_results_per_city // page_record_limit

def get_jobs_info(search_location):
    '''
    Return job records, preferring the saved JSON file over a fresh scrape.
    Input:
        search_location - city to search jobs in; only used (passed to
                          web_scrape) when no saved file exists.
    Output:
        jobs_info - a list with one record per job, either loaded from
                    JOBS_INFO_JSON_FILE or returned by web_scrape.
    '''
    # Scrape only on a cache miss. Scraping unconditionally would launch the
    # browser on every call and overwrite the saved file that is about to be
    # read.
    if os.path.isfile(JOBS_INFO_JSON_FILE):
        with open(JOBS_INFO_JSON_FILE, 'r') as fp:
            return json.load(fp)

    return web_scrape(search_location)
        
def web_scrape(search_location):
    '''
    Scrape job postings from indeed.com and save them as JSON.
    Input:
        search_location - city to search jobs in. When empty, every entry
                          of JOB_LOCATIONS is searched instead.
    Output:
        jobs_info - a list with one (title, company, location, desc) tuple
                    per scraped job, accumulated over all pages and all
                    locations. The same list is written to
                    JOBS_INFO_JSON_FILE.
    '''
    # Imported locally so the function does not rely on a file-level import.
    from selenium.webdriver.common.by import By

    # The selectors match exact attribute strings, so they stop matching if
    # the site's markup changes.
    title_xpath = "//h2[@class='jobTitle css-1h4a4n5 eu4oa1w0']"
    company_xpath = "//a[@data-tn-element='companyName']"
    location_xpath = "//div[@class ='companyLocation']"
    card_xpath = "//div[@class = 'slider_container css-g7s71f eu4oa1w0']"
    desc_xpath = "//div[@id = 'jobDescriptionText']"
    # Number of job cards opened per page to read their descriptions.
    cards_per_page = 5

    # Records from every page and location; never left unbound, so the
    # save/report code below works even when nothing was scraped.
    scraped_jobs = []
    # Record time for web scraping
    start = time.time()
    # Launch webdriver
    driver = webdriver.Chrome(WEBDRIVER_PATH)
    try:
        # Set timeout
        driver.set_page_load_timeout(80)
        # NOTE(review): this mutates the shared default capabilities after the
        # driver already exists, so it presumably only affects drivers created
        # later - confirm whether it is still needed.
        webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"

        # If search location is defined, only search that location
        search_locations = [search_location] if search_location else JOB_LOCATIONS

        for location in search_locations:
            url = 'https://www.indeed.com/jobs?q='+ JOB_SEARCH_WORDS + '&l=' \
            + location + '&limit=' + str(page_record_limit) + '&fromage='+ str(DAY_RANGE)
            driver.get(url)
            time.sleep(5)
            for page_index in range(num_pages):
                # Remember the current result page: it is reloaded before each
                # card click, and reloading the initial search url instead
                # would jump back to page 1 and break pagination.
                page_url = driver.current_url

                job_names = [el.text for el in driver.find_elements(By.XPATH, title_xpath)]
                company_names = [el.text for el in driver.find_elements(By.XPATH, company_xpath)]
                card_locations = [el.text for el in driver.find_elements(By.XPATH, location_xpath)]

                descriptions = []
                for card_index in range(cards_per_page):
                    # Reload so the card elements are fresh before clicking.
                    driver.get(page_url)
                    time.sleep(5)
                    cards = driver.find_elements(By.XPATH, card_xpath)
                    if card_index >= len(cards):
                        # Fewer cards than expected on this page.
                        break
                    cards[card_index].click()
                    time.sleep(5)
                    panes = driver.find_elements(By.XPATH, desc_xpath)
                    # Keep a placeholder when no description shows up so the
                    # columns stay aligned and no IndexError is raised.
                    descriptions.append(panes[0].text if panes else '')

                # zip stops at the shortest column, so a page contributes at
                # most as many records as descriptions were collected.
                # NOTE(review): columns are matched by position only; a card
                # without a company link would shift later rows - verify.
                scraped_jobs.extend(
                    zip(job_names, company_names, card_locations, descriptions))
                print ('scraping {} page {}'.format(location, page_index + 1))

                # Go next page
                try:
                    driver.find_element(By.LINK_TEXT,'Next »').click()
                except NoSuchElementException:
                    # If nothing find, we are at the end of all returned results
                    print ("{} finished".format(location))
                    break
                time.sleep(3)
    finally:
        # Close and quit webdriver even when scraping fails part-way.
        driver.quit()

    with open(JOBS_INFO_JSON_FILE, 'w') as fp:
        json.dump(scraped_jobs, fp)
    end = time.time() # end time
    # Calculate web scraping time in minutes
    scraping_time = (end-start)/60.
    print('Took {0:.2f} minutes scraping {1:d} data scientist jobs'.format(scraping_time, len(scraped_jobs)))
    return scraped_jobs