 # import libraries

In [None]:
# requests lib to handle the GET requests
import requests
# beautiful soup to parse the html from the GET response
from bs4 import BeautifulSoup as bsoup
# regex to format and clean text strings
import re
# time module used to mark the processing start time
from time import time
# sleep function used to limit the request cycle time to avoid overloading the server
from time import sleep

# used to monitor the processing stats during runtime
from IPython.core.display  import clear_output

# authentication for accessing and writing to google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# used to manipulate data into easy-to-convert forms for storage
import numpy as np
import pandas as pd

# set initial variables

In [None]:
# URL query variables used to build the GET request.
# Values are written with plain spaces rather than '+': they are passed to
# requests via `params`, which URL-encodes them itself, so a literal '+'
# would be sent as '%2B' instead of being read as a space.
query = 'data scientist'
location = 'United States'
job_type = 'fulltime'
level = 'entry_level'
# offset of the first search result to request
start = 0

In [None]:
# endpoint that receives the job search GET request
url = 'https://www.indeed.com/jobs'

# query-string parameters, filled in one key at a time from the
# query variables defined earlier in the notebook
payload = {}
payload['q'] = query
payload['l'] = location
payload['jt'] = job_type
payload['explvl'] = level
payload['start'] = str(start)


# data retrieval function

In [None]:
# Scrape the search results page by page and collect the cleaned summary
# bullet strings into new_list, which the storage cells further down read.
new_list = []
start_time = time()
pages = 0
# maximum number of result pages to request; clamped inside the loop to the
# total reported by the site when that total is smaller
limit = 5

# literal tag fragments stripped from each <li> element, in removal order
_TAG_FRAGMENTS = ('<b>', '</b>', '<li style="margin-bottom:0px;">', '</li>', '<li>')

while pages < limit:
    clear_output(wait=True)

    # Request the page at the current offset. The request has to happen
    # inside the loop with an updated 'start' value, otherwise every
    # iteration would re-parse the very same first page.
    page_params = dict(payload, start=str(start))
    response = requests.get(url, params=page_params, timeout=30)

    # stop instead of parsing a missing or stale document
    if not response.ok:
        print('Request failed with status {}; stopping.'.format(response.status_code))
        break

    data = response.text
    soup = bsoup(data, 'html.parser')

    # Read the total from the 'searchCountPages' div: its string form is split
    # on spaces and the second-to-last token is taken as the total.
    # NOTE(review): the original comments treat this number as a page count;
    # confirm it is not a result count, since 'start' advances by 10 per page.
    page_count_raw = soup.find('div', id='searchCountPages')
    if page_count_raw is not None:
        page_count_split = re.split(' ', str(page_count_raw))
        try:
            # remove the thousands separator so the token converts to an int
            max_limit = int(re.sub(',', '', page_count_split[-2]))
        except (IndexError, ValueError):
            # unexpected div layout: keep the current limit
            max_limit = None
        # only ever lower the limit; the configured limit stays an upper bound
        if max_limit is not None and limit > max_limit:
            limit = max_limit

    # every div with the 'summary' class, and the <li> elements found in each
    summary_div = soup.find_all('div', class_='summary')
    summary_list = [summary.find_all('li') for summary in summary_div]

    # words of the current page only; rebuilt on every iteration
    word_list = []
    for item in summary_list:
        for re_item in item:
            text = str(re_item)
            for fragment in _TAG_FRAGMENTS:
                text = text.replace(fragment, '')
            # the final character of the cleaned string is dropped
            text = text[0:-1]
            new_list.append(text)
            word_list.extend(text.split(' '))

    # pause between requests to avoid overloading the server
    sleep(0.5)

    # output some logs for monitoring
    elapsed_time = time() - start_time
    pages += 1
    print('Requests: {}, Frequency: {} requests/s, {} strings processed.'.format(pages, pages/elapsed_time, len(new_list)))
    # advance the result offset for the next request
    start += 10

    print(str(pages)+' pages processed...')

print('Processing complete!')


Requests: 6689, Frequency: 1.5746016815345876 requests/s, 207359 strings processed.
6689 pages processed...
Processing complete!


# data writing and storage

In [None]:
# install PyDrive, the library imported above for authenticating with and writing to google drive

!pip install -U -q PyDrive

In [None]:
# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
# `auth` is the google.colab auth module imported at the top of the notebook.
auth.authenticate_user()
# PyDrive auth object; its credentials are filled in on the next line
gauth = GoogleAuth()
# hand PyDrive the credentials returned by GoogleCredentials.get_application_default()
gauth.credentials = GoogleCredentials.get_application_default()
# PyDrive client built from the authenticated GoogleAuth object
drive = GoogleDrive(gauth)

In [None]:
# wrap the collected summary strings in a one-dimensional pandas Series
new_list_array = pd.Series(data=new_list)

# write the Series to disk as a csv file
new_list_array.to_csv(path_or_buf='pd_job_data.csv')

In [None]:
# For every collected sentence: strip all commas, then split on single spaces.
# The result is a list of word lists, one inner list per sentence.
new_list_split = [
    sentence.replace(',', '').split(' ')
    for sentence in new_list
]

# One row per sentence, one column per word position in that sentence.
pd_word_list = pd.DataFrame(new_list_split)

# Persist the word table as a csv file.
pd_word_list.to_csv('pd_job_data_words.csv')