In [1]:
# Setting up environment
# ----------------------
# Import libraries
import psycopg2
import cloudscraper
import time
from bs4 import BeautifulSoup
import sys
import os

# Set custom path for secrets
sys.path.insert(0,"/home/nuclear/Github/PythonPrograms/Secrets")

In [2]:
# Feedback for sql commands
# -------------------------
# Function to run try-except wrapped sql commands 
def run_sql (conn, sql_cmnd):
    cur = conn.cursor()
    try:
        cur.execute(sql_cmnd)
        print('success: ' + sql_cmnd)
    except:
        print('FAIL: ' + sql_cmnd)
    conn.commit()
    return 0
    

In [3]:
# Connecting to local database
# ----------------------------
# Connect to database
import crawler_pg_secrets as p
pg_params = {
    'host' : p.host,
    'user' : p.user,
    'dbname' : p.dbname,
    'password' : p.password,
    'port' : p.port
}
conn =  psycopg2.connect(**pg_params)
cur = conn.cursor()
conn.commit()

In [4]:
# Creating job url table
# ----------------------
table_name = 'indeed_horizon'

# Drop previous horizon table
sql_cmnd = "drop table " + table_name + ";"
run_sql(conn,sql_cmnd)

# Create new horizon table
sql_cmnd = "create table "\
    + table_name\
    + " (id serial, url varchar(2046), visited integer);"
run_sql(conn,sql_cmnd)

# Create MD5 hashed index
sql_cmnd = 'CREATE UNIQUE INDEX url_md5 ON indeed_horizon(MD5(url));'
run_sql(conn,sql_cmnd)

success: drop table indeed_horizon;
success: create table indeed_horizon (id serial, url varchar(2046), visited integer);
success: CREATE UNIQUE INDEX url_md5 ON indeed_horizon(MD5(url));


0

In [5]:
# Generate page urls
# -------------------
# Defining indeed endpoint, example: jobs?q=data+analyst&l=Toronto'&start=40
def create_end_url(job = 'data analyst', location = 'toronto', page = 3):
    connector_01 = '&l='
    connector_02 = "&start="
    job_url = job.strip().replace(" ","+")
    page_url = (page - 1) *10
    location_url = location
    end_url = '/jobs?q=' + job_url + connector_01 + location_url + connector_02 + str(page_url)
    print(end_url)
    return end_url

# Defining indeed full url, example: https://ca.indeed.com/jobs?q=data+analyst&l=Toronto'&start=40
def create_full_url(end_url):
    base_url = 'https://ca.indeed.com'
    full_url =  base_url + end_url
    print(full_url)
    return full_url

end_url = create_end_url('data entry', 'toronto',1) # testing url = create_url('data entry', 'ottawa',9)
full_url = create_full_url(end_url)


/jobs?q=data+entry&l=toronto&start=0
https://ca.indeed.com/jobs?q=data+entry&l=toronto&start=0


In [6]:
#Build table of horizons
# -----------------------
# Function to enter url into database
def db_input (url, visited = 0, table_name = 'indeed_horizon'):
    sql_cmnd = "insert into indeed_horizon (url, visited) values ('" + url + "'," +str(visited)+ ");"
    exit_status = run_sql(conn,sql_cmnd)
    return exit_status

# Extract urls from indeed
# ------------------------
# Parse page (from response object) for job urls
def scrape_soup(soup,advert_count):
    print("advert type count begining: ", type(advert_count))
    job_url_base = 'https://ca.indeed.com'
    ident='jcs-JobTitle css-jspxzf eu4oa1w0'
    class_link = soup.find_all(class_=ident) # Div class containing job link
    for count,link in enumerate(class_link):
        advert_count += 1
        job_url_end = link.get('href')
        job_url = job_url_base + job_url_end
        exit_status = db_input(job_url)
        #print("advert_count before, scrape_soup: ",  advert_count)
        # if exit_status == 0:
        #     advert_count += 1
        #print("advert_count after, scrape_soup: ",  advert_count)
        print("advert type count end: ", type(advert_count))
        print(advert_count)
        # print(job_url)
    return advert_count

In [7]:
# Scraping with cloudscraper
# --------------------------
# Scraping instance
print(full_url)
scraper = cloudscraper.create_scraper()

# Cycling through pages
exit_flag = 0
advert_count = 0
max_adverts = 500
while exit_flag == 0 or advert_count < max_adverts:
    response = scraper.get(full_url)
    soup = BeautifulSoup(response.text,"html.parser")
    # Extract adverts from page
    print("advert_count before, in page loop: ",  type(advert_count))
    advert_count = scrape_soup(soup, advert_count)
    print("advert_count after, in page loop: ",  type(advert_count))
    print(response.status_code)
    # Find next page
    try:
        next_page_resultset = soup.find_all(attrs = {'data-testid':'pagination-page-next'})
        next_page_end_link = next_page_resultset[0]['href']
        # print("-"*20)
        # print("next page end link ", next_page_end_link)
        # print("-"*20)
        time.sleep(1)
        full_url = create_full_url(next_page_end_link)
    except:
        exit_flag = 1


https://ca.indeed.com/jobs?q=data+entry&l=toronto&start=0
advert_count before, in page loop:  <class 'int'>
advert type count begining:  <class 'int'>
success: insert into indeed_horizon (url, visited) values ('https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CtOLQuzmrh6rolX6PYAaVTUw4CxF4CsQm2uWOONXnXnsLIrssNiFGMrUWuXTsdr3eCeNk9hj07FJM88dTUCh0jFYFifjD0mWqIXPuhg0Xcsrhq5AvT4pwPQFO3HsQmLmGaNMo9F42kej3yzkrIEdYI80fwbIxlis7uzr6krbDcH0swcIXVg3SLXJ-6euOidGB73LmLSZT_hpvrb2l6F9Hiexy43j9h0Ob4QsHjm_lFAmJ3t5f3tjV0GFweBQlOrp2nmTTGYoozUaqXJxdDlaKnawTkNrLmVGvetb5SFyrHtPMSwzrQSObNF5KnSlAxVNR9E5DpOflo3YjQclTWsQH4uvPzjCRIhdBRdea9yjbMuCW2aQXHMt8PYMpiyzEf-Su8WKPCc7cmvbX6wtbkIZDoato9efLaFtYBuO-fpH7EgR6HdcGRPaGJKCs54OgHxoobAiBd06txVldPI7Oa7N1FkSsvlti0VSyuiAPje4Zy4A==&xkcb=SoBk-_M3WHvbOL2bIh0LbzkdCdPP&p=0&fvj=1&vjs=3',0);
advert type count end:  <class 'int'>
1
success: insert into indeed_horizon (url, visited) values ('https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A6nqIdTNLNESQQS_Umgb5eR_FfIWxoHCkjaHK


## Testing

In [8]:
print(type(advert_count))

<class 'int'>
