In [60]:
import os
import re
import smtplib
import time
from email.message import EmailMessage
from urllib.parse import quote_plus

import bs4
import requests


def evaluate_job(job_url):
    """Score a job posting by how often the skills of interest appear on its page.

    The page at ``job_url`` is downloaded and the text of its ``<body>`` is
    searched, case-insensitively, for 'python', 'sql', 'machine learning' and
    'research'. The per-skill counts are printed and their sum is returned.

    Parameters
    ----------
    job_url : str
        URL of the job posting to download.

    Returns
    -------
    int
        Total number of keyword occurrences, or 0 when the page cannot be
        downloaded or has no ``<body>`` element.
    """
    try:
        job_html = requests.request('GET', job_url, timeout = 10)
    except requests.RequestException:
        # Best effort: an unreachable posting simply scores 0. Only network
        # errors are swallowed so that Ctrl-C still interrupts a long scrape.
        return 0

    job_soup = bs4.BeautifulSoup(job_html.content, 'lxml')
    soup_body = job_soup.body
    if soup_body is None:
        # Nothing to score on a page without a <body>.
        return 0

    # Lower-case once so every casing ('Machine learning', 'PYTHON', 'Sql') is counted.
    body_text = soup_body.text.lower()

    python_count = body_text.count('python')
    sql_count = body_text.count('sql')
    ml_count = body_text.count('machine learning')
    research_count = body_text.count('research')
    skill_count = python_count + sql_count + ml_count + research_count
    print ('ML count: {0}, Python count: {1}, SQL count: {2}, Research count: {3}'.format(ml_count, python_count, sql_count, research_count))

    return skill_count

In [14]:
# Spot check: score a single job posting; the bare expression displays the returned count.
evaluate_job('https://uk-amazon.icims.com/jobs/586007/applied-scientist/job?iis=Indeed&iisn=Indeed+%28Free+Posting%29&utm_source=indeed.com&utm_campaign=cv+organic&utm_medium=job_aggregator&utm_content=job_posting&ss=paid&mobile=false&width=1266&height=1200&bga=true&needsRedirect=false&jan1offset=-480&jun1offset=-420')


ML count: 0, Python count: 0, SQL count: 0


0

In [61]:
def extract_job_data_from_indeed(base_url, timeout = 10):
    """Scrape one Indeed results page into a list of per-job dictionaries.

    Every ``<div data-tn-component="organicJob">`` on the page yields one
    dictionary holding the attributes of the job-title link (the ``<a>`` inside
    the card's ``<h2>``) plus two extra keys:

    * ``'company'``     - stripped text of the card's first ``<span>``
    * ``'date posted'`` - stripped text of the card's ``<span class="date">``

    Parameters
    ----------
    base_url : str
        URL of the results page to download.
    timeout : int or float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of dict
        One dictionary per job card. Cards without a title link are skipped;
        a missing company or date is reported as an empty string.
    """
    response = requests.get(base_url, timeout = timeout)
    soup = bs4.BeautifulSoup(response.content, 'lxml')

    attrs_list = []
    for tag in soup.find_all('div', {'data-tn-component' : "organicJob"}):
        title_link = tag.h2.a if tag.h2 is not None else None
        if title_link is None:
            # Without the title link there is no href to follow; skip the card.
            continue

        company_tag = tag.span
        date_tag = tag.find('span', {'class': 'date'})

        # Copy so the parsed tag's own attribute dictionary is left untouched.
        job = dict(title_link.attrs)
        job['company'] = company_tag.text.strip() if company_tag is not None else ''
        job['date posted'] = date_tag.text.strip() if date_tag is not None else ''
        attrs_list.append(job)

    return attrs_list

In [16]:
# Spot check: scrape one results page and display the dictionary built for its first job.
extract_job_data_from_indeed('https://www.indeed.co.uk/jobs?q=machine+learning&l=United+Kingdom&sort=date')[0]


{'class': ['turnstileLink'],
 'company': 'Sidetrade',
 'data-tn-element': 'jobTitle',
 'date posted': 'Just posted',
 'href': '/rc/clk?jk=23a632661b690116&fccid=0bae397722a83c86',
 'itemprop': 'title',
 'onclick': 'setRefineByCookie([]); return rclk(this,jobmap[0],true,0);',
 'onmousedown': 'return rclk(this,jobmap[0],0);',
 'rel': ['noopener', 'nofollow'],
 'target': '_blank',
 'title': 'Data Engineer'}

In [62]:
# Lower-case company names; a posting from one of these is kept even when its
# keyword score is zero (the job finders compare against company.lower()).
extra_interest_companies = [
    'apple',
    'microsoft',
    'google',
    'facebook',
    'amazon',
]

In [68]:
def _days_since_posted(date_posted):
    """Return the whole number of days stated in a date label, or None if it states none.

    '1 day ago' -> 1 and '30+ days ago' -> 30. Labels that do not mention days
    ('Just posted', 'Today', '5 hours ago') give None.
    """
    match = re.search(r'(\d+)\s*\+?\s*days?\b', date_posted, re.IGNORECASE)
    return int(match.group(1)) if match else None


def find_new_jobs_ml(days_ago_limit = 1, starting_page = 0, pages_limit = 10, old_jobs_limit = 5,
                  location = 'United Kingdom', query = 'machine learning', request_delay = 15):
    """Collect recent, relevant jobs from Indeed's date-sorted search results.

    Result pages are walked in order. A job whose date label states an age of
    ``days_ago_limit`` days or more is counted as old and skipped; once
    ``old_jobs_limit`` old jobs have been seen the search stops. Every other
    job is scored with ``evaluate_job`` and kept when the score is at least 1
    or the company is listed in ``extra_interest_companies``.

    Parameters
    ----------
    days_ago_limit : int
        Minimum age in days for a posting to be treated as old.
    starting_page : int
        Zero-based index of the first results page to read.
    pages_limit : int
        Maximum number of results pages to read.
    old_jobs_limit : int
        Stop after this many old postings have been encountered.
    location : str
        Location search term.
    query : str
        Job search term.
    request_delay : int or float
        Seconds to sleep after each evaluated posting.

    Returns
    -------
    str
        One 'company, title, url' line per kept job, separated by blank lines;
        an empty string when nothing was kept.
    """
    # quote_plus encodes spaces as '+' and also escapes characters such as '&'
    # or '+' that would otherwise corrupt the query string.
    indeed_url = 'http://www.indeed.co.uk/jobs?q={0}&l={1}&sort=date&start='.format(
        quote_plus(query), quote_plus(location))
    old_jobs_counter = 0
    new_jobs_list = []

    for i in range(starting_page, starting_page + pages_limit):
        if old_jobs_counter >= old_jobs_limit:
            break

        # The start offset passed to Indeed advances by 10 per page.
        page_url = indeed_url + str(i*10)
        print ('URL: {0}'.format(page_url), '\n')

        # Iterate over every job on the page, including the last one.
        for job in extract_job_data_from_indeed(page_url):
            if old_jobs_counter >= old_jobs_limit:
                break

            href = job.get('href')
            if not href:
                # No link to follow, so the posting cannot be evaluated.
                continue
            title = job.get('title', '')
            company = job.get('company', '')
            date_posted = job.get('date posted', '')

            # Labels that state no age in days are treated as fresh.
            days_old = _days_since_posted(date_posted)
            if days_old is not None and days_old >= days_ago_limit:
                print ('Adding to old_jobs_counter.')
                old_jobs_counter += 1
                continue

            print ('{0}, {1}, {2}'.format(repr(company), repr(title), repr(date_posted)))

            job_link = 'http://indeed.co.uk' + href
            evaluation = evaluate_job(job_link)

            if evaluation >= 1 or company.lower() in extra_interest_companies:
                new_jobs_list.append('{0}, {1}, {2}'.format(company, title, job_link))

            print ('\n')
            # Pause between postings to avoid hammering the site.
            time.sleep(request_delay)

    return '\n\n'.join(new_jobs_list)

In [69]:
def find_new_jobs_ds(days_ago_limit = 1, starting_page = 0, pages_limit = 10, old_jobs_limit = 5,
                  location = 'United Kingdom', query = 'data scientist'):
    """Collect recent, relevant jobs from Indeed, defaulting to a 'data scientist' search.

    The search procedure is the one implemented by ``find_new_jobs_ml``; this
    function differs only in its default ``query``. It delegates rather than
    carrying a second copy of the scraping loop, so the two cannot drift apart.

    Parameters
    ----------
    days_ago_limit : int
        Minimum age in days for a posting to be treated as old.
    starting_page : int
        Zero-based index of the first results page to read.
    pages_limit : int
        Maximum number of results pages to read.
    old_jobs_limit : int
        Stop after this many old postings have been encountered.
    location : str
        Location search term.
    query : str
        Job search term.

    Returns
    -------
    str
        One 'company, title, url' line per kept job, separated by blank lines.
    """
    return find_new_jobs_ml(days_ago_limit = days_ago_limit, starting_page = starting_page,
                            pages_limit = pages_limit, old_jobs_limit = old_jobs_limit,
                            location = location, query = query)

In [38]:
# Manual run of the machine-learning search. The notebook defines no `find_new_jobs`;
# the finders build the search URL themselves from `query` and `location`, and
# return a ready-to-mail string, so nothing is sliced off the result.
links = find_new_jobs_ml(query = 'machine learning', location = 'United Kingdom')

URL: http://www.indeed.co.uk/jobs?q=machine+learning&l=United+Kingdom&sort=date&start=0 

'EY', 'EMEIA Tax Analytics - Data Analytics opportunities', 'Just posted'
ML count: 1, Python count: 1, SQL count: 0


'Jet2.Com Limited', 'Junior IT Operations Engineer', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0


'Emailage', 'Fraud Manager- London', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0


'St George’s University Hospitals', 'Specialist Biomedical Scientist - Haematology', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0


'Harnham', 'DATA SCIENTIST (TECH START-UP)', 'Just posted'
ML count: 6, Python count: 2, SQL count: 1


'Harnham', 'DATA SCIENTIST - (MULTIPLE SENIOR ROLES AVAILABLE)', 'Just posted'
ML count: 6, Python count: 2, SQL count: 2


'LARK & LARKS LTD', 'Engineering Apprentices', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0


'Harnham', 'DATA SCIENTIST (HEAVY MACHINE LEARNING FOCUS)', 'Just posted'
ML count: 8, Python count: 2, S

KeyboardInterrupt: 

In [70]:
def send_gmail(from_addr = 'Solomon Amos <solomonlazio@gmail.com>', to_addr = 'solomonlazio@gmail.com',
               location = 'United Kingdom',
               subject = 'Daily Data Science and Machine Learning Jobs Update Scraped from Indeed', text = None):
    """Email a jobs digest through Gmail's SMTP server.

    Credentials are read from the environment instead of being stored in the
    notebook: ``GMAIL_PASSWORD`` is required, ``GMAIL_USERNAME`` is optional.

    Parameters
    ----------
    from_addr : str
        Value of the From header.
    to_addr : str
        Recipient address.
    location : str
        Location shown on the first line of the message body.
    subject : str
        Subject line.
    text : str or None
        Digest to send; when empty or None a short placeholder is sent instead.

    Raises
    ------
    RuntimeError
        If ``GMAIL_PASSWORD`` is not set.
    """
    # login information
    username = os.environ.get('GMAIL_USERNAME', 'solomonlazio@gmail.com')
    password = os.environ.get('GMAIL_PASSWORD')
    if not password:
        raise RuntimeError('Set the GMAIL_PASSWORD environment variable before sending mail.')

    # EmailMessage encodes non-ASCII characters (e.g. the pound sign in salary
    # ranges), which a plain str handed to SMTP.sendmail() cannot carry.
    message = EmailMessage()
    message['Subject'] = subject
    message['From'] = from_addr
    message['To'] = to_addr
    message.set_content('Jobs in: {0}\n\n{1}'.format(location, text if text else 'No new jobs found.'))

    # send the message; the context manager closes the connection even if a step fails
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.ehlo()
        server.starttls()
        # Identify again over the now-encrypted connection.
        server.ehlo()
        server.login(username, password)
        server.send_message(message)
    print ('Email sent.')

In [71]:
def main():
    """Run the machine-learning search and then the data-scientist search, mailing each digest.

    Both searches use the same settings (first two result pages for the United
    Kingdom) and each result is sent in its own email as soon as it is ready.
    """
    print ('Scraping Indeed now.')

    location = 'United Kingdom'
    search_settings = dict(starting_page = 0, location = location, pages_limit = 2,
                           days_ago_limit = 1, old_jobs_limit = 5)

    def scrape_and_mail(finder, search_query):
        # Run one search and mail whatever it returns.
        jobs_digest = finder(query = search_query, **search_settings)
        send_gmail(text = jobs_digest, location = location)

    scrape_and_mail(find_new_jobs_ml, 'machine learning')
    scrape_and_mail(find_new_jobs_ds, 'data scientist')

In [72]:
# Run the full scrape-and-email job when this code is executed as the main module
# (which is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

Scraping Indeed now.
URL: http://www.indeed.co.uk/jobs?q=machine+learning&l=United+Kingdom&sort=date&start=0 

'Aston University', 'Machine Learning for Graphs', 'Just posted'
ML count: 4, Python count: 0, SQL count: 0, Research count: 5


'Digitek Resourcing Ltd', 'Data Scientist (Machine Learning) | £80,000 - £90,000 + Equity', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0, Research count: 0


'Viavi Solutions', 'Staff Research Data Scientist', 'Just posted'
ML count: 2, Python count: 2, SQL count: 0, Research count: 13


'Oliver Bernard', 'DevOps, AWS, Machine Learning', 'Just posted'
ML count: 0, Python count: 0, SQL count: 0, Research count: 0


'Defence Science and Technology Laboratory', 'Data Scientists', 'Just posted'
ML count: 1, Python count: 0, SQL count: 0, Research count: 2


'Wittin', 'Junior Python Developer', 'Just posted'
ML count: 3, Python count: 4, SQL count: 2, Research count: 1


'Aston University', 'Models and Tools for Advanced Big Data Management', 