In [39]:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import feedparser
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import pdb

import numpy as np

In [39]:
FEED_URL='http://www.indeed.com/jobs?q=data+scientist&l=New+York%2C+NY' 
fp = feedparser.parse(FEED_URL)


In [109]:
fp = feedparser.parse(FEED_URL)
soup = BeautifulSoup(fp['feed']['summary'])

In [110]:
#method accepts a BeautifulSoup parsed HTML object and returns a list of <a> tags of job posts
def get_job_posts(soup_object):
    
    job_posts = []
    a_s = soup_object.find_all('a')
    for a in a_s:
        try:
            if u'jobtitle' in a['class']:
                job_posts.append(a)
        except:
            pass
    
    return job_posts

In [129]:
#method accepts a link to a job post and returns a list of 
def get_words(job_post_href):
    
    #only keep the content
    post_soup = BeautifulSoup(feedparser.parse(job_post_href)['feed']['summary'], 'html.parser')

    text = post_soup.get_text().lower()
    
    #keep only letters, but keep '+' for 'c++', '#' for 'c#','3' for 'd3, '!' for 'go!', and  '-' for 'scikit-learn'
    text = re.sub("[^a-z+#3-]"," ", text) 

    #break into lines to get rid of the annoying '\n' characters. Also project to lowercase.
    lines = [line.strip().lower() for line in text.splitlines()]

    words = []
    for line in lines:
        words += [each_word for each_word in line.split()]
        
    #Lighten the load by getting rid of basic stopwords, like "the", "or" etc    
    words = set(words) - set(nltk.corpus.stopwords.words("english")) 
    
    

In [112]:
#method accepts a list of words in a job post, and creates sets according to pre-determined sets
def words_by_category(words_list):
    
    high_level_languages = set(['r', 'python', 'java', 'scala', 'c', 'c++', 'c#', 'c--', 
                               'f', 'f#', 'go', 'go!', 'groovy', 'julia', 'jscript', 
                               'matlab', 'perl'])
    
    low_level_langages = set(['pascal', 'haskell', 'fortran', 'django', 'flask'])
    
    frameworks = set(['angular', 'angularjs', 'asp', 'node' ])
    
    #operating systems and tools
    operating_systems = set(['unix', 'osx', 'bash', 'batch' , 'curl' ])
    
    misc_tools = set(['latex' ])

In [134]:
def crawl_indeed(query, city, experience_level = "", num_pages = 10):
    
    #clean parameters so that they match Indeed's protocol
    
    #spaces parameters seperated by '+'
    query = query.replace(' ', '+')
    city = city.replace(' ' , '+')
    
    #make sure experience_level is one of the four valid options
    if experience_level not in ['', 'entry_level', 'mid_level', 'senior_level']:
        experience_level = ''
        print "Experience level parameter not valid. Showing all experience levels"
    
    #build data in JSON-like format, given the heirarchial nature of the data
    data = list()
    
    #Indeed shows job posts 10 at a time, so each page starts with post 0, 10, 20...
    #Loop through the posts using the num_pages parameter
    
    page_start_numbers = np.arange(num_pages)*10
    
    for start_number in page_start_numbers:
        
        url = "http://www.indeed.com/jobs?q={0}&l={1}&explvl={2}&start={3}".format(query, city, experience_level, start_number)
    
        #isolate html body and get rid of extraneous HTML objects using feedparser, and create a BeautifulSoup obect
        fp = feedparser.parse(url)
        page_soup = BeautifulSoup(fp['feed']['summary'])
    
        #get a list of all the <a class='jobtitle'></a> elemdents, which are job posts
        job_posts = get_job_posts(page_soup)

        for post in job_posts:
            
            #each job post gets a sub-dictionary
            post_data = dict()
            
            #attributes which are constant amongst all results
            post_data['query'] = query
            post_data['city'] = city
            
            #job title of that sepcific post
            post_data['title'] = post['title']
            
            
            #for each job post, extract the link to the post itself
            post_href = post['href']
            
            #extract all the (cleaned up) words fromt that link
            post_words = get_words(post_href)
            
            pdb.set_trace()
            
            post_data['words'] =post_words
            data.append(post_data)
            
            
        pdb.set_trace()
    #http://www.indeed.com/jobs?q=data+scientist&l=New+York,+NY&explvl=entry_level
    #http://www.indeed.com/jobs?q=data+scientist&l=New+York,+NY&explvl=senior_level&start=0
    

In [135]:
crawl_indeed('data scientist', 'new york')

> <ipython-input-129-4d003a18fd7c>(6)get_words()
-> post_soup = BeautifulSoup(feedparser.parse(job_post_href)['feed']['summary'], 'html.parser')
(Pdb) c
> <ipython-input-134-2d287d37d927>(54)crawl_indeed()
-> post_data['words'] =post_words
(Pdb) c
> <ipython-input-129-4d003a18fd7c>(6)get_words()
-> post_soup = BeautifulSoup(feedparser.parse(job_post_href)['feed']['summary'], 'html.parser')
(Pdb) c
> <ipython-input-134-2d287d37d927>(52)crawl_indeed()
-> pdb.set_trace()
(Pdb) data
[{'query': 'data+scientist', 'title': u'Machine Learning Quantitative Analyst', 'words': None, 'city': 'new+york'}]
(Pdb) q


BdbQuit: 

In [84]:
a = set([1,4,2,6])
b = set([3,5,4,2,6]

In [92]:
a and b

{2, 3, 4, 5, 6}

In [13]:
job_post_href = 'https://careers.bloomberg.com/job/detail/54015?utm_source=Indeed&utm_campaign=Bloomberg_Indeed&sponsored=ppc&utm_campaign=SalesUS'
feedparser.parse(job_post_href)

post_soup = BeautifulSoup(feedparser.parse(job_post_href)['feed']['summary'], 'html.parser')
