In [10]:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import feedparser
from collections import Counter
import nltk
import re
import pdb
import pandas as pd
import numpy as np

In [2]:
#method accepts a BeautifulSoup parsed HTML object and returns a list of <a> tags of job posts
def get_job_posts(soup_object):
    
    job_posts = []
    a_s = soup_object.find_all('a')
    for a in a_s:
        try:
            if u'jobtitle' in a['class']:
                job_posts.append(a)
        except:
            pass
    
    return job_posts

In [16]:
def replace_spaces(text):
    
    replace_list = ['machine learning', 'neural network', 'decision tree', 'graph database'
               'supervised learning', 'unsupervised learning', 'reinforcement learning', 
               'logistic regression', 'linear regression', 'naive bayes', 'random forest', 
               'deep learning', 'support_vector_machines', 'advanced degree', 'computer science'
               'ph d', 'm sc', 'b sc', 'b a' 'feature selection', 'natural language', 'sql server'
                'operations research', 'voted best place to work', 'cloud computing', 'century link']
    
    for word in replace_list:
        text = text.replace(word, word.replace(' ', '_'))
        
    return text

In [20]:
#method accepts a link to a job post and returns a list of 
def get_words(job_post_href):
    
    #only keep the content
    try:
        post_soup = BeautifulSoup(feedparser.parse(job_post_href)['feed']['summary'], 'html.parser')
    except:
        return []
    
    text = post_soup.get_text().lower()
    
    # keep only letters, but keep '+' for 'c++', '#' for 'c#','3' for 'd3,
    # '2' for 'db2', and  '-' for 'scikit-learn'
    text = re.sub("[^a-z+#32-]"," ", text)
        
    #replace spaces in between important phrases with underscores so they don't get seperated as different words
    text = replace_spaces(text)
    
    
    
    #break into lines to get rid of the annoying '\n' characters. Also project to lowercase.
    lines = [line.strip().lower() for line in text.splitlines()]

    words = []
    for line in lines:
        words += [each_word for each_word in line.split()]
        
    #Lighten the load by getting rid of basic stopwords, like "the", "or" etc    
    words = set(words) - set(nltk.corpus.stopwords.words("english")) 
    
    return words
    

In [18]:
#method accepts a list of words in a job post, and creates sets according to pre-determined sets
def words_by_category(job_posts):
    
    buzz_words = set(['big_data', 'terabytes', 'petabytes', 'voted_best_place_to_work'])
    
    languages = set(['r', 'python', 'java', 'scala', 'c', 'c++', 'c#', 'c--', 
                               'f', 'f#', 'groovy', 'julia', 'jscript', 
                               'matlab', 'perl', 'javascript', 'php', 'swift', 
                               'coffee', 'sql','psql', 'tsql',  'mathematica', 'wolfram', 'pascal'])#want to include 'go!' once find out how to allow '!'
    
    frameworks = set(['angular',  'angularjs', 'typescript' ,'backbone', 'underscore' ,
                      'asp', 'node', 'django', 'flask' ])
    
    hadoop_technologies = set(['hadoop', 'pig', 'mapreduce', 'spark', 'hive', 'flume', 
                              'shark', 'zookeeper', 'mahout' ])

    #operating systems and tools
    operating_system_tools = set(['unix', 'osx', 'bash', 'batch' , 'curl', 'linux', 'windows', 'git' ])
    
    cloud_computing = set(['cloud_computing', 'aws', 'azure', 'cloudera', 'chef', 'joyent', 'rackspace', 
                           'century_link' ])
        
    machine_learning_algorithms = set(['decision_trees', 'clustering', 'regression', 'anova', 
                                       'k-means', 'neural_networks', 'supervised learning', 'unsupervised_learning',
                                       'reinforcement_learning', 'logistic_regression', 'linear_regression', 'naive_bayes', 'random_forest', 
                                       'deep_learning', 'pca', 'shrinkage', 'regression', 'support_vector_machines', 'svm', 
                                      'boosted', 'feature_selection', 'natural_language', 'nlp', 'cart', ])
    
    analysis_tools = set(['excel', 'sas', 'spss', 'sas', 'tableau', 'visual_basic' ])
    
    relational_databases = set(['redshift', 'postgressql', 'mysql', 'oracle', 'db2', 'h2', 
                               'sqlbase', 'libreoffice', 'netezza', 'azure', 'firebird', 'sql_server'])
    
    nosql_technologies = set(['nosql', 'hbase', 'cloudera', 'cassandra', 'scylla', 'mongodb', 'sonarw', 'jsonar', 
                             'elassandra', 'couchdb', 'rethinkdb', 'dynamodb', 'arangodb'])
    
    graph_databases = set(['graph_databases', 'graph_database', 'graph_databases', 'neo4j', 'arangodb', 'orientdb', 'graphbase', 'trinity'])
    
    academic_degrees = set(['bachelor', 'master','doctor', 'doctorate', 'ph_d', 'phd', 
                            'm_sc', 'b_a', 'b_sc', 'advanced_degree', 'mba'])
    
    academic_disciplines = set(['mathematics', 'statistics', 'computer_science', 'business', 'stem', 
                               'operations_research'])
    
    
    #for each job post see which skills appear. Leave as python.set data strucuture for effecient set operations later
    for post in job_posts:
        post['buzz_words'] = set(post['words']) & buzz_words
        post['languages'] = set(post['words']) & languages
        post['frameworks'] = set(post['words']) & frameworks
        post['hadoop_technologies'] = set(post['words']) & hadoop_technologies
        post['operating_system_tools'] = set(post['words']) & operating_system_tools
        post['machine_learning_algorithms'] = set(post['words']) & machine_learning_algorithms
        post['analysis_tools'] = set(post['words']) & analysis_tools
        post['relational_databases'] = set(post['words']) & relational_databases
        post['nosql_technologies'] = set(post['words']) & nosql_technologies
        post['graph_databases'] = set(post['words']) & graph_databases
        post['academic_degrees'] = set(post['words']) & academic_degrees
        post['academic_disciplines'] = set(post['words']) & academic_disciplines
        post['academic_degrees'] = set(post['words']) & academic_degrees
        
        post['all_skills'] = buzz_words.union(languages).union(frameworks).union(
            hadoop_technologies).union(operating_system_tools).union(machine_learning_algorithms).union(
            analysis_tools).union(relational_databases).union(nosql_technologies).union(graph_databases).union(
            academic_degrees).union(academic_disciplines) & set(post['words'])
        
        

In [6]:
def crawl_indeed(query, city = "", experience_level = "", num_pages = 10):
    
    #clean parameters so that they match Indeed's protocol
    
    #spaces parameters seperated by '+'
    query = query.replace(' ', '+')
    city = city.replace(' ' , '+')
    
    #make sure experience_level is one of the four valid options
    if experience_level not in ['', 'entry_level', 'mid_level', 'senior_level']:
        experience_level = ''
        print "Experience level parameter not valid. Showing all experience levels"
    
    #build data in JSON-like format, given the heirarchial nature of the data
    data = list()
    
    #Indeed shows job posts 10 at a time, so each page starts with post 0, 10, 20...
    #Loop through the posts using the num_pages parameter
    
    page_start_numbers = np.arange(num_pages)*10
    
    for start_number in page_start_numbers:
        
        url = "http://www.indeed.com/jobs?q={0}&l={1}&explvl={2}&start={3}".format(query, city, experience_level, start_number)
    
        #isolate html body and get rid of extraneous HTML objects using feedparser, and create a BeautifulSoup obect
        fp = feedparser.parse(url)
        try:
            page_soup = BeautifulSoup(fp['feed']['summary'])
        except:
            pass
    
        #get a list of all the <a class='jobtitle'></a> elemdents, which are job posts
        job_posts = get_job_posts(page_soup)

        for post in job_posts:
            
            #each job post gets a sub-dictionary
            post_data = dict()
            
            #attributes which are constant amongst all results for each execution
            post_data['query'] = query
            post_data['city'] = city
            
            #job title of that sepcific post
            post_data['title'] = post['title']
            
            
            #for each job post, extract the link to the post itself. Store in data dictionary for later
            post_href = post['href']
            post_data['href'] = post_href
            
            #extract all the (cleaned up) words fromt that link
            post_words = get_words(post_href)
                        
            post_data['words'] =post_words
            data.append(post_data)
            
    words_by_category(data)
    return data

In [24]:
data_scientist_nyc = pd.DataFrame(crawl_indeed('data scientist', 'new york', num_pages = 15))
data_scientist_sanfran = pd.DataFrame(crawl_indeed('data scientist', 'san fransisco', num_pages = 15))
data_scientist_chicago = pd.DataFrame(crawl_indeed('data scientist', 'san fransisco', num_pages = 15))

In [25]:
data_scientist_all = pd.DataFrame(crawl_indeed('data scientist', num_pages = 5))
data_analyst_all = pd.DataFrame(crawl_indeed('data analyst', num_pages = 15))
data_engineer_all = pd.DataFrame(crawl_indeed('data engineer', num_pages = 15))
machine_learning_engineer_all = pd.DataFrame(crawl_indeed('machine learning engineer', num_pages = 15))

In [26]:
data_scientist_nyc

Unnamed: 0,academic_degrees,academic_disciplines,all_skills,analysis_tools,buzz_words,city,frameworks,graph_databases,hadoop_technologies,href,languages,machine_learning_algorithms,nosql_technologies,operating_system_tools,query,relational_databases,title,words
0,{},"{mathematics, statistics}","{statistics, tableau, terabytes, r, sql, mathe...",{tableau},{terabytes},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"{r, sql}",{},{},{},data+scientist,{},Data Scientist,"{shop, breadth, atmosphere, foreclosure, requi..."
1,{advanced_degree},"{mathematics, statistics, business}","{natural_language, statistics, advanced_degree...",{},{},new+york,{},{},{spark},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"{python, r, matlab, scala}",{natural_language},{},{},data+scientist,{},Machine Learning Quantitative Analyst,"{demo, connecttwitterfacebooklinkedinyoutubeab..."
2,{phd},{},"{r, phd}",{},{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{r},{},{},{},data+scientist,{},"Principal Scientist, Analytical Services","{feedback, chain, enter, able, dissolution, re..."
3,{},{business},"{cart, business}",{},{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{},{cart},{},{},data+scientist,{},"Specialist, Data Analytics","{opinions, abilities, help, applicationinterns..."
4,{bachelor},{},"{python, bachelor, excel, tableau}","{excel, tableau}",{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{python},{},{},{},data+scientist,{},Data Scientist/Analyst – Insights & Consulting,"{ny, listening, able, focus, rest, years, sour..."
5,{},"{mathematics, statistics}","{statistics, tableau, terabytes, r, sql, mathe...",{tableau},{terabytes},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"{r, sql}",{},{},{},data+scientist,{},Data Scientist,"{shop, breadth, atmosphere, foreclosure, requi..."
6,{advanced_degree},"{mathematics, statistics, business}","{natural_language, statistics, advanced_degree...",{},{},new+york,{},{},{spark},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"{python, r, matlab, scala}",{natural_language},{},{},data+scientist,{},Machine Learning Quantitative Analyst,"{demo, connecttwitterfacebooklinkedinyoutubeab..."
7,{phd},{},"{r, phd}",{},{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{r},{},{},{},data+scientist,{},"Principal Scientist, Analytical Services","{feedback, chain, enter, able, dissolution, re..."
8,{},{business},"{cart, business}",{},{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{},{cart},{},{},data+scientist,{},"Specialist, Data Analytics","{opinions, abilities, help, applicationinterns..."
9,{bachelor},{},"{python, bachelor, excel, tableau}","{excel, tableau}",{},new+york,{},{},{},http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,{python},{},{},{},data+scientist,{},Data Scientist/Analyst – Insights & Consulting,"{ny, listening, able, focus, rest, years, sour..."


In [None]:
def get_skill_frequency(df, skill = 'all_skills'): 
    
    counter = Counter()
    for index, job_post in df.iterrows():
        
        
    

In [188]:
pd.DataFrame(data2).iloc[9].index

Index([u'city', u'high_level_languages', u'low_level_languages', u'query',
       u'title', u'words'],
      dtype='object')