The program asks a user to enter a **job title** and **location** of interest and scrapes relevant data from [a job posting website](https://www.indeed.com). Using NLTK, it removes stopwords and counts the remaining words. Then it asks the user to enter a specific set of skills (separated by spaces) and returns the list of those **skills**, in the order entered, each with its number of occurrences.

In [1]:
import requests # for accesing web page
from bs4 import BeautifulSoup # for pulling data out of html
import pandas as pd # for general working with data
# from nltk import word_tokenize # text mining / analysis
from collections import Counter
from nltk.corpus import stopwords
import re #regex

In [4]:
def searchquery(jobtitle, location):
    """Build the Indeed search URL for a job title and a location.

    The title is searched as an exact phrase (it is wrapped in ``%22``,
    the URL-encoded double quote) within a 50-mile radius of the location,
    50 results per page.

    Args:
        jobtitle: Free-text job title, e.g. ``'data analyst'``.
        location: Free-text location, e.g. ``'california'``.

    Returns:
        The search URL as a string.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' like the plain replace did, and also
    # escapes characters such as '&', '+', '#' and ',' that would otherwise
    # be read as query-string syntax and corrupt the search.
    title = quote_plus(jobtitle)
    loc = quote_plus(location)
    return ('http://www.indeed.com/jobs?q=%22' + title
            + '%22&radius=50&limit=50&l=' + loc)

def collect_job_list(url):
    """Collect the title and link of every posting in a search result.

    Downloads the search page at ``url`` to read the total number of
    results, then requests the result pages 50 postings at a time
    (``&start=0``, ``&start=50``, ...) and keeps every job-title anchor
    whose href contains ``clk?jk=``.

    Args:
        url: Search URL, e.g. as returned by ``searchquery``.

    Returns:
        A pandas DataFrame with the columns ``title`` (anchor text) and
        ``link`` (raw href), one row per posting found. The frame is empty
        when the first page has no ``searchCount`` element, i.e. the search
        reported no results.
    """
    jobtitle, hreflink = [], []

    # first request: only used to learn how many results there are
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    counter = soup.find('div', {'id': 'searchCount'})
    if counter is None:
        # No result counter on the page: return an empty frame instead of
        # failing with IndexError.
        return pd.DataFrame({'title': jobtitle, 'link': hreflink})

    # the total is whatever follows 'of ' in the counter text; thousands
    # separators are removed so int() can parse it
    count_text = counter.text.replace(',', '')
    total_jobs = int(count_text[count_text.find('of ') + 3:])

    # step through the result pages, 50 postings per page
    for start in range(0, total_jobs, 50):
        joblisturl = url + '&start=' + str(start)
        r = requests.get(joblisturl)
        soup = BeautifulSoup(r.content, 'html.parser')

        # keep the title and link of each listed posting
        for anchor in soup.find_all('a', {'data-tn-element': 'jobTitle'}):
            href = anchor.get('href')
            # an anchor without an href yields None; skip it rather than
            # raising TypeError on the substring test
            if href and 'clk?jk=' in href:
                hreflink.append(href)
                jobtitle.append(anchor.text)
    return pd.DataFrame({'title': jobtitle, 'link': hreflink})


def properurl(link):
    """Convert a search-result href into the URL of the job posting page.

    The job key is the text between ``clk?jk=`` and ``&fccid`` in ``link``.
    When ``&fccid`` does not follow the key, everything after ``clk?jk=``
    is used.

    Args:
        link: Raw href containing ``clk?jk=``.

    Returns:
        ``'http://www.indeed.com/viewjob?jk='`` followed by the job key.
    """
    marker = 'clk?jk='
    start = link.find(marker) + len(marker)
    end = link.find('&fccid', start)
    if end == -1:
        # str.find returns -1 when '&fccid' is missing; slicing with -1
        # would silently drop the last character of the job key.
        end = len(link)
    return 'http://www.indeed.com/viewjob?jk=' + link[start:end]
            

def collect_job_data(joblink_list):
    """Download each job posting and return its cleaned description text.

    For every href in ``joblink_list`` the posting page is fetched, the
    text of its first ``<td class="snip">`` cell is taken, every run of
    characters other than letters, digits and ``&`` is collapsed to one
    space, the text is cut at the first standalone word ``ago`` and the
    remainder is lowercased.

    Args:
        joblink_list: Sequence of raw hrefs as accepted by ``properurl``.

    Returns:
        A list of strings, one per input link and in the same order. A
        posting without a ``snip`` cell contributes an empty string, so the
        result always lines up with the input.
    """
    jobdesc = []
    for link in joblink_list:
        joburl = properurl(link)

        # extract the text of the selected job posting
        r = requests.get(joburl)
        soup = BeautifulSoup(r.content, 'html.parser')
        snippet = soup.find('td', {'class': 'snip'})
        if snippet is None:
            # keep one entry per link instead of raising IndexError
            jobdesc.append('')
            continue

        # newlines are collapsed here too, so no separate replace is needed
        desc = re.sub('[^A-Za-z0-9&]+', ' ', snippet.text)

        # Cut at the word 'ago'. Matching the whole word avoids truncating
        # at words that merely contain it (e.g. 'Chicago'), and the explicit
        # None check avoids the find() == -1 slice that dropped the last
        # character when 'ago' was absent.
        cut = re.search(r'\bago\b', desc)
        if cut is not None:
            desc = desc[:cut.start()]
        jobdesc.append(desc.lower())
    return jobdesc

def countword(text):
    """Count the whitespace-separated words of ``text``, skipping stopwords.

    Args:
        text: String to analyse. Words are compared with the stopword list
            exactly (case-sensitively).

    Returns:
        A ``collections.Counter`` mapping each remaining word to its number
        of occurrences.
    """
    # a set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned once per word
    stop = set(stopwords.words('english'))
    return Counter(word for word in text.split() if word not in stop)

def sortlist(words, countlist):
    """Look up the entries of ``countlist`` that match each word of interest.

    Args:
        words: Words of interest in one string, separated by whitespace.
        countlist: Iterable of entries, e.g. ``(word, count)`` tuples; an
            entry matches when ``word in entry`` is true.

    Returns:
        A list with one inner list per word, in the order the words were
        given, holding the matching entries in ``countlist`` order. A word
        without a match gives an empty inner list.
    """
    return [
        [entry for entry in countlist if term in entry]
        for term in words.split()
    ]

In [5]:
# prompt a user to input job title and location of interest
jobtitle = input('Please enter the "job title" of interest:  ')
location = input('Please enter the "location" of interest:  ')

# scrape the search results into a data frame with 'title' and 'link' columns
df = collect_job_list(searchquery(jobtitle, location))

# remove duplicated postings; drop_duplicates returns a new frame, so the
# column added below is not written into a slice of the original
df = df.drop_duplicates('link')

# add column of text that contains job description to the dataframe
df['text'] = collect_job_data(list(df.link))

# Drop postings whose text is shorter than 100 characters. This is done on
# the column itself: rows yielded by iterrows() may be copies, so assigning
# to them is not guaranteed to change the data frame.
df = df[df['text'].map(len) >= 100].copy()

# replace each description with its word counts (stopwords removed)
df['text'] = [countword(text) for text in df['text']]

# count the occurrence of words over all postings
totalcount = Counter()
for wordcount in df['text']:
    totalcount += wordcount

# reorder the list of count in descending order
result = totalcount.most_common()

# overview of the most frequent words
print("Number of open position: "+str(len(df)))
print("\nThe 100 most frequent words are as below:\n")
print(result[:100])

Please enter the "job title" of interest:  data analyst
Please enter the "location" of interest:  california
Number of open position: 513

The 100 most frequent words are as below:

[('data', 4430), ('experience', 1909), ('business', 1504), ('skills', 1046), ('work', 1034), ('team', 920), ('analysis', 897), ('ability', 866), ('analyst', 768), ('management', 658), ('support', 595), ('strong', 568), ('knowledge', 549), ('years', 540), ('reporting', 538), ('analytics', 527), ('information', 515), ('sql', 507), ('required', 504), ('requirements', 497), ('days', 480), ('tools', 479), ('working', 456), ('development', 446), ('reports', 429), ('product', 427), ('including', 422), ('quality', 403), ('related', 401), ('systems', 390), ('technical', 388), ('position', 377), ('develop', 373), ('environment', 369), ('customer', 358), ('degree', 358), ('job', 350), ('preferred', 339), ('new', 337), ('provide', 335), ('&', 334), ('company', 332), ('analytical', 331), ('using', 327), ('communication'

In [6]:
# Ask the user for the words of interest (separated by whitespace) and look
# each one up in `result`, the (word, count) list built in the previous cell.
# The call is left as a bare expression so the notebook echoes its value.
# NOTE(review): run as a plain script this line displays nothing - wrap it
# in print() there.
intwords = input('Enter the words of interst separated by space\n')
sortlist(intwords, result)

Enter the words of interst separated by space
sql r python hadoop excel


[[('sql', 507)],
 [('r', 104)],
 [('python', 110)],
 [('hadoop', 57)],
 [('excel', 289)]]