In [1]:
import pandas as pd
import bs4
import requests
import json
import pickle
import time

This notebook serves two purposes:
1. Archive job descriptions for future analysis
2. Automate job application process for jobs with inApply feature available

In [2]:
#Constants
RAW_PATH='./raw/'
PICKLE_PATH='./pickle/'
TODAY=time.strftime("%Y-%m-%d")
CSV_PATH='./csv/'
BASE_URL='https://www.linkedin.com'
SEARCH_TERM='Engineer'
TODAY_PICKLE_PATH=("%s%s_%s.pickle" % (PICKLE_PATH,SEARCH_TERM,TODAY))

In [3]:
def get_dict_from_url(url,codeID):
    """
    returns a dictionary from any data found in the first html 'code' element with given id in given url.
    requires a valid linkedIn session, sess.
    """
    resp=sess.get(url)
    soup = bs4.BeautifulSoup(resp.text,'lxml')
    target=soup.find('code',{'id':codeID})
    return json.loads(target.contents[0]) if target else {'description':''}

In [4]:
#set credentials
login={'session_key'       : 'username'
       ,'session_password' : 'password'}

In [5]:
#start linkedin Session
URL='https://www.linkedin.com/uas/login-submit'
sess=requests.session()
sess.post(URL,data=login)

<Response [200]>

In [6]:
#search jobs and capture response
searches={'Data Scientist':'https://www.linkedin.com/jobs/search/?keywords=%22Data%20Scientist%22&location=Houston%2C%20Texas'
         ,'Engineer':'https://www.linkedin.com/jobs/search/?keywords=Engineer&location=Houston%2C%20Texas&locationId=PLACES.us.10-4-0-101-12'
         }
pages=get_dict_from_url(searches[SEARCH_TERM],'decoratedJobPostingsModule')

In [7]:
try:
    data=pd.read_pickle(TODAY_PICKLE_PATH)
except:
    totalPages=len(pages['paging']['pages'])
    currPage=1
    data=pd.DataFrame()
    for page in pages['paging']['pages']:
        print("requesting %d out of %d " % (currPage, totalPages))
        pageURL=BASE_URL+page['pageUrl']
        time.sleep(0.1)
        posts=get_dict_from_url(pageURL,'decoratedJobPostingsModule')
        for post in posts['elements']:
            info=post['decoratedJobPosting']
            info=pd.io.json.json_normalize(info)        
            info['inApply']=post['isInApply']
            info['postURL']=post['viewJobCanonicalUrl']
            data=pd.concat([data,info],axis=0)
        currPage+=1
    data.to_pickle(TODAY_PICKLE_PATH)

requesting 1 out of 10 
requesting 2 out of 10 
requesting 3 out of 10 
requesting 4 out of 10 
requesting 5 out of 10 
requesting 6 out of 10 
requesting 7 out of 10 
requesting 8 out of 10 
requesting 9 out of 10 
requesting 10 out of 10 


In [8]:
#set index and filter to useful columns
data.set_index('jobPosting.id',inplace=True)
data=data[['inApply'
           ,'postURL'
           ,'companyName'
           ,'jobPosting.listDate'
           ,'jobPosting.title']]

In [9]:
data

Unnamed: 0_level_0,inApply,postURL,companyName,jobPosting.listDate,jobPosting.title
jobPosting.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
381275509,True,https://www.linkedin.com/jobs/view/381275509,"Atec, Inc.",1498689352000,Structural Engineer
390358370,False,https://www.linkedin.com/jobs/view/390358370,Angelica,1499454970000,Maintenance Engineer Department (Houston- TX)
394700972,False,https://www.linkedin.com/jobs/view/394700972,"Paramount Resources, LLC",1499951864000,Project Engineer
365982792,True,https://www.linkedin.com/jobs/view/365982792,Lennon Wright,1500308631000,DevOps Engineer
394184273,True,https://www.linkedin.com/jobs/view/394184273,Mazor Robotics,1499888183000,Field Service Engineer
392549944,True,https://www.linkedin.com/jobs/view/392549944,"Samson Controls, Inc.",1499709207000,Technical Support Engineer
390220780,False,https://www.linkedin.com/jobs/view/390220780,Proaction Careers,1497285887000,NPI TECHNICIAN (New Product Introduction Engin...
381248198,True,https://www.linkedin.com/jobs/view/381248198,KPRC,1498681810000,Broadcast System Engineer
394886720,False,https://www.linkedin.com/jobs/view/394886720,National Guard,1499644800000,12B Combat Engineer - Construction and Enginee...
396547463,False,https://www.linkedin.com/jobs/view/396547463,Noble Energy,1500146827000,Sr. Drilling Engineer (Permian Basin)


In [89]:
#fetch job descriptions
for (index,value) in data['postURL'].iteritems():
    currPicklePath=PICKLE_PATH+SEARCH_TERM+'/'+str(index)+'.pickle'
    try:
        desc=pickle.load(open(currPicklePath,'rb'))
    except:
        time.sleep(0.5)
        desc=get_dict_from_url(value,'jobDescriptionModule')
        pickle.dump(desc,open(currPicklePath,'wb'))
    data.loc[index,'description'] = desc['description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
