In [1]:
import re
import warnings
from collections import Counter
from time import sleep
from urllib.parse import quote_plus
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
%matplotlib inline

In [2]:
def page_cleaner(soup_obj):
    '''
    Reduce a parsed HTML page to plain, letters-only text.

    Inputs: a BeautifulSoup object to investigate. It is modified in place:
            every <script> and <style> element is removed from it.
    Outputs: Cleaned text only (str). Everything except ASCII letters, '+'
             and '3' is replaced by a space, and a space is inserted at each
             lowercase-to-uppercase boundary. An empty soup yields ''.
    '''
    # An empty document has nothing to clean. (The previous fallback re-parsed
    # a name that does not exist in this function and raised NameError.)
    if len(soup_obj) == 0:
        return ''

    # Script and style bodies are code, not page text.
    for tag in soup_obj(["script", "style"]):
        tag.extract()

    text = soup_obj.get_text()
    lines = (line.strip() for line in text.splitlines())
    # A double space is treated as a separator between phrases on one line.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Join with a space so the last word of one chunk and the first word of
    # the next stay separate tokens ("python\nsql" -> "python sql").
    text = ' '.join(chunk for chunk in chunks if chunk)

    # Keep letters plus '+' and '3'; every other character becomes a space.
    text = re.sub("[^a-zA-Z+3]", " ", text)
    # Split words that were glued together, e.g. "dataScience" -> "data Science".
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    return text

In [3]:
def prep_url(role, location):
    """Build the Indeed search URL for a role and location, newest first.

    Both values are form-encoded: spaces become '+', and reserved characters
    such as '+', '&' or ',' are percent-escaped so they cannot be misread as
    query syntax (a literal '+' would otherwise be decoded as a space).

    Args:
        role: free-text job title or keywords, e.g. 'machine learning'.
        location: free-text location, e.g. 'USA' or 'New York, NY'.

    Returns:
        The search URL as a str.
    """
    return ('https://www.indeed.com/jobs?q=' + quote_plus(role)
            + '&l=' + quote_plus(location) + '&sort=date')

In [4]:
def _extract_job_links(page_obj, base_url):
    """Return the absolute posting URLs found on one parsed results page."""
    job_link_area = page_obj.find(id='resultsCol')
    if job_link_area is None:
        return []

    links = []
    for div in job_link_area.find_all('div'):
        # Only <div> elements whose class list is exactly ['title'] hold a job link.
        if div.get('class') != ['title']:
            continue
        anchor = div.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(base_url + href)
    return links


def get_job_urls(search_url):
    """Collect the posting URLs from every results page of an Indeed search.

    The total number of jobs is read from the element with id 'searchCount'
    on the first page; result pages are then requested in steps of 10 via the
    '&start=' query parameter.

    Args:
        search_url: search URL without a '&start=' parameter.

    Returns:
        A list of absolute job URLs (str). Empty when the result count cannot
        be found or parsed.

    Raises:
        Whatever urlopen raises for a failed request.
    """
    base_url = 'https://www.indeed.com'

    with urlopen(search_url) as search:
        soup = BeautifulSoup(search)

    count_tag = soup.find(id='searchCount')
    if count_tag is None:
        return []

    # Match whole numbers including thousands separators ("1,234,567"), so the
    # total is right for any magnitude instead of being rebuilt from fragments.
    job_numbers = re.findall(r'\d[\d,]*', count_tag.text)
    if not job_numbers:
        return []
    # The third number is the total in a "Jobs A to B of N" style counter;
    # shorter counters fall back to the last number present.
    count_token = job_numbers[2] if len(job_numbers) >= 3 else job_numbers[-1]
    total_num_jobs = int(count_token.replace(',', ''))

    job_url = []
    # Offsets 0, 10, 20, ... cover every page exactly once. Starting at 10
    # would skip the first page and could ask for a page past the end.
    for start_num in range(0, total_num_jobs, 10):
        if start_num == 0:
            # The first results page is the one already fetched for the count.
            page_obj = soup
        else:
            current_page = ''.join([search_url, '&start=', str(start_num)])
            with urlopen(current_page) as response:
                page_obj = BeautifulSoup(response.read())
        job_url.extend(_extract_job_links(page_obj, base_url))
    return job_url

In [39]:
def _first_div_text(bs_obj, class_names, default='-'):
    """Return the text of the first <div> matching one of the class strings, else ``default``."""
    for class_name in class_names:
        tag = bs_obj.find('div', class_=class_name)
        if tag is not None:
            return tag.text
    return default


def _scrape_job(job_url):
    """Fetch one posting; return its record dict, or None when the page title is not a posting title."""
    with urlopen(job_url) as page:
        bs_obj = BeautifulSoup(page)

    title_tag = bs_obj.find('title')
    title_parts = title_tag.text.split(' - ') if title_tag is not None else []
    # The position and location are read from the third- and second-to-last
    # ' - ' separated parts of the title; pages without them are skipped.
    if len(title_parts) < 3:
        return None

    job = {}
    job['company'] = _first_div_text(
        bs_obj,
        ('icl-u-lg-mr--sm icl-u-xs-mr--xs',
         'icl-u-xs-mt--xs icl-u-textColor--secondary'),
    )
    job['position'] = title_parts[-3]

    location = title_parts[-2]
    if ',' in location:
        pieces = location.split(',')
        job['city'] = pieces[0].strip()
        job['state'] = pieces[1].strip()
    else:
        job['city'] = '-'
        job['state'] = '-'

    # page_cleaner strips <script>/<style> from bs_obj, so it runs last.
    job['jd'] = page_cleaner(bs_obj)
    job['url'] = job_url
    return job


def scraper(jobs):
    """Scrape each job URL into a record dict.

    Scraping is best-effort per posting: a URL that cannot be fetched or
    parsed is reported with a warning and skipped, so one bad page no longer
    throws away everything collected so far. Only ``Exception`` is caught, so
    interrupting the kernel still stops the loop.

    Args:
        jobs: iterable of job posting URLs (str).

    Returns:
        A list of dicts with keys 'company', 'position', 'city', 'state',
        'jd' and 'url'. 'company', 'city' and 'state' are '-' when the page
        does not provide them. The list is empty when nothing could be scraped.
    """
    data = []
    for job_url in jobs:
        try:
            job = _scrape_job(job_url)
        except Exception as exc:
            warnings.warn('Skipping %s: %r' % (job_url, exc))
            continue
        if job is not None:
            data.append(job)
    return data

In [32]:
def prep_df(pos, loc):
    """Search Indeed for a position in a location and return the postings as a DataFrame.

    Args:
        pos: job title or keywords to search for.
        loc: location to search in.

    Returns:
        A DataFrame with one row per scraped posting and the columns
        'company', 'position', 'city', 'state', 'jd', 'url'. 'state' is
        reduced to its first whitespace-separated token. When nothing was
        scraped, an empty DataFrame with those columns is returned.

    Raises:
        ValueError: if the scraper reports failure with a string sentinel
            instead of a list of records.
    """
    url = prep_url(pos, loc)
    jobs = get_job_urls(url)
    data = scraper(jobs)

    # A string result is a failure sentinel, not data; fail with a clear message
    # rather than the DataFrame constructor's generic error.
    if isinstance(data, str):
        raise ValueError('scraping failed for %r in %r: %s' % (pos, loc, data))

    jobs_df = pd.DataFrame(data)
    if jobs_df.empty:
        # No rows means no 'state' column to normalise; return the expected schema.
        return pd.DataFrame(
            columns=['company', 'position', 'city', 'state', 'jd', 'url'])

    # Keep only the first token of the state field (e.g. 'CA 94105' -> 'CA').
    jobs_df['state'] = jobs_df['state'].str.split().str[0]
    return jobs_df

In [41]:
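# The prep_df calls below fetch pages over the network and are commented out.
# The CSV export cell that follows needs the four DataFrames they create, so
# uncomment and run them first on a fresh kernel.
#machine_learning = prep_df('machine learning', 'USA')
#data_analyst = prep_df('data analyst', 'USA')
#data_scientist = prep_df('data scientist', 'USA')
#data_engineer = prep_df('data engineer', 'USA')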

In [43]:
# Write each scraped DataFrame to its own CSV file, leaving out the row index.
# These four names are only assigned by the prep_df calls that are commented
# out in the preceding cell, so on a fresh kernel this cell raises NameError
# until those calls have been run.
machine_learning.to_csv('machine_learning_jobs.csv',index=False)
data_analyst.to_csv('data_analyst.csv',index=False)
data_scientist.to_csv('data_scientist.csv',index=False)
data_engineer.to_csv('data_engineer.csv',index=False)

In [46]:
def display_top_state(df):
    """Print the number of non-null 'jd' entries per 'state', largest first.

    Args:
        df: DataFrame with 'state' and 'jd' columns.

    Returns:
        None; the ranked table is written to stdout.
    """
    postings_per_state = df.groupby('state')['jd'].count().reset_index()
    ranked = postings_per_state.sort_values('jd', ascending=False)
    print(ranked)
    

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# `stop_words` is not assigned in any visible cell, so this cell raised
# NameError on a fresh kernel. Build it from the NLTK corpus reader imported
# at the top of the notebook.
# NOTE(review): assumes the English NLTK list is the intended one and that the
# NLTK 'stopwords' corpus is downloaded -- confirm.
stop_words = stopwords.words('english')

# max_df=0.85 ignores terms that appear in more than 85% of the documents.
cv = CountVectorizer(max_df=0.85, stop_words=stop_words)

In [None]:
# Job-description strings, one per posting, as a plain Python list.
# NOTE(review): `jobs_df` is not assigned at notebook scope in any visible
# cell (it is only a local inside prep_df) -- presumably one of the scraped
# DataFrames; confirm which one before running.
jd = jobs_df.loc[:, 'jd'].tolist()

In [None]:
# Learn the vocabulary from the job descriptions and build their
# document-term count matrix in a single pass.
word_count_vector = cv.fit_transform(jd)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn inverse-document-frequency weights from the count matrix. Smoothing
# adds one to every document frequency so no term divides by zero.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

In [None]:
# Work on a copy: a later cell overwrites df_test['jd'] with preprocessed
# text, and a plain alias would overwrite jobs_df['jd'] as well.
df_test = jobs_df.copy()

In [None]:
import re
def pre_process(text):
    """Normalise text for vectorising.

    The text is lowercased, HTML comment spans ('<!-- ... -->' on a single
    line) are removed, and every run of digits and non-word characters is
    collapsed into one space.

    Args:
        text: the raw string.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Drops comment markup only; ordinary tags are not matched by this pattern.
    without_comments = re.sub("<!--?.*?-->", "", lowered)
    # Digits and punctuation/whitespace runs become a single separator.
    return re.sub("(\\d|\\W)+", " ", without_comments)
# Replace every job description in df_test with its normalised form.
df_test['jd'] = df_test['jd'].apply(pre_process)

# Collect the normalised descriptions as a plain list of documents.
docs_test = df_test['jd'].tolist()

In [None]:
def sort_coo(coo_matrix):
    """Pair each column index with its stored value and rank the pairs.

    Args:
        coo_matrix: object exposing parallel ``col`` and ``data`` sequences.

    Returns:
        A list of (column, value) tuples ordered by value, then by column,
        both descending.
    """
    def value_then_column(entry):
        column, value = entry
        return (value, column)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=value_then_column, reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` ranked items to their feature names.

    Args:
        feature_names: sequence indexed by feature id.
        sorted_items: sequence of (feature id, score) pairs, best first.
        topn: how many leading pairs to keep.

    Returns:
        A dict of feature name -> score rounded to 3 decimals, in ranked order.
    """
    return {
        feature_names[feature_id]: round(score, 3)
        for feature_id, score in sorted_items[:topn]
    }

In [None]:
# Vocabulary terms, indexed by the column ids of the vectoriser's output.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Treat all job descriptions together as one document to extract keywords from.
# (An earlier per-document computation on docs_test[0] was overwritten
# immediately and has been dropped.)
doc = ' '.join(docs_test)

# Generate the tf-idf vector for that document and rank its terms.
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Top 10 terms as {term: score}.
temp = extract_topn_from_vector(feature_names, sorted_items, 10)

In [10]:
# Preview only the start of the concatenated corpus instead of dumping all of it.
# `doc` is assigned in the keyword-extraction cell, which must run first;
# presumably the recorded NameError came from running this cell before it.
doc[:500]

NameError: name 'doc' is not defined