In [1]:
import os
import re
import string
import time
from urllib.parse import quote_plus

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup
from google_trans_new import google_translator
from nltk import tokenize
from selenium import webdriver
from textblob import TextBlob
translator = google_translator()

In [2]:
def generate_link(job, location):
    """Build the nl.indeed.com search URL for a job title and a location.

    Parameters
    ----------
    job : str
        Free-text job title; any number of whitespace-separated words.
    location : str
        Free-text location.

    Returns
    -------
    str
        Search URL with the lower-cased, URL-encoded job as ``q`` and the
        lower-cased, URL-encoded location as ``l``.
    """
    # Collapse runs of whitespace, then let quote_plus turn the remaining
    # single spaces into '+' and escape any reserved characters. This works
    # for one word, two words, or more (the old code required exactly two).
    query = quote_plus(' '.join(job.lower().split()))
    place = quote_plus(location.lower().strip())
    return f'https://nl.indeed.com/jobs?q={query}&l={place}'

In [3]:
def extract_jobs(url, page):
    """Download one page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Base URL; must already contain a query string because ``&start=`` is
        appended to it.
    page : int
        Value for the ``start`` query parameter.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
    page_url = f'{url}&start={page}'
    # The second positional argument of requests.get is `params`, so the
    # headers dict must be passed by keyword or the User-Agent ends up in the
    # query string instead of being sent as an HTTP header.
    response = requests.get(page_url, headers=headers)
    return BeautifulSoup(response.content, "html.parser")

In [4]:
def extract_descriptions(link):
    """Fetch a job detail page and return its description element.

    The page is downloaded through ``extract_jobs`` with a ``start`` value of
    0. The return value is whatever ``soup.find`` yields for the description
    ``div``: the matching element, or ``None`` when the page has no such div.
    """
    detail_page = extract_jobs(link, 0)
    return detail_page.find(
        'div',
        class_='jobsearch-JobComponent-description icl-u-xs-mt--md',
    )

In [5]:
def transform(soup, joblist):
    """Append one record per job card found in ``soup`` to ``joblist``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed search-result page.
    joblist : list
        List that is extended in place with one dict per job card. Each dict
        has the keys ``id``, ``title``, ``company``, ``link`` and
        ``description``.

    Returns
    -------
    list
        The same ``joblist`` object, for convenience.

    Notes
    -----
    Cards without a usable title link are skipped. A card without a company
    element gets ``company=None``. A description that cannot be fetched is
    stored as ``None`` so one failing detail page does not abort the scrape.
    """
    base = 'https://nl.indeed.com'
    for card in soup.find_all('div', class_='jobsearch-SerpJobCard'):
        # Look the anchor up once; it supplies both the title and the link.
        anchor = card.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Without a link there is nothing to identify or follow.
            continue
        title = anchor.text.strip()

        company_tag = card.find('span', class_='company')
        company = company_tag.text.strip() if company_tag is not None else None

        link = base + href
        try:
            description = extract_descriptions(link)
        except Exception:
            # Best effort: keep the card even if its detail page fails.
            # `Exception` (not a bare except) lets KeyboardInterrupt through
            # so a long scrape can still be cancelled from the notebook.
            description = None

        joblist.append({
            'id': '_',  # placeholder; the real id is assigned later
            'title': title,
            'company': company,
            'link': link,
            'description': description,
        })
    return joblist

In [6]:
def job_to_joblist(job, location, n_pages):
    """Scrape ``n_pages`` search-result pages for ``job`` in ``location``.

    Parameters
    ----------
    job : str
        Job title to search for.
    location : str
        Location to search in.
    n_pages : int
        Number of result pages to fetch, counted from the first page.

    Returns
    -------
    tuple
        ``(joblist, job)`` where ``joblist`` is the list of job dicts
        collected by ``transform`` and ``job`` is the unchanged search term.
    """
    joblist = []
    link = generate_link(job, location)
    # Pages are addressed through the `start` parameter in steps of 10:
    # page 1 -> 0, page 2 -> 10, ... The stop value must be n_pages * 10 so
    # that the last requested page is included (range excludes its stop).
    for offset in range(0, n_pages * 10, 10):
        page_soup = extract_jobs(link, offset)
        transform(page_soup, joblist)
    return joblist, job

In [7]:
def add_id(df, job):
    """Fill the ``id`` column of ``df`` with ``<PREFIX>_<row number>`` values.

    The prefix is chosen from the first three letters of the second word of
    ``job`` (case-insensitive):

    * ``'ste'`` -> ``'DSP'``
    * ``'sci'`` -> ``'DS'``
    * anything else, including a job with only one word -> ``'DOC'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; modified in place.
    job : str
        Search term the frame was scraped for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its ``id`` column (re)assigned.
    """
    # Lower-case so the match agrees with generate_link, which lower-cases
    # the same string; guard the index so one-word jobs don't raise.
    words = job.lower().split()
    keyword = words[1][:3] if len(words) > 1 else ''

    if keyword == 'ste':
        prefix = 'DSP'
    elif keyword == 'sci':
        prefix = 'DS'
    else:
        prefix = 'DOC'

    df['id'] = [f'{prefix}_{num}' for num in range(len(df))]
    return df

In [8]:
def joblist_to_dataframe(joblist, job):
    """Turn scraped job dicts into a cleaned DataFrame and save it as CSV.

    Steps: drop duplicate (title, company) pairs keeping the first, drop rows
    with any missing value, renumber the index, assign ids via ``add_id`` and
    write ``dataframe_<job>.csv`` into the current working directory.

    Parameters
    ----------
    joblist : list of dict
        Records with the keys ``id``, ``title``, ``company``, ``link`` and
        ``description``.
    job : str
        Search term; used for the id prefix and the output file name.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to disk.
    """
    df = pd.DataFrame(joblist)
    if df.empty:
        # An empty scrape yields a frame with no columns, which would make
        # drop_duplicates fail on the missing 'title'/'company' keys.
        df = df.reindex(columns=['id', 'title', 'company', 'link', 'description'])

    df = (
        df.drop_duplicates(subset=['title', 'company'], keep='first')
          .dropna()
          .reset_index(drop=True)
    )
    df = add_id(df, job)

    # os.path.join picks the platform's separator; the previous hard-coded
    # backslash only formed a directory path on Windows and relied on an
    # invalid string escape.
    out_path = os.path.join(os.path.abspath(os.getcwd()), f'dataframe_{job}.csv')
    df.to_csv(out_path, index=False, header=True)
    return df

In [9]:
# Scrape the search results for 'data steward' in 'amsterdam' (n_pages=3).
# job_to_joblist returns the collected job dicts and the search term itself.
science_list, job = job_to_joblist('data steward', 'amsterdam', 3)

In [10]:
# Deduplicate the scraped records, drop incomplete rows, assign ids and
# write the result to a CSV file named after the search term.
df = joblist_to_dataframe(science_list, job)

In [11]:
# Display the cleaned frame as the cell output.
df

Unnamed: 0,id,title,company,link,description
0,DSP_0,Data steward / analyst,JLL,https://nl.indeed.com/rc/clk?jk=32d6b156cbb9d2...,"[[], [], [Wil jij graag bijdragen aan ons succ..."
1,DSP_1,Global Process Director,Cargill,https://nl.indeed.com/rc/clk?jk=f468bb81155e0a...,"[[], [], [Want to build a stronger, more susta..."
2,DSP_2,Human Resources Generalist,THE RENEWAL WORKSHOP,https://nl.indeed.com/rc/clk?jk=cb367910062ed1...,"[[], [], [[<div><div><b>The Renewal Workshop</..."
3,DSP_3,Data Engineer,Optiver,https://nl.indeed.com/rc/clk?jk=1bdc22b275c924...,"[[], [], [Optiver is a leading trading firm dr..."
4,DSP_4,Senior Data Management Analyst Mortgages,ING,https://nl.indeed.com/rc/clk?jk=6ac940b5c663f1...,"[[], [], [[], [<div><p><b>The Unite Tribe Data..."
5,DSP_5,Product Owner Data Management Mortgages,ING,https://nl.indeed.com/rc/clk?jk=deca9994e792aa...,"[[], [], [[], [<div><p><b>The Unite Tribe Data..."
6,DSP_6,RA Manager,RB,https://nl.indeed.com/rc/clk?jk=76a34a12579b88...,"[[], [], [[<div><div><div><h2 class=""jobSectio..."
7,DSP_7,Data Engineer,Carrierecafe,https://nl.indeed.com/rc/clk?jk=f70cb6470c9974...,"[[], [], [[<p>Zonder de Data Engineer geen wer..."
8,DSP_8,Inbound Marketer,24sessions,https://nl.indeed.com/rc/clk?jk=8376556adb3173...,"[[], [], [[<p><b>About 24sessions</b></p>, \n,..."
9,DSP_9,Data Analist,Carrierecafe,https://nl.indeed.com/rc/clk?jk=766d1bdeda95fb...,"[[], [], [[<p>Data Analist. Baan van de toekom..."


In [12]:
# def raw_html_to_clean(raw_html):
#     # get text, remove enter and make lowercase
#     clean_string = BeautifulSoup(raw_html, "html.parser").text.replace('\n', ' ').lower()
    
#     # detect language by scanning first 100 chars
#     b = TextBlob(clean_string[:100])
    
#     # if language is dutch, translate to english
#     if b.detect_language() =='nl':
#         clean_string = translator.translate(clean_string, lang_tgt='en').lower()
        
#     # split into sentences
#     sentences = tokenize.sent_tokenize(clean_string)
    
#     # remove punctuation
#     clean_sentences = [sentence.translate(str.maketrans('', '', string.punctuation)).split() for sentence in sentences]
#     return clean_sentences

In [13]:
# def tfidf(raw_html):
#     clean_string = BeautifulSoup(raw_html, "html.parser").text.replace('\n', ' ')
#     b = TextBlob(clean_string[:100])
    
#     # if language is dutch, translate to english
#     if b.detect_language() =='nl':
#         clean_string = translator.translate(clean_string, lang_tgt='en')
#     return clean_string

In [14]:
# def isNaN(num):
#     return num != num

In [15]:
# def raw_to_clean_csv(csv):
#     path = os.path.abspath(os.getcwd())
    
#     df_csv = pd.read_csv(csv)
#     # remove duplicates
#     clean_df = df_csv.drop_duplicates(subset=['title', 'company'], keep='first').reset_index(drop=True)
    
#     # clean every description into list of lists
#     clean_desciptions = [raw_html_to_clean(raw_html) if not isNaN(raw_html) else None \
#                          for raw_html in clean_df['description']]
    
#     # assign 'description' column with clean descriptions
#     clean_df['description'] = clean_desciptions
    
#     # save csv
#     clean_df.to_csv (f'{path}\clean_dataframe.csv', index = False, header=True)
#     return clean_df

In [16]:
# def raw_to_string(csv):
#     path = os.path.abspath(os.getcwd())
    
#     df_csv = pd.read_csv(csv)
#     clean_df = df_csv.drop_duplicates(subset=['title', 'company', 'salary'], keep='first').reset_index(drop=True)
    
#     strings = [tfidf(raw_html) if not isNaN(raw_html) else None \
#                          for raw_html in clean_df['description']]
    
#     clean_df['description'] = strings
    
#     clean_df.to_csv (f'{path}\string_dataframe.csv', index = False, header=True)
#     return clean_df

In [17]:
#clean_df = raw_to_clean_csv('export_dataframe.csv')

In [18]:
#raw_to_string('export_dataframe.csv')

In [19]:
#clean_df.head()

In [20]:
# def id_from_website(url):
#     soup = extract_jobs(url, 0)
#     meta = soup.findAll('meta')
#     for item in meta:
#         if str(item.get('content')).startswith('https'):
#             link = item.get('content')
#             start = link.find('jk=') + len('jk=') 
#             iden = link[start:]
#         else:
#             iden = None
#     return iden

In [21]:
# def id_from_link(url):
#     start = url.find('jk=') + len('jk=')
#     end = [m.start() for m in re.finditer('&', url)]
#     for i in end:
#         if i > start:
#             end_i = i
#             break
#     iden = url[start:end_i]
#     return iden

In [22]:
"""

"""

'\n\n'