# Imports

In [2]:
# Imports

# Webscraping Imports
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd


# ML library imports
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
import nltk.collocations
from nltk import BigramCollocationFinder
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Formatting
from pprint import pprint

# Defining Functions

In [3]:
## Webscraping functions

In [4]:
# Variables
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

In [5]:
def scrape_jobs(request_url, page_limit):
    job_list = []
    for page_num in range(1,page_limit):
        list_url = request_url 
        # Getting response request from list 
        response = requests.get(list_url)
    
        list_data = response.text
        list_soup = BeautifulSoup(list_data, 'html.parser')
        page_jobs = list_soup.find_all("li")
        #10 jobs per page
        #print(len(page_jobs))
        
        ## Get job ID's from each page
        id_list = []
    
        for job in page_jobs:
            base_card_div = job.find("div", {"class": "base-card"})
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
            
        # For every job with ID, get the information
        for job_id in id_list:
            job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
            job_response = requests.get(job_url)
            if(job_response.status_code == 200):
                #print(job_response.status_code)
                job_soup = BeautifulSoup(job_response.text, "html.parser")
                job_post = {}
                try:
                    job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
                except:
                    job_post["days_ago_posted"] = None
                if(job_post["days_ago_posted"] != None and ("days" in job_post["days_ago_posted"] or "hours" in job_post["days_ago_posted"])): 
                    try:
                        job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
                    except: 
                        job_post["company_name"] = None
                    try:
                        job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
                    except: 
                        job_post["job_title"] = None
                    try:
                        job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
                    except: 
                        job_post["job_description"] = None
                    try:
                        job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
                    except:
                        job_post["num_applicants"] = None
                    job_list.append(job_post)
    return pd.DataFrame(job_list)
        

In [6]:
jobs_df = scrape_jobs(request_url, 2)

In [7]:
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants
0,6 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants
1,5 days ago,Wealthfront,"Data Scientist, Investing",The Wealthfront Data Science Team utilizes our...,Over 200 applicants


## ML Functions

In [14]:
def create_stop_words():
    stop_words = set(stopwords.words("english"))
    stop_words.add(':')
    stop_words.add(',')
    stop_words.add('.')
    stop_words.add(', ')
    stop_words.add('. ')
    stop_words.add('*')
    stop_words.add("'")
    stop_words.add("'s")
    stop_words.add('e.g')
    stop_words.add('employees')
    stop_words.add('applicants')
    stop_words.add(')')
    stop_words.add('(')
    stop_words.add('eligible')
    stop_words.add('participate')
    stop_words.update(['contract','salary', 'range', 'sunnyvale', 'candidates', 'recruiting', 'hiring', 'fair', 'chance'])
    return(stop_words)

In [15]:
stop_words = create_stop_words()

In [16]:
def get_n_grams(word_list, n, top_n):
    '''
        Function to return ngrams
        word_list: pass lowercase word list (filtered for stop words)
        n: the number of words in each phrase (gram)
        top_n: top n number of matches 
    '''
    grams = list(ngrams(word_list, n))
    freq_dist = FreqDist(grams)
    
    topn = freq_dist.most_common(top_n)
    return(topn)

In [17]:
def return_n_grams():
    bigram_list = []
    # trigram_list = []
    
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    # trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # For bigrams
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    return (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [18]:
def get_bigrams_trigrams(jobs_df):

    bigram_list = []
    trigram_list = []
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    
        # add trigrams
        top5 = get_n_grams(filtered_list, 3, 2)
        bigram_list.append(top5)
        for tup in top5:
            if tup[0] not in bigram_dict:
                trigram_dict[tup[0]] = tup[1]
            else:
                trigram_dict[tup[0]] += tup[1]
    
    sorted_bigram = (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))
    sorted_trigram = (sorted(trigram_dict.items(), key=lambda item: item[1],  reverse=True))
    return(sorted_bigram, sorted_trigram)

In [19]:
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

In [20]:
pprint(sorted_bigram[:100])

[(('machine', 'learning'), 7),
 (('wealthfront', 'brokerage'), 5),
 (('data', 'science'), 4),
 (('wealthfront', 'advisers'), 4),
 (('project', 'plans'), 3),
 (('cash', 'account'), 3),
 (('software', 'engineering'), 2),
 (('engineering', 'best'), 2),
 (('best', 'practices'), 2),
 (('practices', 'version'), 2),
 (('version', 'control'), 2),
 (('control', 'testing'), 2),
 (('testing', 'code'), 2),
 (('science', 'team'), 2),
 (('computer', 'science'), 2),
 (('science', 'statistics'), 2)]


# Business Analyst Positions

In [21]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Business%2BAnalyst&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, 200)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])



[(('per', 'calendar'), 515),
 (('calendar', 'year'), 515),
 (('&', 'johnson'), 412),
 (('supply', 'chain'), 412),
 (('hours', 'per'), 412),
 (('web', 'analytics'), 388),
 (('johnson', '&'), 309),
 (('smarter', 'less'), 309),
 (('less', 'invasive'), 309),
 (('aditi', 'consulting'), 291),
 (('business', 'operations'), 194),
 (('operations', 'specialist'), 194),
 (('data', 'analysis'), 194),
 (('a/b', 'testing'), 194),
 (('analysis', 'using'), 194),
 (('best', 'practices'), 194),
 (('business', 'analyst'), 10),
 (('headcount', 'data'), 5),
 (('data', 'management'), 5),
 (('years', 'experience'), 4),
 (('quality', 'business'), 4),
 (('requirement', 'documents'), 4),
 (('tubi', 'offers'), 4),
 (('data', 'systems'), 3),
 (('business', 'requirement'), 3),
 (('servicenow', 'development'), 3),
 (('development', 'team'), 3),
 (('understanding', 'servicenow'), 3),
 (('business', 'requirements'), 3),
 (('user', 'stories'), 3),
 (('employee', 'benefits'), 3),
 (('-', 'open'), 2),
 (('experience', '

# Data Science Positions

In [35]:
page_limit = 3
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"
job_list = []
for page_num in range(1,page_limit):
    request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"
    list_url = request_url 
    # Getting response request from list 
    response = requests.get(list_url)

    list_data = response.text
    list_soup = BeautifulSoup(list_data, 'html.parser')
    page_jobs = list_soup.find_all("li")
    
    id_list = []

    for job in page_jobs:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        id_list.append(job_id)
    print(id_list)
# jobs_df = scrape_jobs(request_url, 2)
# print(jobs_df)
# (sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

# pprint(sorted_bigram[:100])

['4143307444', '4034434518', '4187176363', '4178571211', '4191815023', '4191828793', '4195038774', '4152204588', '4172905983', '4187902063']
['4034434518', '4187176363', '4178571211', '4191815023', '4191828793', '4152204588', '4172905983', '4187902063', '4184821866', '4200279036']


# Business Engineer Positions

In [27]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Business%2BEngineer&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, 2)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

[]


# Data Analyst Positions

In [24]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BAnalyst&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, 200)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

KeyboardInterrupt: 