# Imports

In [1]:
# Imports

# Webscraping Imports
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd


# ML library imports
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
import nltk.collocations
from nltk import BigramCollocationFinder
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Formatting
from pprint import pprint

# Defining Functions

In [2]:
## Webscraping functions

In [3]:
# Variables
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

In [4]:
def scrape_jobs(request_url, page_limit):
    job_list = []
    for page_num in range(1,page_limit):
        list_url = request_url 
        # Getting response request from list 
        response = requests.get(list_url)
    
        list_data = response.text
        list_soup = BeautifulSoup(list_data, 'html.parser')
        page_jobs = list_soup.find_all("li")
        #10 jobs per page
        #print(len(page_jobs))
        
        ## Get job ID's from each page
        id_list = []
    
        for job in page_jobs:
            base_card_div = job.find("div", {"class": "base-card"})
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
            
        # For every job with ID, get the information
        for job_id in id_list:
            job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
            job_response = requests.get(job_url)
            if(job_response.status_code == 200):
                #print(job_response.status_code)
                job_soup = BeautifulSoup(job_response.text, "html.parser")
                job_post = {}
                try:
                    job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
                except:
                    job_post["days_ago_posted"] = None
                # if(job_post["days_ago_posted"] != None and ("days" in job_post["days_ago_posted"] or "hours" in job_post["days_ago_posted"])): 
                try:
                    job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
                except: 
                    job_post["company_name"] = None
                try:
                    job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
                except: 
                    job_post["job_title"] = None
                try:
                    job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
                except: 
                    job_post["job_description"] = None
                try:
                    job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
                except:
                    job_post["num_applicants"] = None
                job_list.append(job_post)
    return pd.DataFrame(job_list)
        

In [5]:
jobs_df = scrape_jobs(request_url, 2)

In [6]:
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants
0,1 week ago,Notion,"Data Scientist, Growth",About UsWe're on a mission to make it possible...,Over 200 applicants
1,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",Be among the first 25 applicants
2,5 days ago,Wealthfront,"Data Scientist, Marketing",The Wealthfront Data Science team utilizes our...,
3,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",Be among the first 25 applicants
4,,LinkedIn,"Staff Data Scientist, Strategy & Insights",LinkedIn is the world’s largest professional n...,
5,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",
6,2 weeks ago,Wealthfront,"Data Scientist, Investing",The Wealthfront Data Science Team utilizes our...,Over 200 applicants
7,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",
8,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",
9,2 days ago,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",


## ML Functions

In [7]:
def create_stop_words():
    stop_words = set(stopwords.words("english"))
    stop_words.add(':')
    stop_words.add(',')
    stop_words.add('.')
    stop_words.add(', ')
    stop_words.add('. ')
    stop_words.add('*')
    stop_words.add("'")
    stop_words.add("'s")
    stop_words.add('e.g')
    stop_words.add('employees')
    stop_words.add('applicants')
    stop_words.add(')')
    stop_words.add('(')
    stop_words.add('eligible')
    stop_words.add('participate')
    stop_words.update(['contract','salary', 'range', 'sunnyvale', 'candidates', 'recruiting', 'hiring', 'fair', 'chance'])
    return(stop_words)

In [8]:
stop_words = create_stop_words()

In [9]:
def get_n_grams(word_list, n, top_n):
    '''
        Function to return ngrams
        word_list: pass lowercase word list (filtered for stop words)
        n: the number of words in each phrase (gram)
        top_n: top n number of matches 
    '''
    grams = list(ngrams(word_list, n))
    freq_dist = FreqDist(grams)
    
    topn = freq_dist.most_common(top_n)
    return(topn)

In [10]:
def return_n_grams():
    bigram_list = []
    # trigram_list = []
    
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    # trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # For bigrams
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    return (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [11]:
def get_bigrams_trigrams(jobs_df):

    bigram_list = []
    trigram_list = []
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    
        # add trigrams
        top5 = get_n_grams(filtered_list, 3, 2)
        bigram_list.append(top5)
        for tup in top5:
            if tup[0] not in bigram_dict:
                trigram_dict[tup[0]] = tup[1]
            else:
                trigram_dict[tup[0]] += tup[1]
    
    sorted_bigram = (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))
    sorted_trigram = (sorted(trigram_dict.items(), key=lambda item: item[1],  reverse=True))
    return(sorted_bigram, sorted_trigram)

In [12]:
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

In [13]:
pprint(sorted_bigram[:100])

[(('shape', 'future'), 12),
 (('products', 'build'), 12),
 (('data', 'sets'), 12),
 (('billions', 'people'), 12),
 (('people', 'hundreds'), 12),
 (('hundreds', 'millions'), 12),
 (('around', 'world'), 12),
 (('wide', 'array'), 12),
 (('wealthfront', 'brokerage'), 10),
 (('wealthfront', 'advisers'), 8),
 (('cash', 'account'), 6),
 (('data', 'science'), 6),
 (('computer', 'science'), 4),
 (('marketing', 'team'), 3),
 (('data', 'scientists'), 3),
 (('linkedin', 'committed'), 3),
 (('cross-functional', 'teams'), 3),
 (('project', 'plans'), 3),
 (('data', 'scientist'), 2),
 (('statistical', 'inference'), 2),
 (('inference', 'experimentation'), 2),
 (('committed', 'providing'), 2),
 (('natural', 'sciences'), 2),
 (('backgrounds', 'experiences'), 2),
 (('economic', 'opportunity'), 2),
 (('opportunity', 'every'), 2),
 (('every', 'member'), 2),
 (('member', 'global'), 2),
 (('global', 'workforce'), 2),
 (('science', 'team'), 2),
 (('science', 'statistics'), 2),
 (('uswe', "'re"), 1),
 (("'re", 

# Business Analyst Positions

In [20]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Business%2BAnalyst&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, page_limit)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])



[(('business', 'analyst'), 1059),
 (('cutting-edge', 'technology'), 468),
 (('government', 'innovation'), 468),
 (('new', 'existing'), 468),
 (('requirements', 'develop'), 468),
 (('team', 'members'), 468),
 (('products', 'services'), 468),
 (('must', 'able'), 468),
 (('government', 'services'), 468),
 (('salesforce', 'business'), 357),
 (('tiktok', 'monetization'), 351),
 (('data', 'science'), 351),
 (('decision', 'making'), 351),
 (('los', 'angeles'), 351),
 (('tiktok', 'mission'), 351),
 (('inspire', 'creativity'), 351),
 (('creativity', 'bring'), 351),
 (('bring', 'joy'), 351),
 (('computer', 'science/statistics'), 246),
 (('software', 'operations'), 238),
 (('operations', 'business'), 238),
 (('applicable', 'systems'), 238),
 (('software', 'support'), 238),
 (('support', 'services'), 238),
 (('senior', 'salesforce'), 238),
 (('teams', 'deliver'), 238),
 (('5v', 'tech'), 238),
 (('$', '250'), 238),
 (('data', 'analysis'), 236),
 (('machine', 'learning'), 236),
 (('opportunity', 'of

# Data Science Positions

In [21]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, page_limit)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

[(('shape', 'future'), 1306),
 (('products', 'build'), 1306),
 (('data', 'sets'), 1306),
 (('billions', 'people'), 1306),
 (('people', 'hundreds'), 1306),
 (('hundreds', 'millions'), 1306),
 (('around', 'world'), 1306),
 (('wide', 'array'), 1306),
 (('wealthfront', 'brokerage'), 1135),
 (('wealthfront', 'advisers'), 908),
 (('data', 'science'), 691),
 (('cash', 'account'), 681),
 (('computer', 'science'), 454),
 (('project', 'plans'), 342),
 (('marketing', 'team'), 339),
 (('data', 'scientists'), 339),
 (('linkedin', 'committed'), 324),
 (('cross-functional', 'teams'), 324),
 (('data', 'scientist'), 228),
 (('statistical', 'inference'), 228),
 (('inference', 'experimentation'), 228),
 (('committed', 'providing'), 228),
 (('science', 'team'), 228),
 (('science', 'statistics'), 228),
 (('natural', 'sciences'), 226),
 (('backgrounds', 'experiences'), 216),
 (('economic', 'opportunity'), 216),
 (('opportunity', 'every'), 216),
 (('every', 'member'), 216),
 (('member', 'global'), 216),
 (('

# Business Engineer Positions

In [22]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Business%2BEngineer&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, page_limit)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

[(('product', 'engineering'), 636),
 (('engineering', 'teams'), 562),
 (('meet', 'business'), 429),
 (('open', 'source'), 418),
 (('business', 'development'), 414),
 (('field', 'service'), 396),
 (('development', 'manager'), 348),
 (('business', 'engineering'), 286),
 (('meta', 'platforms'), 286),
 (('partners', 'meet'), 286),
 (('code', 'reviews'), 286),
 (('qualifications', 'experience'), 278),
 (('supply', 'chain'), 260),
 (('cross-functional', 'teams'), 216),
 (('utilities', 'energy'), 198),
 (('service', 'management'), 198),
 (('fsm', 'solutions'), 198),
 (('business', 'processes'), 198),
 (('skills', 'ability'), 198),
 (('reset', 'initiative'), 195),
 (('pdt', 'procurement'), 195),
 (('procurement', 'factory'), 195),
 (('application', 'deadline'), 195),
 (('business', 'analysis'), 192),
 (('ai', 'partnerships'), 144),
 (('closely', 'product'), 144),
 (('computer', 'science/statistics'), 142),
 (('machine', 'learning'), 138),
 (('business', 'operations'), 136),
 (('operations', 's

# Data Analyst Positions

In [23]:
page_limit = 200

request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BAnalyst&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, page_limit)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

[(('years', 'experience'), 104),
 (('adobe', 'analytics'), 99),
 (('tiktok', 'mission'), 66),
 (('inspire', 'creativity'), 66),
 (('creativity', 'bring'), 66),
 (('bring', 'joy'), 66),
 (('los', 'angeles'), 66),
 (('usa', ';'), 54),
 (('data', 'sets'), 54),
 (('business', 'data'), 54),
 (('new', 'ims'), 45),
 (('marketing', 'partners'), 39),
 (('data', 'collection'), 36),
 (('report', 'suites'), 36),
 (('calculated', 'metrics'), 36),
 (('metrics', 'segments'), 36),
 (('adobe', 'launch'), 36),
 (('ca', 'usa'), 36),
 (('’', 'degree'), 36),
 (('etc', '.3'), 36),
 (('.3', 'years'), 36),
 (('measurement', 'research'), 33),
 (('partner', 'measurement'), 33),
 (('global', 'partnerships'), 32),
 (('grow', 'businesses'), 32),
 (('marketing', 'strategies'), 32),
 (('product', 'legal'), 32),
 (('impact', 'marketing'), 32),
 (('local', 'san'), 30),
 (('san', 'francisco'), 30),
 (('francisco', 'bay'), 30),
 (('adobe', 'experience'), 27),
 (('effect', 'products'), 27),
 (('data', 'analysis'), 27),
 