# Imports

In [15]:
# Imports

# Webscraping Imports
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd


# ML library imports
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
import nltk.collocations
from nltk import BigramCollocationFinder
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Formatting
from pprint import pprint

# Defining Functions

In [None]:
## Webscraping functions

In [6]:
# Variables
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

In [4]:
def scrape_jobs(request_url, page_limit):
    job_list = []
    for page_num in range(1,page_limit):
        list_url = request_url 
        # Getting response request from list 
        response = requests.get(list_url)
    
        list_data = response.text
        list_soup = BeautifulSoup(list_data, 'html.parser')
        page_jobs = list_soup.find_all("li")
        #10 jobs per page
        #print(len(page_jobs))
        
        ## Get job ID's from each page
        id_list = []
    
        for job in page_jobs:
            base_card_div = job.find("div", {"class": "base-card"})
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
            
        # For every job with ID, get the information
        for job_id in id_list:
            job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
            job_response = requests.get(job_url)
            if(job_response.status_code == 200):
                #print(job_response.status_code)
                job_soup = BeautifulSoup(job_response.text, "html.parser")
                job_post = {}
                try:
                    job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
                except:
                    job_post["days_ago_posted"] = None
                if(job_post["days_ago_posted"] != None and ("days" in job_post["days_ago_posted"] or "hours" in job_post["days_ago_posted"])): 
                    try:
                        job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
                    except: 
                        job_post["company_name"] = None
                    try:
                        job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
                    except: 
                        job_post["job_title"] = None
                    try:
                        job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
                    except: 
                        job_post["job_description"] = None
                    try:
                        job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
                    except:
                        job_post["num_applicants"] = None
                    job_list.append(job_post)
    return pd.DataFrame(job_list)
        

In [8]:
jobs_df = scrape_jobs(request_url, 2)

In [9]:
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants
0,3 days ago,Wealthfront,"Data Scientist, Marketing",The Wealthfront Data Science team utilizes our...,Over 200 applicants
1,4 days ago,Notion,"Data Scientist, Growth",About UsWe're on a mission to make it possible...,Over 200 applicants
2,5 days ago,Persistent Systems,Data Scientist,About PersistentWe are a trusted Digital Engin...,Over 200 applicants
3,6 days ago,LinkedIn,AI Engineer,LinkedIn is the world’s largest professional n...,Over 200 applicants
4,2 days ago,Brex,"Data Scientist II, Credit",Why join usBrex is the AI-powered spend platfo...,


## ML Functions

In [10]:
def create_stop_words():
    stop_words = set(stopwords.words("english"))
    stop_words.add(':')
    stop_words.add(',')
    stop_words.add('.')
    stop_words.add(', ')
    stop_words.add('. ')
    stop_words.add('*')
    stop_words.add("'")
    stop_words.add("'s")
    stop_words.add('e.g')
    stop_words.add('employees')
    stop_words.add('applicants')
    stop_words.add(')')
    stop_words.add('(')
    stop_words.add('eligible')
    stop_words.add('participate')
    return(stop_words)

In [11]:
stop_words = create_stop_words()

In [12]:
def get_n_grams(word_list, n, top_n):
    '''
        Function to return ngrams
        word_list: pass lowercase word list (filtered for stop words)
        n: the number of words in each phrase (gram)
        top_n: top n number of matches 
    '''
    grams = list(ngrams(word_list, n))
    freq_dist = FreqDist(grams)
    
    topn = freq_dist.most_common(top_n)
    return(topn)

In [13]:
def return_n_grams():
    bigram_list = []
    # trigram_list = []
    
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    # trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # For bigrams
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    return (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [18]:
def get_bigrams_trigrams(jobs_df):

    bigram_list = []
    trigram_list = []
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    
        # add trigrams
        top5 = get_n_grams(filtered_list, 3, 2)
        bigram_list.append(top5)
        for tup in top5:
            if tup[0] not in bigram_dict:
                trigram_dict[tup[0]] = tup[1]
            else:
                trigram_dict[tup[0]] += tup[1]
    
    sorted_bigram = (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))
    sorted_trigram = (sorted(trigram_dict.items(), key=lambda item: item[1],  reverse=True))
    return(sorted_bigram, sorted_trigram)

In [19]:
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

In [20]:
pprint(sorted_bigram[:100])

[(('credit', 'risk'), 8),
 (('data', 'science'), 6),
 (('machine', 'learning'), 6),
 (('wealthfront', 'brokerage'), 5),
 (('wealthfront', 'advisers'), 4),
 (('data', 'scientist'), 4),
 (('marketing', 'team'), 3),
 (('data', 'scientists'), 3),
 (('cash', 'account'), 3),
 (('1+', 'years'), 3),
 (('equal', 'opportunity'), 3),
 (('risk', 'management'), 3),
 (('computer', 'science'), 2),
 (('natural', 'sciences'), 2),
 (('statistical', 'inference'), 2),
 (('inference', 'experimentation'), 2),
 (('committed', 'providing'), 2),
 (('see', 'beyond'), 2),
 (('beyond', 'rise'), 2),
 (('nifty', 'midcap'), 2),
 (('ability', 'go'), 2),
 (('go', 'deep'), 2),
 (('deep', 'complex'), 2),
 (('complex', 'vague'), 2),
 (('role', 'based'), 2),
 (('based', 'sunnyvale'), 2),
 (('sunnyvale', 'san'), 2),
 (('san', 'francisco'), 2),
 (('francisco', 'bellevue'), 2),
 (('using', 'data'), 2),
 (('closely', 'credit'), 2),
 (('credit', 'enterprise'), 2),
 (('enterprise', 'risk'), 2),
 (('analytical', 'frameworks'), 2

# Business Analyst Positions

In [23]:
page_limit = 200
page_num = 1
request_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Business%2BAnalyst&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}"

jobs_df = scrape_jobs(request_url, 200)
(sorted_bigram, sorted_trigram) = get_bigrams_trigrams(jobs_df)

pprint(sorted_bigram[:100])

[(('solutions', 'inc.'), 183),
 (('regression', 'testing'), 122),
 (('business', 'analystw2'), 61),
 (('analystw2', 'contractsalary'), 61),
 (('contractsalary', 'range'), 61),
 (('range', '$'), 61),
 (('$', '83,200'), 61),
 (('83,200', '-'), 61),
 (('computer', 'science/statistics'), 56),
 (('sunnyvale', 'ca-hybridmust'), 28),
 (('ca-hybridmust', 'skills-need'), 28),
 (('skills-need', 'candidates'), 28),
 (('candidates', 'strong'), 28),
 (('strong', 'sql/python'), 28),
 (('sql/python', 'background'), 28),
 (('background', 'well'), 28),
 (('marketing', 'strategy'), 10),
 (('product', 'marketing'), 8),
 (('b2b', 'marketing'), 6),
 (('content', 'strategy'), 6),
 (('recruiting', 'hiring'), 6),
 (('chemistry', 'cheminformatics'), 5),
 (('online', 'travel'), 4),
 (('strategy', 'analyst'), 4),
 (('analyst', 'b2b'), 4),
 (('b2b', 'content'), 4),
 (('small', 'molecule'), 4),
 (('molecule', 'design'), 4),
 (('years', 'chemistry'), 4),
 (('cheminformatics', 'life'), 4),
 (('life', 'sciences'), 4)