## This notebook is a clone of "Analysis of Data Science Positions on LinkedIn"

In this notebook, I try to do a weekly refresh of jobs that have been posted only in the last week.

In [6]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd

## The following cell collects linkedin job data and only adds jobs that have been posted in the past week to the dataframe

In [11]:
## Creating loop to go through all pages
job_list = []
for page_num in range(1,200):
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}" 
    # Getting response request from list 
    response = requests.get(list_url)

    list_data = response.text
    list_soup = BeautifulSoup(list_data, 'html.parser')
    page_jobs = list_soup.find_all("li")
    #10 jobs per page
    #print(len(page_jobs))
    
    ## Get job ID's from each page
    id_list = []

    for job in page_jobs:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        id_list.append(job_id)
        
    # For every job with ID, get the information
    for job_id in id_list:
        job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
        job_response = requests.get(job_url)
        if(job_response.status_code == 200):
            #print(job_response.status_code)
            job_soup = BeautifulSoup(job_response.text, "html.parser")
            job_post = {}
            try:
                job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
            except:
                job_post["days_ago_posted"] = None
            if(job_post["days_ago_posted"] != None and ("days" in job_post["days_ago_posted"] or "hours" in job_post["days_ago_posted"])): 
                try:
                    job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
                except: 
                    job_post["company_name"] = None
                try:
                    job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
                except: 
                    job_post["job_title"] = None
                try:
                    job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
                except: 
                    job_post["job_description"] = None
                try:
                    job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
                except:
                    job_post["num_applicants"] = None
                job_list.append(job_post)




In [12]:
jobs_df = pd.DataFrame(job_list)
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants
0,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants
1,4 days ago,Notion,"Data Scientist, Finance",About UsWe're on a mission to make it possible...,Over 200 applicants
2,6 days ago,Ikigai,AI/ML Engineer,Company DescriptionThe Ikigai platform unlocks...,Over 200 applicants
3,5 days ago,LinkedIn,"Staff Data Scientist, Strategy & Insights",LinkedIn is the world’s largest professional n...,Over 200 applicants
4,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants
...,...,...,...,...,...
150,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants
151,5 days ago,Bosch USA,AI Research Scientist – GenAI,Company DescriptionThe Bosch Research and Tech...,
152,4 days ago,Scion Staffing,AI/ML Engineer,Scion Technology Staffing has been engaged to ...,
153,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants


In [13]:
jobs_df.to_csv('data_01222025.csv', index=False)

## From here on we will use the data_01222025.csv file that we exported the data to

In [14]:
import os
import pandas as pd
os.getcwd() 

'/Users/toriwang/Documents/GitHub/WebScraping-Projects'

In [15]:
import sys
print(sys.executable)

/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10


In [16]:
jobs_df = pd.read_csv('data_01222025.csv') 

In [17]:
print(type(jobs_df))

<class 'pandas.core.frame.DataFrame'>


In [39]:
#getting intern positions
jobs_df['Intern_position'] = (jobs_df['job_title'].str.contains('Intern')
                                       .map({True : 'yes', False : 'no'}))

In [41]:
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants,Intern_position
0,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants,yes
1,4 days ago,Notion,"Data Scientist, Finance",About UsWe're on a mission to make it possible...,Over 200 applicants,no
2,6 days ago,Ikigai,AI/ML Engineer,Company DescriptionThe Ikigai platform unlocks...,Over 200 applicants,no
3,5 days ago,LinkedIn,"Staff Data Scientist, Strategy & Insights",LinkedIn is the world’s largest professional n...,Over 200 applicants,no
4,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants,yes
...,...,...,...,...,...,...
150,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants,yes
151,5 days ago,Bosch USA,AI Research Scientist – GenAI,Company DescriptionThe Bosch Research and Tech...,,no
152,4 days ago,Scion Staffing,AI/ML Engineer,Scion Technology Staffing has been engaged to ...,,no
153,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants,yes


# Machine Learning Techniques

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
import nltk.collocations
from nltk import BigramCollocationFinder
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [13]:
stop_words = set(stopwords.words("english"))
stop_words.add(':')
stop_words.add(',')
stop_words.add('.')
stop_words.add(', ')
stop_words.add('. ')
stop_words.add('*')
stop_words.add("'")
stop_words.add("'s")
stop_words.add('e.g')
stop_words.add('employees')
stop_words.add('applicants')
stop_words.add(')')
stop_words.add('(')
stop_words.add('eligible')
stop_words.add('participate')

In [14]:

def get_n_grams(word_list, n, top_n):
    '''
        Function to return ngrams
        word_list: pass lowercase word list (filtered for stop words)
        n: the number of words in each phrase (gram)
        top_n: top n number of matches 
    '''
    grams = list(ngrams(word_list, n))
    freq_dist = FreqDist(grams)
    
    topn = freq_dist.most_common(top_n)
    return(topn)


In [15]:
def return_n_grams():
    bigram_list = []
    # trigram_list = []
    
    #Create a dictionary of the most common bigrams
    bigram_dict = {}
    # trigram_dict = {}
    
    #for every job in job_list dataframe
    for i in range(len(jobs_df)): # change to len(jobs_df)
        # print(jobs_df['job_description'][i], '\n')
        # Grab the job description
        text = jobs_df['job_description'][i]
    
        #Tokenize and filter unnecessary words, and put relevant words i
        tokens = word_tokenize(text)
        filtered_list = []
        for word in tokens:
            if word.casefold() not in stop_words:
                filtered_list.append(word.lower())  
        
        # For bigrams
        # add bigrams
        top8 = get_n_grams(filtered_list, 2, 8)
        bigram_list.append(top8)
        for tup in top8:
            if tup[0] not in bigram_dict:
                bigram_dict[tup[0]] = tup[1]
            else:
                bigram_dict[tup[0]] += tup[1]
    return (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [21]:
## In this cell I'm going to add the list of common tokens to the dataframe

bigram_list = []
trigram_list = []

#Create a dictionary of the most common bigrams
bigram_dict = {}
trigram_dict = {}

#for every job in job_list dataframe
for i in range(len(jobs_df)): # change to len(jobs_df)
    # print(jobs_df['job_description'][i], '\n')
    # Grab the job description
    text = jobs_df['job_description'][i]

    #Tokenize and filter unnecessary words, and put relevant words i
    tokens = word_tokenize(text)
    filtered_list = []
    for word in tokens:
        if word.casefold() not in stop_words:
            filtered_list.append(word.lower())  
    
    # For bigrams
    # add bigrams
    top8 = get_n_grams(filtered_list, 2, 8)
    bigram_list.append(top8)
    for tup in top8:
        if tup[0] not in bigram_dict:
            bigram_dict[tup[0]] = tup[1]
        else:
            bigram_dict[tup[0]] += tup[1]

    # add trigrams
    top5 = get_n_grams(filtered_list, 3, 2)
    bigram_list.append(top5)
    for tup in top5:
        if tup[0] not in bigram_dict:
            trigram_dict[tup[0]] = tup[1]
        else:
            trigram_dict[tup[0]] += tup[1]


In [22]:
sorted_bigram = (sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))
sorted_trigram = (sorted(trigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [23]:
from pprint import pprint
pprint(sorted_bigram[:100])

[(('machine', 'learning'), 223),
 (('data', 'science'), 93),
 (('equal', 'opportunity'), 53),
 (('deep', 'learning'), 40),
 (('model', 'training'), 36),
 (('cross-functional', 'teams'), 35),
 (('usa', ';'), 35),
 (('&', 'ai'), 30),
 (('years', 'experience'), 28),
 (('intercom', '’'), 27),
 (('fp', '&'), 27),
 (('tecton', '’'), 24),
 (('use', 'cases'), 24),
 (('least', '18+'), 24),
 (('18+', 'years'), 24),
 (('enrolled', 'full-time'), 24),
 (('full-time', 'student'), 24),
 (('generative', 'ai'), 24),
 (('ikigai', 'labs'), 21),
 (('linkedin', 'committed'), 21),
 (('opportunity', 'employer'), 21),
 (('google', '’'), 21),
 (('insight', 'cloud'), 21),
 (('analytics', 'apps'), 21),
 (('business', 'metrics'), 20),
 (('new', 'york'), 18),
 (('’', 'genai'), 18),
 (('genai', 'use'), 18),
 (('learning', 'ai'), 18),
 (('founding', 'team'), 18),
 (('deployment', 'monitoring'), 18),
 (('us', '!'), 18),
 (('eds', '&'), 18),
 (('voice', 'ai'), 18),
 (('ai', 'technology'), 18),
 (('ai', 'models'), 18),

# Are there any jobs without over 200 applicants lol

In [42]:
jobs_df.loc[jobs_df['num_applicants'] != "Over 200 applicants"]


Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants,Intern_position
35,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
40,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
43,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
46,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
48,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
60,4 days ago,Scion Staffing,AI/ML Engineer,Scion Technology Staffing has been engaged to ...,,no
66,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
67,6 days ago,seer,AI/ML Researcher,We are currently working with a Biotech Unicor...,,no
73,4 days ago,Diamond Foundry,Data Scientist,Diamond Foundry Inc. is solving the thermal li...,,no
80,5 days ago,Google,"Data Scientist, Research, Power Forecasting an...",Note: By applying to this position you will ha...,,no


In [43]:
jobs_df[jobs_df["Intern_position"].str.contains("no")]


Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants,Intern_position
1,4 days ago,Notion,"Data Scientist, Finance",About UsWe're on a mission to make it possible...,Over 200 applicants,no
2,6 days ago,Ikigai,AI/ML Engineer,Company DescriptionThe Ikigai platform unlocks...,Over 200 applicants,no
3,5 days ago,LinkedIn,"Staff Data Scientist, Strategy & Insights",LinkedIn is the world’s largest professional n...,Over 200 applicants,no
5,4 days ago,Notion,"Data Scientist, Finance",About UsWe're on a mission to make it possible...,Over 200 applicants,no
6,6 days ago,Ikigai,AI/ML Engineer,Company DescriptionThe Ikigai platform unlocks...,Over 200 applicants,no
...,...,...,...,...,...,...
148,6 days ago,Saicon,Machine Learning Artificial Intelligence Engineer,Need : Machine Learning Artificial Intelligenc...,Over 200 applicants,no
149,5 days ago,Foundation Model Startup,AI Engineer,Our workWe are a well capitalized stealth VC-b...,Over 200 applicants,no
151,5 days ago,Bosch USA,AI Research Scientist – GenAI,Company DescriptionThe Bosch Research and Tech...,,no
152,4 days ago,Scion Staffing,AI/ML Engineer,Scion Technology Staffing has been engaged to ...,,no


# Weekly Refresh 3/17/2024

In [4]:
def scrape_jobs():
    job_list = []
    for page_num in range(1,200):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}" 
        # Getting response request from list 
        response = requests.get(list_url)
    
        list_data = response.text
        list_soup = BeautifulSoup(list_data, 'html.parser')
        page_jobs = list_soup.find_all("li")
        #10 jobs per page
        #print(len(page_jobs))
        
        ## Get job ID's from each page
        id_list = []
    
        for job in page_jobs:
            base_card_div = job.find("div", {"class": "base-card"})
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
            
        # For every job with ID, get the information
        for job_id in id_list:
            job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
            job_response = requests.get(job_url)
            if(job_response.status_code == 200):
                #print(job_response.status_code)
                job_soup = BeautifulSoup(job_response.text, "html.parser")
                job_post = {}
                try:
                    job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
                except:
                    job_post["days_ago_posted"] = None
                if(job_post["days_ago_posted"] != None and ("days" in job_post["days_ago_posted"] or "hours" in job_post["days_ago_posted"])): 
                    try:
                        job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
                    except: 
                        job_post["company_name"] = None
                    try:
                        job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
                    except: 
                        job_post["job_title"] = None
                    try:
                        job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
                    except: 
                        job_post["job_description"] = None
                    try:
                        job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
                    except:
                        job_post["num_applicants"] = None
                    job_list.append(job_post)
    return pd.DataFrame(job_list)
        

In [7]:
jobs_df = scrape_jobs()

In [8]:
jobs_df.to_csv('data_20250317.csv', index=False)

In [9]:
import os
import pandas as pd
os.getcwd() 

'/Users/toriwang/Documents/GitHub/WebScraping-Projects'

In [10]:
jobs_df = pd.read_csv('data_01222025.csv') 

In [11]:
jobs_df

Unnamed: 0,days_ago_posted,company_name,job_title,job_description,num_applicants
0,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants
1,4 days ago,Notion,"Data Scientist, Finance",About UsWe're on a mission to make it possible...,Over 200 applicants
2,6 days ago,Ikigai,AI/ML Engineer,Company DescriptionThe Ikigai platform unlocks...,Over 200 applicants
3,5 days ago,LinkedIn,"Staff Data Scientist, Strategy & Insights",LinkedIn is the world’s largest professional n...,Over 200 applicants
4,2 days ago,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,Over 200 applicants
...,...,...,...,...,...
150,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants
151,5 days ago,Bosch USA,AI Research Scientist – GenAI,Company DescriptionThe Bosch Research and Tech...,
152,4 days ago,Scion Staffing,AI/ML Engineer,Scion Technology Staffing has been engaged to ...,
153,4 days ago,Verily,Data Scientist Evaluation Intern,Who We AreVerily is a subsidiary of Alphabet t...,Over 200 applicants
