## This is an experimental notebook to conduct analysis on data scientist positions on Linkedin

In [8]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd

## The following cell collects linkedin job data and creates a dataframe with 531 rows.

Do not rerun!! The data has been collected to a csv file 'data.csv'

In [5]:
## Creating loop to go through all pages
job_list = []
for page_num in range(1,100):
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=San%2BFrancisco%2BBay%2BArea&geoId=90000084&trk=public_jobs_jobs-search-bar_search-submit&start={page_num}" 
    # Getting response request from list 
    response = requests.get(list_url)

    list_data = response.text
    list_soup = BeautifulSoup(list_data, 'html.parser')
    page_jobs = list_soup.find_all("li")
    #10 jobs per page
    #print(len(page_jobs))
    
    ## Get job ID's from each page
    id_list = []

    for job in page_jobs:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        id_list.append(job_id)
        
    # For every job with ID, get the information
    for job_id in id_list:
        job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
        job_response = requests.get(job_url)
        if(job_response.status_code == 200):
            #print(job_response.status_code)
            job_soup = BeautifulSoup(job_response.text, "html.parser")
            job_post = {}
            try:
                job_post["company_name"] = job_soup.find( "a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
            except: 
                job_post["company_name"] = None
            try:
                job_post["job_title"] = job_soup.find( "h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
            except: 
                job_post["job_title"] = None
            try:
                job_post["job_description"] = job_soup.find( "div", {"class":"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" }).text.strip()
            except: 
                job_post["job_description"] = None
            try:
                job_post["days_ago_posted"] = job_soup.find( "span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
            except:
                job_post["days_ago_posted"] = None
            try:
                job_post["num_applicants"] = job_soup.find( "figcaption", {"class": "num-applicants__caption"}).text.strip()
            except:
                job_post["num_applicants"] = None
            job_list.append(job_post)




In [6]:
jobs_df = pd.DataFrame(job_list)
jobs_df

Unnamed: 0,company_name,job_title,job_description,days_ago_posted,num_applicants
0,Notion,"Data Scientist, Product",About UsWe're on a mission to make it possible...,1 week ago,Over 200 applicants
1,Netflix,"Machine Learning Engineer Intern, Summer 2025",Netflix is one of the world's leading entertai...,1 month ago,Over 200 applicants
2,Google,"Data Scientist, Search Discovery, Research, Se...",Minimum qualifications:Master's degree in Stat...,3 weeks ago,Over 200 applicants
3,Fractal,Data Scientist,Data ScientistFractal Analytics is a strategic...,3 days ago,Over 200 applicants
4,Fractal,Data Scientist,Data ScientistFractal Analytics is a strategic...,6 days ago,Over 200 applicants
...,...,...,...,...,...
527,SynergisticIT,: Entry Level Data Scientist,About UsSynergistic IT is a full-service staff...,4 months ago,Be among the first 25 applicants
528,SoFi,"Staff Data Scientist, Home Loans",Employee Applicant Privacy NoticeWho we are:Sh...,2 days ago,
529,Meta,"Data Scientist, Product Analytics","As a Data Scientist at Meta, you will shape th...",3 weeks ago,Be among the first 25 applicants
530,LinkedIn,Senior AI Scientist,LinkedIn is the world’s largest professional n...,2 weeks ago,Over 200 applicants


In [7]:
jobs_df.to_csv('data.csv', index=False)

## From here on we will use the data.csv file that we exported the data to

In [9]:
import os
os.getcwd() 

'/Users/toriwang/Documents/GitHub/WebScraping-Projects'

In [10]:
import sys
print(sys.executable)

/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10


In [11]:
jobs_df = pd.read_csv('job_data.csv') 

In [23]:
print(type(jobs_df))

<class 'pandas.core.frame.DataFrame'>


In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
import nltk.collocations
from nltk import BigramCollocationFinder
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [13]:
stop_words = set(stopwords.words("english"))
stop_words.add(':')
stop_words.add(',')
stop_words.add('.')
stop_words.add(', ')
stop_words.add('. ')
stop_words.add('*')
stop_words.add("'")
stop_words.add("'s")
stop_words.add('e.g')
stop_words.add('employees')
stop_words.add('applicants')
stop_words.add(')')
stop_words.add('(')
stop_words.add('eligible')
stop_words.add('participate')

In [14]:

def get_n_grams(word_list, n, top_n):
    '''
        Function to return ngrams
        word_list: pass lowercase word list (filtered for stop words)
        n: the number of words in each phrase (gram)
        top_n: top n number of matches 
    '''
    grams = list(ngrams(word_list, n))
    freq_dist = FreqDist(grams)
    
    topn = freq_dist.most_common(top_n)
    return(topn)


In [34]:
## In this cell I'm going to add the list of common tokens to the dataframe

bigram_list = []
trigram_list = []

#Create a dictionary of the most common bigrams
bigram_dict = {}
trigram_dict = {}

#for every job in job_list dataframe
for i in range(len(jobs_df)): # change to len(jobs_df)
    # print(jobs_df['job_description'][i], '\n')
    # Grab the job description
    text = jobs_df['job_description'][i]

    #Tokenize and filter unnecessary words, and put relevant words i
    tokens = word_tokenize(text)
    filtered_list = []
    for word in tokens:
        if word.casefold() not in stop_words:
            filtered_list.append(word.lower())  
    
    # For bigrams
    # add bigrams
    top8 = get_n_grams(filtered_list, 2, 8)
    bigram_list.append(top8)
    for tup in top8:
        if tup[0] not in bigram_dict:
            bigram_dict[tup[0]] = tup[1]
        else:
            bigram_dict[tup[0]] += tup[1]

    # add trigrams
    top5 = get_n_grams(filtered_list, 3, 2)
    bigram_list.append(top5)
    for tup in top5:
        if tup[0] not in bigram_dict:
            trigram_dict[tup[0]] = tup[1]
        else:
            trigram_dict[tup[0]] += tup[1]


sorted_dict_bigram = dict(sorted(bigram_dict.items(), key=lambda item: item[1],  reverse=True))
sorted_dict_trigram = dict(sorted(trigram_dict.items(), key=lambda item: item[1],  reverse=True))

In [39]:
print(len(sorted_dict_bigram))
print(sorted_dict_bigram)

648
{('machine', 'learning'): 1010, ('data', 'science'): 312, ('team', 'members'): 150, ('data', 'scientist'): 137, ('equal', 'opportunity'): 95, ('around', 'world'): 94, ('shape', 'future'): 88, ('products', 'build'): 88, ('digital', 'experiences'): 86, ('ml', 'models'): 83, ('data', 'sets'): 76, ('billions', 'people'): 74, ('people', 'hundreds'): 74, ('hundreds', 'millions'): 74, ('learning', 'models'): 66, ('deep', 'learning'): 66, ('wide', 'array'): 64, ('years', 'experience'): 61, ('learning', 'algorithms'): 53, ('cross-functional', 'teams'): 53, ('tecton', '’'): 52, ('use', 'cases'): 52, ('generative', 'ai'): 52, ('inspire', 'creativity'): 51, ('creativity', 'bring'): 51, ('bring', 'joy'): 51, ('los', 'angeles'): 51, ('adobe', '’'): 51, ('data', 'analysis'): 49, ('data', 'scientists'): 49, ('paid', 'time'): 48, ('help', 'us'): 43, ('skills', 'experience'): 42, ('’', 'genai'): 39, ('genai', 'use'): 39, ('world', 'digital'): 37, ('experiences', 'adobe'): 37, ('’', 'give'): 37, ('ye

# Archive NLP attempts

In [16]:
import nltk
from nltk import word_tokenize, ngrams
from collections import Counter

In [14]:
def find_common_phrases(df, column_name, n=2):
    """
    Finds the most common phrases in a dataframe column.

    Args:
        df (pd.DataFrame): The dataframe.
        column_name (str): The name of the column containing text.
        n (int): The number of words in a phrase (default: 2).

    Returns:
        Counter: A Counter object containing the most common phrases and their counts.
    """

    all_phrases = []

    for text in df[column_name]:
        words = word_tokenize(text)
        phrases = ngrams(words, n)
        all_phrases.extend(phrases)

    return Counter(all_phrases).most_common()


In [16]:

# Example usage:
common_phrases = find_common_phrases(jobs_df, 'job_description', 3)
print(common_phrases)

[((',', 'you', 'will'), 11), (('is', 'committed', 'to'), 10), ((',', 'color', ','), 9), ((',', 'religion', ','), 9), ((',', 'national', 'origin'), 9), (('national', 'origin', ','), 9), ((',', 'sexual', 'orientation'), 9), (('sexual', 'orientation', ','), 9), (('.', 'If', 'you'), 8), ((',', 'age', ','), 8), (('marital', 'status', ','), 7), (('veteran', 'status', ','), 7), (('for', 'this', 'role'), 7), (('range', 'for', 'this'), 7), (('in', 'Machine', 'Learning'), 7), (('not', 'limited', 'to'), 7), (('race', ',', 'color'), 6), ((',', 'marital', 'status'), 6), ((',', 'gender', 'identity'), 6), (('gender', 'identity', 'or'), 6), (('in', 'accordance', 'with'), 6), ((',', 'sex', ','), 6), (('.', 'You', 'will'), 6), (('working', 'with', 'large'), 6), (('participate', 'in', 'the'), 6), ((':', 'https', ':'), 6), (('employees', 'or', 'applicants'), 6), (('an', 'equal', 'opportunity'), 5), (('color', ',', 'religion'), 5), ((',', 'veteran', 'status'), 5), (('identity', 'or', 'expression'), 5), (('