In [None]:
#Libraries needed
import os
import re

import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
# Make sure the NLTK stopwords corpus is available before it is read below
nltk.download('stopwords')

# English stopwords held as a set for fast membership tests during text cleaning
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [49]:
#Libraries needed part 2
from collections import Counter
from nltk.tokenize import word_tokenize
# Download NLTK tokenizer resources; presumably required by word_tokenize (imported above) — TODO confirm
nltk.download('punkt')
# The value of this last call is what the cell displays as its output
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:

# Read the SerpApi key from the environment so the secret is never stored in the notebook.
# Set it before starting Jupyter, e.g. `export SERP_API_KEY=<your key>`.
# NOTE(review): the key that used to be hard-coded here has been exposed and should be rotated.
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")
if not SERP_API_KEY:
    print("⚠️ SERP_API_KEY environment variable is not set — requests will be sent without a key.")

# Web-scraping function
def fetch_serpapi_jobs_v2(query, location="California", pages=10, timeout=30):
    """Collect job postings for ``query`` from SerpApi's ``google_jobs`` engine.

    Pages are requested one after another from https://serpapi.com/search,
    passing along the ``next_page_token`` returned with the previous response.
    Collection stops early when a page has no ``jobs_results`` or when the
    response carries no further token.

    Args:
        query: Search phrase; also stored in the ``role`` column of every row.
        location: Value sent as the ``location`` request parameter.
        pages: Maximum number of pages to request.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up and raises.

    Returns:
        pandas.DataFrame with the columns ``title``, ``company``,
        ``location``, ``description`` and ``role``. The columns are present
        even when no job was collected, so downstream column access works.

    Raises:
        Exceptions raised by ``requests.get`` (including timeouts) and by
        ``response.json()`` are not caught and propagate to the caller.
    """
    columns = ["title", "company", "location", "description", "role"]
    job_listings = []
    next_token = None

    for i in range(pages):
        print(f"🔎 Fetching page {i+1} of {pages} for '{query}'...")
        params = {
            "engine": "google_jobs",
            "q": query,
            "location": location,
            "api_key": SERP_API_KEY
        }
        if next_token:
            params["next_page_token"] = next_token

        # Without a timeout a stalled connection would hang the notebook indefinitely.
        response = requests.get("https://serpapi.com/search", params=params, timeout=timeout)
        results = response.json()

        jobs = results.get("jobs_results", [])
        if not jobs:
            print(f"⚠️ No jobs returned on page {i+1}")
            # Surface the API's own explanation (e.g. a rejected key) instead of hiding it.
            api_error = results.get("error")
            if api_error:
                print(f"   SerpApi reported: {api_error}")
            break

        for job in jobs:
            job_listings.append({
                "title": job.get("title"),
                "company": job.get("company_name"),
                "location": job.get("location"),
                "description": job.get("description"),
                "role": query
            })

        next_token = results.get("serpapi_pagination", {}).get("next_page_token")
        if not next_token:
            print("🚫 No next_page_token available — stopping early.")
            break

    print(f"✅ Total '{query}' jobs collected: {len(job_listings)}\n")
    # Fixing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(job_listings, columns=columns)

In [None]:
# Data analyst postings in California, limited to 10 result pages
df_jobs = fetch_serpapi_jobs_v2(query="data analyst", location="California", pages=10)

# Preview the first rows of the data analyst frame
df_jobs.head()

In [None]:
# Software engineer postings in California, limited to 10 result pages
df_software_engineer = fetch_serpapi_jobs_v2(
    query="software engineer",
    location="California",
    pages=10,
)

🔎 Fetching page 1 of 10...
🔧 First page response:
{'search_metadata': {'id': '6850e84a69819d4482d2c707', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/5ffe16477b27ed4a/6850e84a69819d4482d2c707.json', 'created_at': '2025-06-17 04:00:10 UTC', 'processed_at': '2025-06-17 04:00:10 UTC', 'google_jobs_url': 'https://www.google.com/search?q=software+engineer&uule=w+CAIQICIYQ2FsaWZvcm5pYSxVbml0ZWQgU3RhdGVz&udm=8', 'raw_html_file': 'https://serpapi.com/searches/5ffe16477b27ed4a/6850e84a69819d4482d2c707.html', 'total_time_taken': 1.85}, 'search_parameters': {'q': 'software engineer', 'engine': 'google_jobs', 'location_requested': 'California', 'location_used': 'California,United States', 'google_domain': 'google.com'}, 'filters': [{'name': 'Remote', 'parameters': {'uds': 'AOm0WdE2fekQnsyfYEw8JPYozOKz-xywMEu-8-IR2WDrKAvs-ID4VWZh0s0c-CqDMhlveoRRJifJeHyrzUFggYf9pZ90qdQYGsdGR8xRf4oMkzUfNdyzR0Lh5xsSHylR4ew7jgj59Tf-QTlz7IBgSCatqgvrBZ4vQfUkiKd3ar2V9dcvfJIks78', 'q': 'software engi

In [None]:
# Accountant postings in California, limited to 10 result pages
df_accountant = fetch_serpapi_jobs_v2(
    query="accountant",
    location="California",
    pages=10,
)

🔎 Fetching page 1 of 10...
🔧 First page response:
{'search_metadata': {'id': '6850e86dd868dd171a6b9450', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/aeca25e5ec1f41bf/6850e86dd868dd171a6b9450.json', 'created_at': '2025-06-17 04:00:45 UTC', 'processed_at': '2025-06-17 04:00:45 UTC', 'google_jobs_url': 'https://www.google.com/search?q=accountant&uule=w+CAIQICIYQ2FsaWZvcm5pYSxVbml0ZWQgU3RhdGVz&udm=8', 'raw_html_file': 'https://serpapi.com/searches/aeca25e5ec1f41bf/6850e86dd868dd171a6b9450.html', 'total_time_taken': 1.29}, 'search_parameters': {'q': 'accountant', 'engine': 'google_jobs', 'location_requested': 'California', 'location_used': 'California,United States', 'google_domain': 'google.com'}, 'filters': [{'name': 'Salary', 'parameters': {'uds': 'AOm0WdE2fekQnsyfYEw8JPYozOKzr38oZxHxP9RopcsEYRxEIdsha8e_4d-50ZjinnSFz9RbS0GLQTMPFeOcuL7aJ2zY8Qigwt3xQfjj_kUz4byTVg3M8sRiZiii3d-pk4wv5BExax44JjCa8N6WxsK73xpSen0_y1XDzbm3E7kh665Np40k2YzIKsovKPpvzBxtiWieRTBR', 'q': 'Accoun

In [None]:
# Tag every frame with the role it was collected for
for labelled_frame, role_label in (
    (df_software_engineer, "software engineer"),
    (df_accountant, "accountant"),
    (df_jobs, "data analyst"),
):
    labelled_frame["role"] = role_label

In [None]:
# Stack the three role frames into one frame with a fresh 0..n-1 index
df_combined = pd.concat(
    objs=[df_software_engineer, df_accountant, df_jobs],
    ignore_index=True,
)

In [None]:
# Step 1 Cleaning and Normalizing Data
def clean_text(text, stop_word_set=None):
    """Normalize a job description into a lowercase, stopword-free string.

    Args:
        text: Raw description. Anything that is not a ``str`` (for example
            ``None`` or NaN for a posting without a description) is treated
            as empty text.
        stop_word_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words``.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing is left.
    """
    # A missing description must not crash the whole .apply() pass.
    if not isinstance(text, str):
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words

    # Turn every whitespace run (newlines, tabs, carriage returns) into a space
    # first, so the character filter below cannot glue neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)  # remove punctuation/numbers
    tokens = text.lower().split()  # lowercasing
    return ' '.join(word for word in tokens if word not in stop_word_set)  # stopword removal

In [None]:
# Step 1 (applied): run clean_text over every description and keep the result in a new column
df_combined['clean_description'] = df_combined['description'].map(clean_text)

In [50]:
# Step 2: split each cleaned description into a list of tokens
df_combined['tokens'] = df_combined['clean_description'].map(word_tokenize)

In [51]:
# Step 3 Basic Descriptive Stats
def descriptive_stats(tokens, num_tokens_to_show=5, verbose=True):
    """Summarise a token list and optionally print the summary.

    Args:
        tokens: Tokens to summarise.
        num_tokens_to_show: How many of the most frequent tokens to print.
        verbose: When true, print the figures and the most common tokens.

    Returns:
        ``[token count, unique token count, lexical diversity, character count]``;
        ``[0, 0, 0.0, 0]`` when ``tokens`` is empty.
    """
    if not tokens:
        if verbose:
            print("Token list is empty.")
        return [0, 0, 0.0, 0]

    total = len(tokens)
    distinct = len(set(tokens))
    char_total = sum(map(len, tokens))
    # Share of distinct tokens among all tokens.
    diversity = 0.0 if total <= 0 else distinct / total
    summary = [total, distinct, diversity, char_total]

    if not verbose:
        return summary

    report = [
        f"There are {total} tokens in the data.",
        f"There are {distinct} unique tokens in the data.",
        f"There are {char_total} characters in the data.",
        f"The lexical diversity is {diversity:.3f} in the data.",
        f"\nTop {num_tokens_to_show} most common tokens:",
    ]
    report.extend(
        f"{token}: {count}"
        for token, count in Counter(tokens).most_common(num_tokens_to_show)
    )
    print("\n".join(report))
    return summary

In [52]:
# Step 3 (applied): print descriptive statistics separately for each role
for role, group in df_combined.groupby('role'):
    # Flatten the per-posting token lists into one list for the whole role
    all_tokens = []
    for posting_tokens in group['tokens']:
        all_tokens.extend(posting_tokens)
    print(f"\n📊 Descriptive Stats for Role: {role.upper()}")
    descriptive_stats(all_tokens, num_tokens_to_show=10)


📊 Descriptive Stats for Role: ACCOUNTANT
There are 36571 tokens in the data.
There are 4602 unique tokens in the data.
There are 278939 characters in the data.
The lexical diversity is 0.126 in the data.

Top 10 most common tokens:
accounting: 680
financial: 499
accountant: 369
experience: 343
work: 254
including: 203
skills: 196
reporting: 169
related: 168
accounts: 162

📊 Descriptive Stats for Role: DATA ANALYST
There are 36043 tokens in the data.
There are 4890 unique tokens in the data.
There are 269862 characters in the data.
The lexical diversity is 0.136 in the data.

Top 10 most common tokens:
data: 1147
experience: 359
business: 305
work: 252
analysis: 241
skills: 206
analyst: 191
ability: 181
analytics: 173
support: 166

📊 Descriptive Stats for Role: SOFTWARE ENGINEER
There are 46117 tokens in the data.
There are 5632 unique tokens in the data.
There are 346105 characters in the data.
The lexical diversity is 0.122 in the data.

Top 10 most common tokens:
experience: 684
sof