# Job Database Analysis

This notebook reads the LinkedIn job database (SQLite), lists the 10 most recently collected jobs, and prints summary statistics about them.


In [2]:
# Import required libraries
import sqlite3
import sys
from contextlib import closing
from datetime import datetime
from pathlib import Path

import pandas as pd

# Add project root to path
# When run as a script, __file__ exists and the root is two levels above this file.
# Notebook kernels usually do not define __file__, so fall back to the parent of
# the working directory (assumes the kernel was started one level below the
# project root -- TODO confirm for your setup).
project_root = (
    Path(__file__).resolve().parent.parent
    if "__file__" in globals()
    else Path.cwd().parent
)
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from linkedin_parser.database import DatabaseManager
from linkedin_parser.models import Job, JobRun

In [3]:
# Echo the resolved project root so the reader can confirm which directory was picked.
project_root

PosixPath('/home/alireza/projects/genai_job_finder')

In [4]:
# Locate the SQLite file under the project's data directory.
# (For a scratch run, point db_path at project_root / "test_jobs.db" instead.)
db_path = project_root.joinpath("data", "jobs.db")

# Report where we are looking and whether the file is actually there.
print(
    f"Database path: {db_path}",
    f"Database exists: {db_path.exists()}",
    sep="\n",
)

# Project-level manager object, constructed from the string form of the path.
db = DatabaseManager(str(db_path))

Database path: /home/alireza/projects/genai_job_finder/data/jobs.db
Database exists: True


In [12]:
# Check database contents - get basic stats
# sqlite3's own context manager only commits/rolls back the transaction; it does
# not close the connection, so closing() is used to release the handle.
with closing(sqlite3.connect(db_path)) as conn:
    # Scalar counts: a plain cursor is enough, no DataFrame needed.
    total_jobs = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
    total_runs = conn.execute("SELECT COUNT(*) FROM job_runs").fetchone()[0]

    # Always run the query so `recent_runs` is defined even when the table is
    # empty (the result is then an empty frame with the expected columns).
    recent_runs = pd.read_sql_query(
        """
        SELECT id, search_query, location_filter, status, job_count, created_at
        FROM job_runs
        ORDER BY created_at DESC
        LIMIT 5
        """,
        conn,
    )

print(f"Total jobs in database: {total_jobs}")
print(f"Total job runs: {total_runs}")

if recent_runs.empty:
    print("\nNo job runs recorded yet.")
else:
    print("\nRecent job runs:")

# Last expression of the cell: rendered as a rich table by the notebook.
recent_runs

Total jobs in database: 191
Total job runs: 6

Recent job runs:


Unnamed: 0,id,search_query,location_filter,status,job_count,created_at
0,6,senior data scientist,San Antonio,completed,12,2025-08-18 04:33:58
1,5,senior data scientist,San Antonio,completed,4,2025-08-18 04:22:55
2,4,senior data scientis,remote,completed,59,2025-08-18 04:10:57
3,3,Machine Learning Engineer,united states (Remote),completed,180,2025-08-18 03:38:31
4,2,senior data scientist,united states (Remote),completed,180,2025-08-18 03:38:19


In [6]:
# Get top 10 most recent jobs
# Rows are ordered by created_at (newest first) and capped at 10 by the query.
# NOTE(review): the saved output shows the same job_id appearing once per run
# that collected it; de-duplicate on job_id if unique postings are wanted.
# To restrict to one company, add e.g. `WHERE company = ?` and pass
# params=("USAA",) to read_sql_query.
query = """
SELECT
    job_id,
    title,
    company,
    location,
    salary_range,
    job_type,
    experience_level,
    remote_option,
    easy_apply,
    posted_date,
    created_at,
    linkedin_url
FROM jobs
ORDER BY created_at DESC
LIMIT 10
"""

# closing() guarantees the connection is released; sqlite3's own `with`
# support only manages the transaction.
with closing(sqlite3.connect(db_path)) as conn:
    top_jobs_df = pd.read_sql_query(query, conn)

print("Top 10 most recent jobs:")
print(f"Shape: {top_jobs_df.shape}")

# The query already limits the result to 10 rows, so show the frame as-is.
top_jobs_df

Top 10 most recent jobs:
Shape: (10, 12)


Unnamed: 0,job_id,title,company,location,salary_range,job_type,experience_level,remote_option,easy_apply,posted_date,created_at,linkedin_url
0,4286806216,Senior Data Scientist,Compri Consulting,"San Antonio, Texas Metropolitan Area",,,senior,0,0,,2025-08-18 04:33:59,https://www.linkedin.com/jobs/view/senior-data...
1,4283992413,Data Scientist - Machine Learning & AI,Kforce Inc,"San Antonio, TX",,,,0,0,,2025-08-18 04:33:59,https://www.linkedin.com/jobs/view/data-scient...
2,4257985114,Data Scientist I,Knowesis Inc.,"Lackland Air Force Base, TX",,,,0,0,,2025-08-18 04:33:59,https://www.linkedin.com/jobs/view/data-scient...
3,4284375070,Data Scientist Senior – Actuary & Analytics,USAA,"San Antonio, TX",,,senior,0,0,,2025-08-18 04:33:59,https://www.linkedin.com/jobs/view/data-scient...
4,4257985114,Data Scientist I,Knowesis Inc.,"Lackland Air Force Base, TX",,,,0,0,,2025-08-18 04:22:56,https://www.linkedin.com/jobs/view/data-scient...
5,4284375070,Data Scientist Senior – Actuary & Analytics,USAA,"San Antonio, TX",,,senior,0,0,,2025-08-18 04:22:56,https://www.linkedin.com/jobs/view/data-scient...
6,4286806216,Senior Data Scientist,Compri Consulting,"San Antonio, Texas Metropolitan Area",,,senior,0,0,,2025-08-18 04:22:55,https://www.linkedin.com/jobs/view/senior-data...
7,4283992413,Data Scientist - Machine Learning & AI,Kforce Inc,"San Antonio, TX",,,,0,0,,2025-08-18 04:22:55,https://www.linkedin.com/jobs/view/data-scient...
8,4287327087,Senior Data Architect - Big Data (AWS) - Canada,Rackspace Technology,Canada,,,senior,0,0,,2025-08-18 04:10:59,https://ca.linkedin.com/jobs/view/senior-data-...
9,4287760631,Account Technology Strategist,Microsoft,Kuwait City Metropolitan Area,,,,0,0,,2025-08-18 04:10:59,https://kw.linkedin.com/jobs/view/account-tech...


In [7]:
# Display detailed information for each job


def _present(value) -> bool:
    """Return True when `value` is neither missing (None/NaN) nor falsy ('' or 0).

    A plain truthiness test is not enough: NaN is truthy, so a NULL that pandas
    surfaced as NaN would otherwise be printed as if it were real data.
    """
    return bool(pd.notna(value) and value)


# (column, label, is_flag) in display order. Flag columns are shown as "Yes"
# instead of their raw value; they are only printed when set.
_OPTIONAL_FIELDS = (
    ("salary_range", "💰 Salary", False),
    ("job_type", "📝 Type", False),
    ("experience_level", "🎯 Level", False),
    ("remote_option", "🏠 Remote", True),
    ("easy_apply", "⚡ Easy Apply", True),
    ("posted_date", "📅 Posted", False),
    ("linkedin_url", "🔗 URL", False),
)

if top_jobs_df.empty:
    print("No jobs found in database. Run the parser first to collect job data.")
else:
    print("=" * 80)
    print("DETAILED JOB LISTINGS")
    print("=" * 80)

    # enumerate() numbers the listings 1..N regardless of the frame's index.
    for position, job in enumerate(top_jobs_df.to_dict("records"), start=1):
        print(f"\n📋 JOB #{position}")
        print(f"Title: {job['title']}")
        print(f"Company: {job['company']}")
        print(f"Location: {job['location']}")

        for column, label, is_flag in _OPTIONAL_FIELDS:
            value = job[column]
            if _present(value):
                print(f"{label}: {'Yes' if is_flag else value}")

        print("-" * 60)

DETAILED JOB LISTINGS

📋 JOB #1
Title: Senior Data Scientist
Company: Compri Consulting
Location: San Antonio, Texas Metropolitan Area
🎯 Level: senior
🔗 URL: https://www.linkedin.com/jobs/view/senior-data-scientist-at-compri-consulting-4286806216?position=1&pageNum=0&refId=12aCKT0iXwi5KuGKbfUGug%3D%3D&trackingId=dO27Gr0AdOgvk8lvR52fSg%3D%3D
------------------------------------------------------------

📋 JOB #2
Title: Data Scientist - Machine Learning & AI
Company: Kforce Inc
Location: San Antonio, TX
🔗 URL: https://www.linkedin.com/jobs/view/data-scientist-machine-learning-ai-at-kforce-inc-4283992413?position=2&pageNum=0&refId=12aCKT0iXwi5KuGKbfUGug%3D%3D&trackingId=KrfSomjghQQfeQtJSLqX9w%3D%3D
------------------------------------------------------------

📋 JOB #3
Title: Data Scientist I
Company: Knowesis Inc.
Location: Lackland Air Force Base, TX
🔗 URL: https://www.linkedin.com/jobs/view/data-scientist-i-at-knowesis-inc-4257985114?position=3&pageNum=0&refId=12aCKT0iXwi5KuGKbfUGug%3D%3

In [8]:
# Job statistics and insights


def _print_value_counts(heading, counts):
    """Print `heading`, then one bullet line per (value, count) pair in `counts`."""
    print(heading)
    for value, count in counts.items():
        print(f"  • {value}: {count} job(s)")


def _flag_total(frame, column):
    """Sum a 0/1 flag column as an int.

    Missing or non-numeric entries count as 0; an absent column yields 0.
    """
    if column not in frame.columns:
        return 0
    return int(pd.to_numeric(frame[column], errors="coerce").fillna(0).sum())


if not top_jobs_df.empty:
    # Non-zero inside this branch, so the percentage divisions below are safe.
    n_jobs = len(top_jobs_df)

    print("📊 JOB STATISTICS")
    print("=" * 50)

    # Company distribution (five most frequent)
    company_counts = top_jobs_df["company"].value_counts()
    _print_value_counts("\n🏢 Top Companies in Results:", company_counts.head())

    # Location distribution (five most frequent)
    location_counts = top_jobs_df["location"].value_counts()
    _print_value_counts("\n📍 Top Locations:", location_counts.head())

    # Job type distribution; the section is skipped when every value is missing.
    if "job_type" in top_jobs_df.columns:
        job_type_counts = top_jobs_df["job_type"].value_counts(dropna=True)
        if not job_type_counts.empty:
            _print_value_counts("\n💼 Job Types:", job_type_counts)

    # Remote options
    remote_count = _flag_total(top_jobs_df, "remote_option")
    print(
        f"\n🏠 Remote Jobs: {remote_count} out of {n_jobs} ({remote_count / n_jobs * 100:.1f}%)"
    )

    # Easy apply
    easy_apply_count = _flag_total(top_jobs_df, "easy_apply")
    print(
        f"⚡ Easy Apply Jobs: {easy_apply_count} out of {n_jobs} ({easy_apply_count / n_jobs * 100:.1f}%)"
    )

📊 JOB STATISTICS

🏢 Top Companies in Results:
  • Compri Consulting: 2 job(s)
  • Kforce Inc: 2 job(s)
  • Knowesis Inc.: 2 job(s)
  • USAA: 2 job(s)
  • Rackspace Technology: 1 job(s)

📍 Top Locations:
  • San Antonio, TX: 4 job(s)
  • San Antonio, Texas Metropolitan Area: 2 job(s)
  • Lackland Air Force Base, TX: 2 job(s)
  • Canada: 1 job(s)
  • Kuwait City Metropolitan Area: 1 job(s)

🏠 Remote Jobs: 0 out of 10 (0.0%)
⚡ Easy Apply Jobs: 0 out of 10 (0.0%)


In [9]:
# Optional: Get jobs with salary information
# Only rows whose salary_range is neither NULL nor the empty string are kept,
# newest first, capped at 10.
salary_query = """
SELECT title, company, salary_range, location
FROM jobs
WHERE salary_range IS NOT NULL AND salary_range != ''
ORDER BY created_at DESC
LIMIT 10
"""

# closing() releases the connection; sqlite3's own `with` support only
# commits/rolls back and leaves the connection open.
with closing(sqlite3.connect(db_path)) as conn:
    salary_jobs = pd.read_sql_query(salary_query, conn)

if salary_jobs.empty:
    print("No jobs with salary information found.")
else:
    print("💰 JOBS WITH SALARY INFORMATION")
    print("=" * 50)
    # enumerate() numbers the entries 1..N independently of the frame's index.
    for position, job in enumerate(salary_jobs.to_dict("records"), start=1):
        print(f"{position}. {job['title']} at {job['company']}")
        print(f"   💰 {job['salary_range']} | 📍 {job['location']}")
        print()

No jobs with salary information found.
