# Job Database Analysis

This notebook reads the LinkedIn job database, summarizes the recorded job runs, and displays the 10 most recently added jobs with basic statistics.


In [1]:
# Import required libraries
import sqlite3
import pandas as pd
from pathlib import Path
import sys
from datetime import datetime

# Add project root to path.
# A notebook kernel defines no __file__, so fall back to the parent of the
# current working directory in that case.
if "__file__" in globals():
    project_root = Path(__file__).parent.parent
else:
    project_root = Path.cwd().parent

# Guard the append so re-running this cell does not pile duplicate entries
# onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from genai_job_finder.linkedin_parser.database import DatabaseManager
from genai_job_finder.linkedin_parser.models import Job, JobRun

In [2]:
# Display the resolved project root so the path can be checked by eye.
project_root

PosixPath('/home/alireza/projects/genai_job_finder')

In [3]:
# Build the path to the SQLite file inside the project's data directory.
# (Swap in project_root / "test_jobs.db" to look at the test database instead.)
db_path = project_root.joinpath("data", "jobs.db")

# Report where we are looking and whether the file is actually there.
print(
    f"Database path: {db_path}",
    f"Database exists: {db_path.exists()}",
    sep="\n",
)

# Hand the manager the path as a plain string.
db = DatabaseManager(str(db_path))

Database path: /home/alireza/projects/genai_job_finder/data/jobs.db
Database exists: True


In [4]:
# Check database contents - get basic stats.
# NOTE: `with sqlite3.connect(...)` only scopes a transaction; it never closes
# the connection, so the handle is released explicitly in `finally`.
conn = sqlite3.connect(db_path)
try:
    # Count total jobs
    total_jobs = pd.read_sql_query("SELECT COUNT(*) as count FROM jobs", conn).iloc[0][
        "count"
    ]
    print(f"Total jobs in database: {total_jobs}")

    # Count job runs
    total_runs = pd.read_sql_query("SELECT COUNT(*) as count FROM job_runs", conn).iloc[
        0
    ]["count"]
    print(f"Total job runs: {total_runs}")

    # Fetch the latest runs unconditionally so `recent_runs` is always defined
    # (an empty frame when there are no runs) for the display below; the
    # previous conditional assignment raised NameError on an empty table.
    recent_runs = pd.read_sql_query(
        """
        SELECT id, search_query, location_filter, status, job_count, created_at
        FROM job_runs
        ORDER BY created_at DESC
        LIMIT 5
        """,
        conn,
    )
finally:
    conn.close()

if total_runs > 0:
    print("\nRecent job runs:")
else:
    print("\nNo job runs recorded yet.")
recent_runs

Total jobs in database: 865
Total job runs: 19

Recent job runs:


Unnamed: 0,id,search_query,location_filter,status,job_count,created_at
0,19,data scientist,United State,completed,180,2025-08-21 03:44:40
1,18,data engieer,Remote,completed,180,2025-08-21 03:42:39
2,17,data scientist,san antonio,completed,120,2025-08-21 03:37:35
3,16,data scientist,san antonio,completed,120,2025-08-21 03:37:18
4,15,senior data scientist,san antonio,completed,14,2025-08-21 03:37:06


In [9]:
# Get the most recent jobs.
# TOP_N drives both the SQL LIMIT and the printed heading, so the two cannot
# drift apart (the heading used to claim 100 rows while the query returned 10).
TOP_N = 10

query = """
SELECT
    job_id,
    title,
    company,
    location,
    salary_range,
    job_type,
    experience_level,
    remote_option,
    easy_apply,
    posted_date,
    created_at,
    linkedin_url
FROM jobs
--where company = "USAA"
ORDER BY created_at DESC
LIMIT ?
"""

# `with sqlite3.connect(...)` does not close the connection, so close it
# explicitly once the frame has been loaded.
conn = sqlite3.connect(db_path)
try:
    # The row limit is bound as a qmark parameter rather than formatted into the SQL.
    top_jobs_df = pd.read_sql_query(query, conn, params=(TOP_N,))
finally:
    conn.close()

print(f"Top {TOP_N} most recent jobs:")
print(f"Shape: {top_jobs_df.shape}")
# The query already caps the row count, so the whole frame is safe to display.
top_jobs_df

Top 100 most recent jobs:
Shape: (10, 12)


Unnamed: 0,job_id,title,company,location,salary_range,job_type,experience_level,remote_option,easy_apply,posted_date,created_at,linkedin_url
0,4288967186,GEN AI Sr Data Scientist/Data Scientist,The Hartford,"Columbus, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/gen-ai-sr-d...
1,4283651144,Lead Data Scientist,BMW Group US,"Columbus, OH",,,senior,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/lead-data-s...
2,4285754650,AI Software Engineer,"V-Soft Consulting Group, Inc.","Cincinnati, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/ai-software...
3,4264537575,Data Scientist,Booz Allen Hamilton,"Dayton, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/data-scient...
4,4288324024,Data Scientist - R&D Impact Portfolio Manageme...,Owens Corning,"Granville, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/data-scient...
5,4287962292,IT Data Scientist - Level 2,Cintas,"Mason, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/it-data-sci...
6,4285049867,Associate Scientist/Engineer,Procter & Gamble,"Mason, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/associate-s...
7,4288937519,Applied AI Engineer & Researcher,Speechify,"Columbus, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/applied-ai-...
8,4285052755,Associate Scientist/Engineer,Procter & Gamble,"St Bernard, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/associate-s...
9,4288940227,"AI Engineer & Researcher, Inference",Speechify,"Columbus, OH",,,,0,0,,2025-08-21 03:44:42,https://www.linkedin.com/jobs/view/ai-engineer...


In [6]:
# Display detailed information for each job.
def _has_value(value) -> bool:
    """Return True when `value` is neither missing (None/NaN) nor falsy ('' or 0).

    Plain truthiness is not enough here: NaN is truthy, so a missing value in
    a numeric column would otherwise be reported as present.
    """
    return bool(pd.notna(value) and value)


if not top_jobs_df.empty:
    print("=" * 80)
    print("DETAILED JOB LISTINGS")
    print("=" * 80)

    # Number the jobs by position rather than by index label so the numbering
    # stays 1..N even if the frame's index is not a default RangeIndex.
    for position, (_, job) in enumerate(top_jobs_df.iterrows(), start=1):
        print(f"\n📋 JOB #{position}")
        print(f"Title: {job['title']}")
        print(f"Company: {job['company']}")
        print(f"Location: {job['location']}")

        # Optional fields are printed only when they carry a real value.
        if _has_value(job["salary_range"]):
            print(f"💰 Salary: {job['salary_range']}")

        if _has_value(job["job_type"]):
            print(f"📝 Type: {job['job_type']}")

        if _has_value(job["experience_level"]):
            print(f"🎯 Level: {job['experience_level']}")

        # The flag lines appear only for jobs where the flag is set, so the
        # printed answer is always "Yes".
        if _has_value(job["remote_option"]):
            print("🏠 Remote: Yes")

        if _has_value(job["easy_apply"]):
            print("⚡ Easy Apply: Yes")

        if _has_value(job["posted_date"]):
            print(f"📅 Posted: {job['posted_date']}")

        if _has_value(job["linkedin_url"]):
            print(f"🔗 URL: {job['linkedin_url']}")

        print("-" * 60)
else:
    print("No jobs found in database. Run the parser first to collect job data.")

DETAILED JOB LISTINGS

📋 JOB #1
Title: GEN AI Sr Data Scientist/Data Scientist
Company: The Hartford
Location: Columbus, OH
🔗 URL: https://www.linkedin.com/jobs/view/gen-ai-sr-data-scientist-data-scientist-at-the-hartford-4288967186?position=12&pageNum=0&refId=bTXgIo0ZzvCjvYZZJRnIJA%3D%3D&trackingId=PPn5DASjpRLpxXE81iycYw%3D%3D
------------------------------------------------------------

📋 JOB #2
Title: Lead Data Scientist
Company: BMW Group US
Location: Columbus, OH
🎯 Level: senior
🔗 URL: https://www.linkedin.com/jobs/view/lead-data-scientist-at-bmw-group-us-4283651144?position=13&pageNum=0&refId=bTXgIo0ZzvCjvYZZJRnIJA%3D%3D&trackingId=X7rofxEDX3yKzvJmI0Y%2BZw%3D%3D
------------------------------------------------------------

📋 JOB #3
Title: AI Software Engineer
Company: V-Soft Consulting Group, Inc.
Location: Cincinnati, OH
🔗 URL: https://www.linkedin.com/jobs/view/ai-software-engineer-at-v-soft-consulting-group-inc-4285754650?position=14&pageNum=0&refId=bTXgIo0ZzvCjvYZZJRnIJA%3D%3

In [7]:
# Job statistics and insights: frequency breakdowns plus remote / easy-apply shares.
def _print_counts(heading, counts):
    """Print a heading followed by one bullet line per (label, count) pair."""
    print(heading)
    for label, n in counts.items():
        print(f"  • {label}: {n} job(s)")


if not top_jobs_df.empty:
    n_jobs = len(top_jobs_df)

    print("📊 JOB STATISTICS")
    print("=" * 50)

    # Five most common companies
    company_counts = top_jobs_df["company"].value_counts()
    _print_counts("\n🏢 Top Companies in Results:", company_counts.head())

    # Five most common locations
    location_counts = top_jobs_df["location"].value_counts()
    _print_counts("\n📍 Top Locations:", location_counts.head())

    # Job types: listed in full, and only when at least one non-null value exists
    if "job_type" in top_jobs_df.columns:
        job_type_counts = top_jobs_df["job_type"].value_counts(dropna=True)
        if not job_type_counts.empty:
            _print_counts("\n💼 Job Types:", job_type_counts)

    # Share of jobs flagged as remote (0 when the column is absent)
    if "remote_option" in top_jobs_df.columns:
        remote_count = top_jobs_df["remote_option"].sum()
    else:
        remote_count = 0
    print(
        f"\n🏠 Remote Jobs: {remote_count} out of {n_jobs} ({remote_count / n_jobs * 100:.1f}%)"
    )

    # Share of jobs flagged as easy-apply (0 when the column is absent)
    if "easy_apply" in top_jobs_df.columns:
        easy_apply_count = top_jobs_df["easy_apply"].sum()
    else:
        easy_apply_count = 0
    print(
        f"⚡ Easy Apply Jobs: {easy_apply_count} out of {n_jobs} ({easy_apply_count / n_jobs * 100:.1f}%)"
    )

📊 JOB STATISTICS

🏢 Top Companies in Results:
  • Procter & Gamble: 2 job(s)
  • Speechify: 2 job(s)
  • The Hartford: 1 job(s)
  • BMW Group US: 1 job(s)
  • Booz Allen Hamilton: 1 job(s)

📍 Top Locations:
  • Columbus, OH: 4 job(s)
  • Mason, OH: 2 job(s)
  • Cincinnati, OH: 1 job(s)
  • Dayton, OH: 1 job(s)
  • Granville, OH: 1 job(s)

🏠 Remote Jobs: 0 out of 10 (0.0%)
⚡ Easy Apply Jobs: 0 out of 10 (0.0%)


In [8]:
# Optional: Get jobs with salary information.
salary_query = """
SELECT title, company, salary_range, location
FROM jobs
WHERE salary_range IS NOT NULL AND salary_range != ''
ORDER BY created_at DESC
LIMIT 10
"""

# `with sqlite3.connect(...)` only scopes a transaction and leaves the
# connection open, so close it explicitly once the frame has been loaded.
conn = sqlite3.connect(db_path)
try:
    salary_jobs = pd.read_sql_query(salary_query, conn)
finally:
    conn.close()

if not salary_jobs.empty:
    print("💰 JOBS WITH SALARY INFORMATION")
    print("=" * 50)
    # Number entries by position so the list reads 1..N regardless of index labels.
    for position, (_, job) in enumerate(salary_jobs.iterrows(), start=1):
        print(f"{position}. {job['title']} at {job['company']}")
        print(f"   💰 {job['salary_range']} | 📍 {job['location']}")
        print()
else:
    print("No jobs with salary information found.")

No jobs with salary information found.
