# Enhanced LinkedIn Job Database Analysis

This notebook analyzes the LinkedIn job database with the new enhanced parser that includes:

- **20-column output structure** (with integrated company information)
- **Company intelligence** with automatic extraction of company size, followers, and industry
- **Location intelligence** with automatic extraction
- **Work type classification** (Remote/Hybrid/On-site)
- **Enhanced data model** with comprehensive job and company information

Run `make run-parser` first to collect fresh job data with location and company intelligence.


In [37]:
# Import required libraries
import sqlite3
import pandas as pd
from pathlib import Path
import sys
from datetime import datetime

# Add project root to path
project_root = (
    Path(__file__).parent.parent if "__file__" in globals() else Path.cwd().parent
)
sys.path.append(str(project_root))

from genai_job_finder.linkedin_parser.database import DatabaseManager
from genai_job_finder.linkedin_parser.models import Job, JobRun

In [38]:
project_root

PosixPath('/home/alireza/projects/genai_job_finder')

In [39]:
# Initialize database connection
db_path = project_root / "data" / "jobs.db"
# db_path = project_root / "test_jobs.db"

print(f"Database path: {db_path}")
print(f"Database exists: {db_path.exists()}")

# Create database manager
db = DatabaseManager(str(db_path))

Database path: /home/alireza/projects/genai_job_finder/data/jobs.db
Database exists: True


In [40]:
# Check database contents - get basic stats
with sqlite3.connect(db_path) as conn:
    # Count total jobs
    total_jobs = pd.read_sql_query("SELECT COUNT(*) as count FROM jobs", conn).iloc[0][
        "count"
    ]
    print(f"Total jobs in database: {total_jobs}")

    # Count job runs
    total_runs = pd.read_sql_query("SELECT COUNT(*) as count FROM job_runs", conn).iloc[
        0
    ]["count"]
    print(f"Total job runs: {total_runs}")

    # Show recent runs
    if total_runs > 0:
        recent_runs = pd.read_sql_query(
            """
            SELECT id, search_query, location_filter, status, job_count, created_at 
            FROM job_runs 
            ORDER BY created_at DESC 
            LIMIT 5
        """,
            conn,
        )
        print("\nRecent job runs:")
recent_runs

Total jobs in database: 256
Total job runs: 23

Recent job runs:


Unnamed: 0,id,search_query,location_filter,status,job_count,created_at
0,23,data scientist,San Antonio,completed,9,2025-09-01 04:32:27
1,22,data scientist,San Antonio,completed,9,2025-09-01 04:22:58
2,21,data scientist,San Antonio,completed,8,2025-08-31 23:18:11
3,20,data scientist,San Antonio,pending,0,2025-08-31 23:13:38
4,19,data scientist,San Antonio,completed,8,2025-08-31 23:07:50


In [42]:
# Get top 20 most recent jobs with enhanced data structure including company information
with sqlite3.connect(db_path) as conn:
    query = """
    SELECT 
        id,
        company,
        company_size,
        company_followers,
        company_industry,
        title,
        location,
        work_location_type,
        level,
        salary_range,
        employment_type,
        job_function,
        industries,
        posted_time,
        applicants,
        job_id,
        date,
        parsing_link,
        job_posting_link,
        created_at
    FROM jobs 
    ORDER BY created_at DESC 
    LIMIT 20
    """

    top_jobs_df = pd.read_sql_query(query, conn)

print(f"📊 Enhanced Job Data Analysis with Company Intelligence")
print(f"Database contains: {len(top_jobs_df)} recent jobs")
print(f"Columns: {top_jobs_df.shape[1]} (20-column structure with company info)")
print(f"\nColumn names: {list(top_jobs_df.columns)}")
top_jobs_df.head(20)

📊 Enhanced Job Data Analysis with Company Intelligence
Database contains: 20 recent jobs
Columns: 20 (20-column structure with company info)

Column names: ['id', 'company', 'company_size', 'company_followers', 'company_industry', 'title', 'location', 'work_location_type', 'level', 'salary_range', 'employment_type', 'job_function', 'industries', 'posted_time', 'applicants', 'job_id', 'date', 'parsing_link', 'job_posting_link', 'created_at']


Unnamed: 0,id,company,company_size,company_followers,company_industry,title,location,work_location_type,level,salary_range,employment_type,job_function,industries,posted_time,applicants,job_id,date,parsing_link,job_posting_link,created_at
0,5c3aec75-f58d-45a1-af1f-9eb7901f2281,Jobs via Dice,1 employees,173 followers,,Lead,"San Antonio, TX",On-site,Mid-Senior level,$45.00/hr - $60.00/hr,Full-time,Management,Business Consulting and Services,4 hours ago,,4293521357,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/lead-at-job...,2025-09-01 04:34:04
1,d95f8708-f301-4e45-a27e-3abc2e782dcc,Amazon Web Services (AWS),492 employees,363 followers,,"Cleared Data Center Mechanical Field Engineer,...","San Antonio, TX",On-site,Not Applicable,,Full-time,"Information Technology, Consulting, and Engine...",IT Services and IT Consulting,15 hours ago,37 applicants,4259080606,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/cleared-dat...,2025-09-01 04:33:56
2,38454989-183a-47d4-8fb8-7d7dea095fdf,Aha!,,,,Sr. Platform Engineer,"San Antonio, TX",Remote,Mid-Senior level,,Full-time,Engineering and Information Technology,Software Development,22 hours ago,,4293451536,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/sr-platform...,2025-09-01 04:33:47
3,a1f79df7-e2f3-4eca-8e0a-b42b889d2937,Jobs via Dice,1 employees,173 followers,,Web Developer,"San Antonio, TX",On-site,Entry level,,Full-time,Information Technology,Software Development,22 hours ago,,4293366035,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/web-develop...,2025-09-01 04:33:36
4,63c3f731-bb47-45fc-b5fb-4085c19f2b6e,CyrusOne,"501-1,000 employees","52,863 followers",IT Services and IT Consulting,Senior Data Center Capacity Engineer,"San Antonio, TX",Hybrid,Mid-Senior level,,Full-time,Information Technology,IT Services and IT Consulting,18 hours ago,,4267490741,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/senior-data...,2025-09-01 04:33:25
5,07586e9e-7e8a-4244-b986-1d414f5dec28,Lensa,,,,"Internships in Data Science, Math, Statistics ...","San Antonio, TX",Remote,Internship,,Internship,Engineering and Information Technology,Internet Publishing,22 hours ago,,4293448846,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/internships...,2025-09-01 04:33:15
6,459f6ae9-18df-450f-96fa-fd265d00ca61,Jobs via Dice,1 employees,173 followers,,Senior Back-End Developer,"San Antonio, TX",On-site,Mid-Senior level,"$145,000.00/yr - $170,000.00/yr",Full-time,Information Technology,Software Development,12 hours ago,,4291579961,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/senior-back...,2025-09-01 04:33:03
7,f2fc0d6d-7f62-4e1d-bf91-55185c6b71d3,Aha!,,,,Sr. Security Engineer (Ruby on Rails experienc...,"San Antonio, TX",Remote,Mid-Senior level,,Full-time,Information Technology,Software Development,22 hours ago,,4293443790,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/sr-security...,2025-09-01 04:32:52
8,78b052e8-f1a9-4d76-ba20-e37c9206f328,Jobot,"501-1,000 employees","3,308,869 followers",Staffing and Recruiting,REMOTE Sr. Java Backend Developer (Recent heal...,"San Antonio, TX",Remote,Not Applicable,"$160,000.00/yr - $200,000.00/yr",Full-time,Engineering and Information Technology,"Software Development, Technology, Information ...",13 hours ago,,4291714459,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/remote-sr-j...,2025-09-01 04:32:40
9,b8f23846-f582-4a3f-ae8b-c468cf5f575c,Lensa,,,,"Internships in Data Science, Math, Statistics ...","San Antonio, TX",Remote,Internship,,Internship,Engineering and Information Technology,Internet Publishing,22 hours ago,,4293448846,2025-08-31,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/internships...,2025-09-01 04:24:28


In [31]:
# Display detailed information for each job with enhanced data including company info (limited output)
if not top_jobs_df.empty:
    print("=" * 80)
    print("ENHANCED JOB LISTINGS WITH LOCATION & COMPANY INTELLIGENCE")
    print("=" * 80)

    # Limit to first 5 jobs to prevent excessive output
    display_limit = min(5, len(top_jobs_df))
    print(f"Showing first {display_limit} of {len(top_jobs_df)} jobs:\n")

    for idx in range(display_limit):
        job = top_jobs_df.iloc[idx]
        print(f"📋 JOB #{idx + 1}")
        print(f"Title: {job['title']}")
        print(f"Company: {job['company']}")

        # NEW: Company information display
        company_info = []
        if pd.notna(job["company_size"]) and job["company_size"]:
            company_info.append(f"👥 Size: {job['company_size']}")
        if pd.notna(job["company_followers"]) and job["company_followers"]:
            company_info.append(f"📊 Followers: {job['company_followers']}")
        if pd.notna(job["company_industry"]) and job["company_industry"]:
            company_info.append(f"🏭 Industry: {job['company_industry']}")

        if company_info:
            print(f"🏢 Company Info: {' | '.join(company_info)}")

        # Enhanced location information
        if pd.notna(job["location"]) and job["location"]:
            print(f"📍 Location: {job['location']}")

        if pd.notna(job["work_location_type"]) and job["work_location_type"]:
            # Use emoji for work type
            work_type_emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}
            emoji = work_type_emoji.get(job["work_location_type"], "📍")
            print(f"{emoji} Work Type: {job['work_location_type']}")

        if pd.notna(job["level"]) and job["level"]:
            print(f"🎯 Level: {job['level']}")

        if pd.notna(job["salary_range"]) and job["salary_range"]:
            print(f"💰 Salary: {job['salary_range']}")

        if pd.notna(job["employment_type"]) and job["employment_type"]:
            print(f"📝 Employment: {job['employment_type']}")

        if pd.notna(job["job_function"]) and job["job_function"]:
            print(f"⚙️ Function: {job['job_function']}")

        if pd.notna(job["industries"]) and job["industries"]:
            print(f"🏭 Industries: {job['industries']}")

        if pd.notna(job["applicants"]) and job["applicants"]:
            print(f"👥 Applicants: {job['applicants']}")

        if pd.notna(job["posted_time"]) and job["posted_time"]:
            print(f"📅 Posted: {job['posted_time']}")

        if pd.notna(job["job_posting_link"]) and job["job_posting_link"]:
            print(f"🔗 LinkedIn URL: {job['job_posting_link']}")

        print("-" * 60)

    if len(top_jobs_df) > display_limit:
        print(f"\n... and {len(top_jobs_df) - display_limit} more jobs in the database")
        print("💡 Tip: Run the statistics cell below for a summary of all jobs")

else:
    print("No jobs found in database. Run 'make run-parser' first to collect job data.")

ENHANCED JOB LISTINGS WITH LOCATION & COMPANY INTELLIGENCE
Showing first 5 of 20 jobs:

📋 JOB #1
Title: Senior Data Center Capacity Engineer
Company: CyrusOne
🏢 Company Info: 👥 Size: 822 employees | 📊 Followers: 52,862 followers | 🏭 Industry: CyrusOne
📍 Location: San Antonio, TX
🔄 Work Type: Hybrid
🎯 Level: Mid-Senior level
📝 Employment: Full-time
⚙️ Function: Information Technology
🏭 Industries: IT Services and IT Consulting
👥 Applicants: N/A
📅 Posted: 13 hours ago
🔗 LinkedIn URL: https://www.linkedin.com/jobs/view/senior-data-center-capacity-engineer-at-cyrusone-4267490741?trk=public_jobs_topcard-title
------------------------------------------------------------
📋 JOB #2
Title: Cleared Data Center Mechanical Field Engineer, ADC Field Engineering
Company: Amazon Web Services (AWS)
🏢 Company Info: 👥 Size: 492 employees | 📊 Followers: 363 followers
📍 Location: San Antonio, TX
🏢 Work Type: On-site
🎯 Level: Not Applicable
📝 Employment: Full-time
⚙️ Function: Information Technology, Consul

In [32]:
# Enhanced job statistics with location and company intelligence
if not top_jobs_df.empty:
    print("📊 ENHANCED JOB STATISTICS WITH LOCATION & COMPANY INTELLIGENCE")
    print("=" * 70)

    # Company distribution
    company_counts = top_jobs_df["company"].value_counts()
    print(f"\n🏢 Top Companies:")
    for company, count in company_counts.head().items():
        print(f"  • {company}: {count} job(s)")

    # NEW: Company intelligence analysis
    print(f"\n🏢 COMPANY INTELLIGENCE ANALYSIS:")

    # Company size analysis
    company_size_data = top_jobs_df["company_size"].dropna()
    if not company_size_data.empty:
        print(
            f"  👥 Company Size Info Available: {len(company_size_data)}/{len(top_jobs_df)} jobs ({len(company_size_data)/len(top_jobs_df)*100:.1f}%)"
        )
        print(f"     Sample sizes: {', '.join(company_size_data.head(3).astype(str))}")
    else:
        print(f"  👥 Company Size Info: Not available (run parser to collect)")

    # Company followers analysis
    company_followers_data = top_jobs_df["company_followers"].dropna()
    if not company_followers_data.empty:
        print(
            f"  📊 Company Followers Info: {len(company_followers_data)}/{len(top_jobs_df)} jobs ({len(company_followers_data)/len(top_jobs_df)*100:.1f}%)"
        )
        print(
            f"     Sample followers: {', '.join(company_followers_data.head(3).astype(str))}"
        )
    else:
        print(f"  📊 Company Followers Info: Not available (run parser to collect)")

    # Company industry analysis
    company_industry_data = top_jobs_df["company_industry"].dropna()
    if not company_industry_data.empty:
        print(
            f"  🏭 Company Industry Info: {len(company_industry_data)}/{len(top_jobs_df)} jobs ({len(company_industry_data)/len(top_jobs_df)*100:.1f}%)"
        )
        industry_counts = company_industry_data.value_counts().head(3)
        print(f"     Top industries: {', '.join(industry_counts.index)}")
    else:
        print(f"  🏭 Company Industry Info: Not available (run parser to collect)")

    # Location distribution (enhanced)
    location_counts = top_jobs_df["location"].value_counts()
    print(f"\n📍 Top Locations:")
    for location, count in location_counts.head().items():
        print(f"  • {location}: {count} job(s)")

    # Work location type analysis
    if "work_location_type" in top_jobs_df.columns:
        work_type_counts = top_jobs_df["work_location_type"].value_counts(dropna=True)
        print(f"\n🏠 Work Location Types (Location Intelligence):")
        for work_type, count in work_type_counts.items():
            emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
                work_type, "📍"
            )
            percentage = count / len(top_jobs_df) * 100
            print(f"  {emoji} {work_type}: {count} job(s) ({percentage:.1f}%)")

    # Experience level distribution
    if "level" in top_jobs_df.columns:
        level_counts = top_jobs_df["level"].value_counts(dropna=True)
        if not level_counts.empty:
            print(f"\n🎯 Experience Levels:")
            for level, count in level_counts.items():
                print(f"  • {level}: {count} job(s)")

    # Employment type distribution
    if "employment_type" in top_jobs_df.columns:
        employment_counts = top_jobs_df["employment_type"].value_counts(dropna=True)
        if not employment_counts.empty:
            print(f"\n💼 Employment Types:")
            for emp_type, count in employment_counts.items():
                print(f"  • {emp_type}: {count} job(s)")

    # Job function analysis
    if "job_function" in top_jobs_df.columns:
        function_counts = top_jobs_df["job_function"].value_counts(dropna=True)
        if not function_counts.empty:
            print(f"\n⚙️ Top Job Functions:")
            for function, count in function_counts.head().items():
                print(f"  • {function}: {count} job(s)")

    # Salary information availability
    salary_jobs = top_jobs_df["salary_range"].notna().sum()
    print(
        f"\n💰 Salary Information: {salary_jobs} out of {len(top_jobs_df)} jobs ({salary_jobs/len(top_jobs_df)*100:.1f}%)"
    )

    # Applicant information
    applicant_jobs = top_jobs_df["applicants"].notna().sum()
    print(
        f"👥 Applicant Count Available: {applicant_jobs} out of {len(top_jobs_df)} jobs ({applicant_jobs/len(top_jobs_df)*100:.1f}%)"
    )

    print(f"\n📈 Data Quality Summary:")
    print(f"  ✅ All jobs have location intelligence classification")
    print(f"  ✅ Enhanced 20-column data structure with company info")
    print(f"  ✅ Company intelligence extraction available")
    print(f"  ✅ Comprehensive job metadata available")

📊 ENHANCED JOB STATISTICS WITH LOCATION & COMPANY INTELLIGENCE

🏢 Top Companies:
  • Jobs via Dice: 5 job(s)
  • Aha!: 5 job(s)
  • Amazon Web Services (AWS): 3 job(s)
  • CyrusOne: 3 job(s)
  • Lensa: 3 job(s)

🏢 COMPANY INTELLIGENCE ANALYSIS:
  👥 Company Size Info Available: 11/20 jobs (55.0%)
     Sample sizes: 822 employees, 492 employees, 1 employees
  📊 Company Followers Info: 11/20 jobs (55.0%)
     Sample followers: 52,862 followers, 363 followers, 173 followers
  🏭 Company Industry Info: 3/20 jobs (15.0%)
     Top industries: CyrusOne, Lensa, Amazon Web Services (AWS)

📍 Top Locations:
  • San Antonio, TX: 20 job(s)

🏠 Work Location Types (Location Intelligence):
  🏢 On-site: 9 job(s) (45.0%)
  🏠 Remote: 8 job(s) (40.0%)
  🔄 Hybrid: 3 job(s) (15.0%)

🎯 Experience Levels:
  • Mid-Senior level: 12 job(s)
  • Not Applicable: 3 job(s)
  • Internship: 3 job(s)
  • Entry level: 2 job(s)

💼 Employment Types:
  • Full-time: 17 job(s)
  • Internship: 3 job(s)

⚙️ Top Job Functions:
  •

In [33]:
# Enhanced salary analysis with location and company intelligence
with sqlite3.connect(db_path) as conn:
    salary_query = """
    SELECT title, company, company_size, company_followers, company_industry,
           salary_range, location, work_location_type, level, employment_type
    FROM jobs 
    WHERE salary_range IS NOT NULL AND salary_range != ''
    ORDER BY created_at DESC
    LIMIT 15
    """

    salary_jobs = pd.read_sql_query(salary_query, conn)

if not salary_jobs.empty:
    print("💰 JOBS WITH SALARY INFORMATION + LOCATION & COMPANY INTELLIGENCE")
    print("=" * 75)
    for idx, job in salary_jobs.iterrows():
        # Work type emoji
        work_emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
            job["work_location_type"], "📍"
        )

        print(f"{idx+1:2d}. {job['title']} at {job['company']}")
        print(f"    💰 {job['salary_range']}")
        print(f"    📍 {job['location']} | {work_emoji} {job['work_location_type']}")

        # NEW: Company information display
        company_details = []
        if pd.notna(job["company_size"]) and job["company_size"]:
            company_details.append(f"👥 {job['company_size']} employees")
        if pd.notna(job["company_followers"]) and job["company_followers"]:
            company_details.append(f"📊 {job['company_followers']} followers")
        if pd.notna(job["company_industry"]) and job["company_industry"]:
            company_details.append(f"🏭 {job['company_industry']}")

        if company_details:
            print(f"    🏢 {' | '.join(company_details)}")

        if job["level"]:
            print(f"    🎯 {job['level']}")
        if job["employment_type"]:
            print(f"    📝 {job['employment_type']}")
        print()

    # Salary analysis by work type
    if "work_location_type" in salary_jobs.columns:
        print("📈 SALARY ANALYSIS BY WORK TYPE")
        print("=" * 40)
        work_type_salary = salary_jobs.groupby("work_location_type").size()
        for work_type, count in work_type_salary.items():
            emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
                work_type, "📍"
            )
            print(f"{emoji} {work_type}: {count} jobs with salary info")

    # NEW: Company size analysis for salary jobs
    print(f"\n🏢 COMPANY SIZE ANALYSIS FOR SALARY JOBS")
    print("=" * 45)
    company_size_salary = salary_jobs[salary_jobs["company_size"].notna()]
    if not company_size_salary.empty:
        print(
            f"💼 Jobs with both salary and company size data: {len(company_size_salary)}"
        )
        for idx, job in company_size_salary.head(5).iterrows():
            print(
                f"  • {job['company']}: {job['company_size']} employees | {job['salary_range']}"
            )
    else:
        print("📊 No jobs found with both salary and company size information")
        print(
            "💡 Run 'make run-parser' to collect fresh data with company intelligence"
        )

else:
    print("No jobs with salary information found.")

💰 JOBS WITH SALARY INFORMATION + LOCATION & COMPANY INTELLIGENCE
 1. Senior Back-End Developer at Jobs via Dice
    💰 $145,000.00/yr - $170,000.00/yr
    📍 San Antonio, TX | 🏢 On-site
    🏢 👥 1 employees employees | 📊 173 followers followers
    🎯 Mid-Senior level
    📝 Full-time

 2. BI Developer/Reporting Analyst at Piper Companies
    💰 $75,000.00/yr - $90,000.00/yr
    📍 San Antonio, TX | 🏢 On-site
    🎯 Mid-Senior level
    📝 Full-time

 3. Senior Back-End Developer at Jobs via Dice
    💰 $145,000.00/yr - $170,000.00/yr
    📍 San Antonio, TX | 🏢 On-site
    🏢 👥 1 employees employees | 📊 173 followers followers
    🎯 Mid-Senior level
    📝 Full-time

 4. Senior Back-End Developer at Jobs via Dice
    💰 $145,000.00/yr - $170,000.00/yr
    📍 San Antonio, TX | 🏢 On-site
    🏢 👥 1 employees employees | 📊 173 followers followers
    🎯 Mid-Senior level
    📝 Full-time

 5. BI Developer/Reporting Analyst at Piper Companies
    💰 $75,000.00/yr - $90,000.00/yr
    📍 San Antonio, TX | 🏢 On-s

In [24]:
# 🎯 LOCATION & COMPANY INTELLIGENCE SHOWCASE
print("🌍 LOCATION & COMPANY INTELLIGENCE ANALYSIS")
print("=" * 60)

with sqlite3.connect(db_path) as conn:
    # Get location intelligence statistics
    location_intel_query = """
    SELECT 
        location,
        work_location_type,
        COUNT(*) as job_count,
        GROUP_CONCAT(DISTINCT company) as companies,
        COUNT(CASE WHEN company_size IS NOT NULL THEN 1 END) as companies_with_size,
        COUNT(CASE WHEN company_industry IS NOT NULL THEN 1 END) as companies_with_industry
    FROM jobs 
    WHERE location IS NOT NULL
    GROUP BY location, work_location_type
    ORDER BY job_count DESC
    LIMIT 10
    """

    location_intel_df = pd.read_sql_query(location_intel_query, conn)

if not location_intel_df.empty:
    print("📊 Location + Work Type + Company Intelligence Distribution:")
    for idx, row in location_intel_df.iterrows():
        emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
            row["work_location_type"], "📍"
        )
        companies = row["companies"].split(",") if row["companies"] else []

        print(
            f"{emoji} {row['location']} - {row['work_location_type']}: {row['job_count']} jobs"
        )
        if len(companies) <= 3:
            print(f"    Companies: {', '.join(companies)}")
        else:
            print(
                f"    Companies: {', '.join(companies[:3])}... (+{len(companies)-3} more)"
            )

        # NEW: Company intelligence stats
        company_intel_info = []
        if row["companies_with_size"] > 0:
            company_intel_info.append(f"👥 {row['companies_with_size']} with size data")
        if row["companies_with_industry"] > 0:
            company_intel_info.append(
                f"🏭 {row['companies_with_industry']} with industry data"
            )

        if company_intel_info:
            print(f"    Company Intel: {' | '.join(company_intel_info)}")
        print()

    # Overall location intelligence summary
    with sqlite3.connect(db_path) as conn:
        summary_query = """
        SELECT 
            work_location_type,
            COUNT(*) as count,
            ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM jobs), 1) as percentage
        FROM jobs 
        WHERE work_location_type IS NOT NULL
        GROUP BY work_location_type
        ORDER BY count DESC
        """
        summary_df = pd.read_sql_query(summary_query, conn)

    print("🎯 WORK TYPE INTELLIGENCE SUMMARY:")
    print("-" * 40)
    for _, row in summary_df.iterrows():
        emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
            row["work_location_type"], "📍"
        )
        print(
            f"{emoji} {row['work_location_type']:8s}: {row['count']:3d} jobs ({row['percentage']:5.1f}%)"
        )

    # NEW: Company intelligence summary
    with sqlite3.connect(db_path) as conn:
        company_intel_summary = """
        SELECT 
            COUNT(*) as total_jobs,
            COUNT(CASE WHEN company_size IS NOT NULL THEN 1 END) as jobs_with_size,
            COUNT(CASE WHEN company_followers IS NOT NULL THEN 1 END) as jobs_with_followers,
            COUNT(CASE WHEN company_industry IS NOT NULL THEN 1 END) as jobs_with_industry,
            COUNT(CASE WHEN company_size IS NOT NULL AND company_followers IS NOT NULL THEN 1 END) as jobs_with_both
        FROM jobs
        """
        company_stats = pd.read_sql_query(company_intel_summary, conn).iloc[0]

    print(f"\n🏢 COMPANY INTELLIGENCE SUMMARY:")
    print("-" * 40)
    total = company_stats["total_jobs"]
    print(
        f"👥 Company Size Data:     {company_stats['jobs_with_size']:3d}/{total} jobs ({company_stats['jobs_with_size']/total*100:5.1f}%)"
    )
    print(
        f"📊 Company Followers:     {company_stats['jobs_with_followers']:3d}/{total} jobs ({company_stats['jobs_with_followers']/total*100:5.1f}%)"
    )
    print(
        f"🏭 Company Industry:      {company_stats['jobs_with_industry']:3d}/{total} jobs ({company_stats['jobs_with_industry']/total*100:5.1f}%)"
    )
    print(
        f"🎯 Complete Company Data: {company_stats['jobs_with_both']:3d}/{total} jobs ({company_stats['jobs_with_both']/total*100:5.1f}%)"
    )

    print(f"\n✨ Enhanced Intelligence Features:")
    print(f"   🎯 Automatic location extraction from job postings")
    print(f"   🤖 AI-powered work type classification")
    print(f"   🏢 Company size, followers, and industry extraction")
    print(f"   📊 Enhanced analytics with location and company data")
    print(f"   💾 20-column output with integrated company information")

else:
    print(
        "No location data found. Run 'make run-parser' to collect jobs with location & company intelligence."
    )

🌍 LOCATION & COMPANY INTELLIGENCE ANALYSIS
📊 Location + Work Type + Company Intelligence Distribution:
🏢 San Antonio, TX - On-site: 81 jobs
    Companies: VETROMAC, Inherent Technologies, SwRI Structural Geology & Geomechanics... (+12 more)
    Company Intel: 👥 9 with size data | 🏭 3 with industry data

🏠 San Antonio, TX - Remote: 65 jobs
    Companies: Raft, Mindrift, Lensa... (+8 more)
    Company Intel: 👥 3 with size data | 🏭 6 with industry data

🔄 San Antonio, TX - Hybrid: 61 jobs
    Companies: GovCIO, USAA, Modern Technology Solutions... (+10 more)
    Company Intel: 👥 3 with size data | 🏭 1 with industry data

🏢 San Antonio, Texas Metropolitan Area - On-site: 11 jobs
    Companies: Oteemo Inc., Mission Technologies,  a division of HII

🏠 San Antonio, Texas Metropolitan Area - Remote: 4 jobs
    Companies: Compri Consulting, Mission Technologies,  a division of HII

🏢 Lackland Air Force Base, TX - On-site: 3 jobs
    Companies: Knowesis Inc.

🏢 Universal City, TX - On-site: 2 jo

In [36]:
# 🔍 QUICK COMPANY INTELLIGENCE CHECK
print("🔍 CURRENT COMPANY INTELLIGENCE COVERAGE")
print("=" * 50)

with sqlite3.connect(db_path) as conn:
    # Get current state of company fields
    coverage_query = """
    SELECT 
        COUNT(*) as total_jobs,
        COUNT(CASE WHEN company_size IS NOT NULL AND company_size != '' THEN 1 END) as jobs_with_size,
        COUNT(CASE WHEN company_followers IS NOT NULL AND company_followers != '' THEN 1 END) as jobs_with_followers,
        COUNT(CASE WHEN company_industry IS NOT NULL AND company_industry != '' THEN 1 END) as jobs_with_industry
    FROM jobs
    """
    coverage_stats = pd.read_sql_query(coverage_query, conn).iloc[0]
    
    print(f"📊 Database-wide Company Intelligence:")
    total = coverage_stats["total_jobs"]
    print(f"   Total jobs: {total}")
    print(f"   👥 Company Size: {coverage_stats['jobs_with_size']} jobs ({coverage_stats['jobs_with_size']/total*100:.1f}%)")
    print(f"   📊 Company Followers: {coverage_stats['jobs_with_followers']} jobs ({coverage_stats['jobs_with_followers']/total*100:.1f}%)")
    print(f"   🏭 Company Industry: {coverage_stats['jobs_with_industry']} jobs ({coverage_stats['jobs_with_industry']/total*100:.1f}%)")
    
    # Show some examples of extracted company info
    sample_query = """
    SELECT company, company_size, company_followers, company_industry, title
    FROM jobs 
    WHERE (company_size IS NOT NULL AND company_size != '') 
       OR (company_followers IS NOT NULL AND company_followers != '')
       OR (company_industry IS NOT NULL AND company_industry != '')
    ORDER BY created_at DESC
    LIMIT 10
    """
    
    sample_companies = pd.read_sql_query(sample_query, conn)
    
    print(f"\n🏢 Examples of Company Intelligence:")
    for idx, row in sample_companies.iterrows():
        print(f"   {idx+1}. {row['company']}")
        if row['company_size']:
            print(f"      👥 Size: {row['company_size']}")
        if row['company_followers']:
            print(f"      📊 Followers: {row['company_followers']}")
        if row['company_industry']:
            print(f"      🏭 Industry: {row['company_industry']}")
        print(f"      Job: {row['title']}")
        print()

print(f"✨ The enhanced company parser successfully extracted information!")
print(f"💡 To improve coverage further, run: make fix-company-info")

🔍 CURRENT COMPANY INTELLIGENCE COVERAGE
📊 Database-wide Company Intelligence:
   Total jobs: 238
   👥 Company Size: 21 jobs (8.8%)
   📊 Company Followers: 21 jobs (8.8%)
   🏭 Company Industry: 19 jobs (8.0%)

🏢 Examples of Company Intelligence:
   1. CyrusOne
      👥 Size: 822 employees
      📊 Followers: 52,862 followers
      🏭 Industry: CyrusOne
      Job: Senior Data Center Capacity Engineer

   2. Amazon Web Services (AWS)
      👥 Size: 10,001+ employees
      📊 Followers: 10,274,592 followers
      🏭 Industry: IT Services and IT Consulting
      Job: Cleared Data Center Mechanical Field Engineer, ADC Field Engineering

   3. Aha!
      👥 Size: 51-200 employees
      📊 Followers: 116,162 followers
      🏭 Industry: Software Development
      Job: Sr. Security Engineer (Ruby on Rails experience required)

   4. Aha!
      👥 Size: 51-200 employees
      📊 Followers: 116,162 followers
      🏭 Industry: Software Development
      Job: Sr. Platform Engineer

   5. Jobs via Dice
      👥

In [21]:
# 📊 EXPORT & DATA VALIDATION
print("📤 CSV EXPORT WITH ENHANCED DATA + COMPANY INTELLIGENCE")
print("=" * 55)

# Export current job data to CSV in the main data folder
csv_filename = db.export_jobs_to_csv("../data/notebook_analysis_export.csv")
print(f"✅ Jobs exported to: {csv_filename}")

# Validate the exported CSV structure
if csv_filename:
    import pandas as pd

    exported_df = pd.read_csv(csv_filename)

    print(f"\n📋 Export Validation:")
    print(f"   Shape: {exported_df.shape}")
    print(f"   Columns: {exported_df.shape[1]} (should be 20)")

    expected_columns = [
        "id",
        "company",
        "company_size",
        "company_followers",
        "company_industry",
        "title",
        "location",
        "work_location_type",
        "level",
        "salary_range",
        "content",
        "employment_type",
        "job_function",
        "industries",
        "posted_time",
        "applicants",
        "job_id",
        "date",
        "parsing_link",
        "job_posting_link",
    ]

    print(f"\n✅ Column Validation:")
    missing_cols = set(expected_columns) - set(exported_df.columns)
    extra_cols = set(exported_df.columns) - set(expected_columns)

    if not missing_cols and not extra_cols:
        print("   🎯 Perfect! All 20 expected columns present")
    else:
        if missing_cols:
            print(f"   ⚠️  Missing columns: {missing_cols}")
        if extra_cols:
            print(f"   ➕ Extra columns: {extra_cols}")

    print(f"\n📊 Data Quality Check:")
    print(
        f"   Location data: {exported_df['location'].notna().sum()}/{len(exported_df)} jobs ({exported_df['location'].notna().sum()/len(exported_df)*100:.1f}%)"
    )
    print(
        f"   Work type data: {exported_df['work_location_type'].notna().sum()}/{len(exported_df)} jobs ({exported_df['work_location_type'].notna().sum()/len(exported_df)*100:.1f}%)"
    )
    print(
        f"   Company data: {exported_df['company'].notna().sum()}/{len(exported_df)} jobs"
    )
    print(
        f"   Company size: {exported_df['company_size'].notna().sum()}/{len(exported_df)} jobs ({exported_df['company_size'].notna().sum()/len(exported_df)*100:.1f}%)"
    )
    print(
        f"   Company followers: {exported_df['company_followers'].notna().sum()}/{len(exported_df)} jobs ({exported_df['company_followers'].notna().sum()/len(exported_df)*100:.1f}%)"
    )
    print(
        f"   Company industry: {exported_df['company_industry'].notna().sum()}/{len(exported_df)} jobs ({exported_df['company_industry'].notna().sum()/len(exported_df)*100:.1f}%)"
    )
    print(
        f"   Title data: {exported_df['title'].notna().sum()}/{len(exported_df)} jobs"
    )

    print(
        f"\n🎉 SUCCESS: Enhanced LinkedIn parser with location & company intelligence is working perfectly!"
    )
    print(f"   💾 Database: data/jobs.db")
    print(f"   📤 Export: {csv_filename}")
    print(f"   🎯 Use: make run-parser (to collect more jobs with company info)")

print(f"\n" + "=" * 60)
print(
    "🚀 ANALYSIS COMPLETE - Enhanced LinkedIn Parser with Company Intelligence Ready!"
)
print("=" * 60)

📤 CSV EXPORT WITH ENHANCED DATA + COMPANY INTELLIGENCE
✅ Jobs exported to: ../data/notebook_analysis_export.csv

📋 Export Validation:
   Shape: (230, 20)
   Columns: 20 (should be 20)

✅ Column Validation:
   🎯 Perfect! All 20 expected columns present

📊 Data Quality Check:
   Location data: 220/230 jobs (95.7%)
   Work type data: 220/230 jobs (95.7%)
   Company data: 230/230 jobs
   Company size: 6/230 jobs (2.6%)
   Company followers: 6/230 jobs (2.6%)
   Company industry: 0/230 jobs (0.0%)
   Title data: 230/230 jobs

🎉 SUCCESS: Enhanced LinkedIn parser with location & company intelligence is working perfectly!
   💾 Database: data/jobs.db
   📤 Export: ../data/notebook_analysis_export.csv
   🎯 Use: make run-parser (to collect more jobs with company info)

🚀 ANALYSIS COMPLETE - Enhanced LinkedIn Parser with Company Intelligence Ready!


In [22]:
# 🔄 RUN PARSER + CLEANER BACK TO BACK
print("🚀 RUNNING PARSER + DATA CLEANER PIPELINE")
print("=" * 50)

import subprocess
import time

# Step 1: Run the parser to collect fresh job data
print("📥 Step 1: Running LinkedIn Parser...")
print("Command: make run-parser")
try:
    parser_result = subprocess.run(
        ["make", "run-parser"],
        cwd=project_root,
        capture_output=True,
        text=True,
        timeout=300,  # 5 minute timeout
    )

    if parser_result.returncode == 0:
        print("✅ Parser completed successfully!")
        # Extract some stats from output if available
        lines = parser_result.stdout.split("\n")
        for line in lines[-10:]:  # Show last 10 lines
            if line.strip() and (
                "saved" in line.lower()
                or "exported" in line.lower()
                or "jobs" in line.lower()
            ):
                print(f"   {line.strip()}")
    else:
        print(f"⚠️ Parser completed with warnings:")
        print(f"   Return code: {parser_result.returncode}")
        if parser_result.stderr:
            print(f"   Error: {parser_result.stderr[-500:]}")  # Last 500 chars

except subprocess.TimeoutExpired:
    print("⏰ Parser timeout after 5 minutes")
except Exception as e:
    print(f"❌ Parser error: {e}")

# Small delay between operations
time.sleep(2)

# Step 2: Run the data cleaner on the fresh data
print(f"\n🧹 Step 2: Running Data Cleaner...")
print("Command: python -m genai_job_finder.data_cleaner.run_graph")
try:
    cleaner_result = subprocess.run(
        [
            "/home/alireza/.cache/pypoetry/virtualenvs/genai-job-finder-Y_k-9c-5-py3.12/bin/python",
            "-m",
            "genai_job_finder.data_cleaner.run_graph",
            "--db-path",
            "data/jobs.db",
            "--verbose",
        ],
        cwd=project_root,
        capture_output=True,
        text=True,
        timeout=600,  # 10 minute timeout for AI processing
    )

    if cleaner_result.returncode == 0:
        print("✅ Data cleaner completed successfully!")
        # Extract processing summary
        lines = cleaner_result.stdout.split("\n")
        in_summary = False
        for line in lines:
            if "PROCESSING SUMMARY" in line:
                in_summary = True
                print(f"\n📊 {line}")
            elif in_summary and ("=" in line or line.strip() == ""):
                if "=" in line:
                    print(line)
                    in_summary = False
            elif in_summary:
                print(f"   {line}")
    else:
        print(f"⚠️ Data cleaner completed with issues:")
        print(f"   Return code: {cleaner_result.returncode}")
        if cleaner_result.stderr:
            print(f"   Error: {cleaner_result.stderr[-500:]}")

except subprocess.TimeoutExpired:
    print("⏰ Data cleaner timeout after 10 minutes")
except Exception as e:
    print(f"❌ Data cleaner error: {e}")

print(f"\n🎯 Pipeline Complete!")
print("   📥 Fresh job data collected")
print("   🧹 AI-powered data cleaning applied")
print("   💾 Results available in cleaned_jobs table")
print("   📊 Ready for enhanced analysis below ⬇️")

🚀 RUNNING PARSER + DATA CLEANER PIPELINE
📥 Step 1: Running LinkedIn Parser...
Command: make run-parser
✅ Parser completed successfully!
   ✅ Successfully parsed 8 jobs
   📊 Jobs exported to: data/jobs_export.csv

🧹 Step 2: Running Data Cleaner...
Command: python -m genai_job_finder.data_cleaner.run_graph


KeyboardInterrupt: 

In [None]:
# 🧹 CLEANED JOBS TABLE ANALYSIS
print("✨ ANALYZING AI-CLEANED JOB DATA WITH COMPANY INTELLIGENCE")
print("=" * 65)

with sqlite3.connect(db_path) as conn:
    # Check if cleaned_jobs table exists
    tables_query = (
        "SELECT name FROM sqlite_master WHERE type='table' AND name='cleaned_jobs'"
    )
    table_exists = pd.read_sql_query(tables_query, conn)

    if table_exists.empty:
        print("❌ No cleaned_jobs table found.")
        print("💡 Run the cell above to execute the parser + cleaner pipeline first.")
    else:
        print("✅ Cleaned jobs table found!")

        # Get basic stats
        total_cleaned = pd.read_sql_query(
            "SELECT COUNT(*) as count FROM cleaned_jobs", conn
        ).iloc[0]["count"]
        print(f"📊 Total cleaned jobs: {total_cleaned}")

        if total_cleaned > 0:
            # Get the schema of cleaned table
            schema_query = "PRAGMA table_info(cleaned_jobs)"
            schema_df = pd.read_sql_query(schema_query, conn)
            print(f"🏗️ Table structure: {len(schema_df)} columns")

            # Sample of cleaned data with company information
            sample_query = """
            SELECT 
                id, company, company_size, company_followers, company_industry,
                title, location, 
                min_years_experience, experience_level_label,
                work_location_type, employment_type,
                min_salary, max_salary, mid_salary, content
            FROM cleaned_jobs 
            ORDER BY id DESC 
            LIMIT 10
            """

            cleaned_sample = pd.read_sql_query(sample_query, conn)

            print(f"\n📋 SAMPLE CLEANED JOBS WITH COMPANY INTELLIGENCE:")
            print("-" * 70)
            for idx, job in cleaned_sample.iterrows():
                print(f"{idx+1:2d}. {job['title']} at {job['company']}")
                print(f"    📍 {job['location']}")

                # NEW: Company information display
                company_details = []
                if pd.notna(job["company_size"]) and job["company_size"]:
                    company_details.append(f"👥 {job['company_size']} employees")
                if pd.notna(job["company_followers"]) and job["company_followers"]:
                    company_details.append(f"📊 {job['company_followers']} followers")
                if pd.notna(job["company_industry"]) and job["company_industry"]:
                    company_details.append(f"🏭 {job['company_industry']}")

                if company_details:
                    print(f"    🏢 {' | '.join(company_details)}")

                # Experience info
                if pd.notna(job["min_years_experience"]) and pd.notna(
                    job["experience_level_label"]
                ):
                    print(
                        f"    🎯 Experience: {job['min_years_experience']} years → {job['experience_level_label']}"
                    )

                # Salary info
                if pd.notna(job["min_salary"]) and pd.notna(job["max_salary"]):
                    print(
                        f"    💰 Salary: ${job['min_salary']:,.0f} - ${job['max_salary']:,.0f} (Mid: ${job['mid_salary']:,.0f})"
                    )

                # Work details
                work_details = []
                if pd.notna(job["work_location_type"]):
                    work_emoji = {"Remote": "🏠", "Hybrid": "🔄", "On-site": "🏢"}.get(
                        job["work_location_type"], "📍"
                    )
                    work_details.append(f"{work_emoji} {job['work_location_type']}")
                if pd.notna(job["employment_type"]):
                    work_details.append(job["employment_type"])
                if work_details:
                    print(f"    📝 {' | '.join(work_details)}")
                print()

cleaned_sample

In [None]:
# 📊🔄 BEFORE vs AFTER: Data Transformation Analysis with Company Intelligence
print("🔄 ORIGINAL vs AI-CLEANED DATA COMPARISON (WITH COMPANY INTELLIGENCE)")
print("=" * 75)

with sqlite3.connect(db_path) as conn:
    # Check if both tables exist
    original_exists = (
        pd.read_sql_query("SELECT COUNT(*) as count FROM jobs", conn).iloc[0]["count"]
        > 0
    )
    cleaned_exists = (
        len(
            pd.read_sql_query(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='cleaned_jobs'",
                conn,
            )
        )
        > 0
    )

    if not cleaned_exists:
        print("❌ Need cleaned data for comparison")
        print("💡 Run: make run-pipeline")
    elif not original_exists:
        print("❌ No original data found")
    else:
        cleaned_count = pd.read_sql_query(
            "SELECT COUNT(*) as count FROM cleaned_jobs", conn
        ).iloc[0]["count"]

        if cleaned_count == 0:
            print("📭 Cleaned table is empty")
            print("💡 Run: make run-cleaner")
        else:
            print("📊 DATA TRANSFORMATION PIPELINE RESULTS WITH COMPANY INTELLIGENCE:")
            print("-" * 60)

            # Side-by-side comparison of same jobs including company info
            comparison_query = """
            SELECT 
                o.id,
                o.company,
                o.company_size,
                o.company_followers,
                o.company_industry,
                o.title,
                o.location,
                o.level as original_level,
                o.salary_range as original_salary,
                o.employment_type as original_employment,
                c.min_years_experience as ai_years,
                c.experience_level_label as ai_level,
                CASE 
                    WHEN c.min_salary IS NOT NULL THEN c.min_salary || ' - ' || c.max_salary || ' (Mid: ' || c.mid_salary || ')'
                    ELSE 'Not extracted'
                END as ai_salary,
                c.work_location_type as ai_work_type,
                c.employment_type as ai_employment
            FROM jobs o
            LEFT JOIN cleaned_jobs c ON o.id = c.id
            WHERE c.id IS NOT NULL
            ORDER BY o.id DESC
            LIMIT 5
            """

            comparison_df = pd.read_sql_query(comparison_query, conn)

            print("🔍 DETAILED TRANSFORMATION EXAMPLES WITH COMPANY INTELLIGENCE:")
            print("(Showing how AI enhanced the original data)")
            print()

            for idx, row in comparison_df.iterrows():
                print(f"📋 JOB {idx+1}: {row['title']} at {row['company']}")
                print(f"   📍 Location: {row['location']}")

                # NEW: Company intelligence display
                company_details = []
                if pd.notna(row["company_size"]) and row["company_size"]:
                    company_details.append(f"👥 {row['company_size']} employees")
                if pd.notna(row["company_followers"]) and row["company_followers"]:
                    company_details.append(f"📊 {row['company_followers']} followers")
                if pd.notna(row["company_industry"]) and row["company_industry"]:
                    company_details.append(f"🏭 {row['company_industry']}")

                if company_details:
                    print(f"   🏢 Company Intel: {' | '.join(company_details)}")
                print()

                # Experience comparison
                print("   🎯 EXPERIENCE ANALYSIS:")
                print(f"      Original: '{row['original_level'] or 'Not specified'}'")
                print(f"      AI Result: {row['ai_years']} years → {row['ai_level']}")
                print()

                # Salary comparison
                print("   💰 SALARY INTELLIGENCE:")
                print(f"      Original: '{row['original_salary'] or 'Not specified'}'")
                print(f"      AI Result: {row['ai_salary']}")
                print()

                # Employment type comparison
                print("   📝 EMPLOYMENT TYPE:")
                print(
                    f"      Original: '{row['original_employment'] or 'Not specified'}'"
                )
                print(
                    f"      AI Result: {row['ai_employment']} | Work Type: {row['ai_work_type']}"
                )
                print()
                print("-" * 60)

            # Statistical improvements including company intelligence
            print("📈 STATISTICAL IMPROVEMENTS WITH COMPANY INTELLIGENCE:")
            print("-" * 50)

            # Count improvements
            improvements_query = """
            SELECT 
                COUNT(*) as total_jobs,
                -- Experience data
                COUNT(CASE WHEN o.level IS NOT NULL AND o.level != '' THEN 1 END) as original_exp_data,
                COUNT(CASE WHEN c.experience_level_label IS NOT NULL THEN 1 END) as ai_exp_data,
                -- Salary data  
                COUNT(CASE WHEN o.salary_range IS NOT NULL AND o.salary_range != '' THEN 1 END) as original_salary_data,
                COUNT(CASE WHEN c.min_salary IS NOT NULL THEN 1 END) as ai_salary_data,
                -- Work location data
                COUNT(CASE WHEN c.work_location_type IS NOT NULL THEN 1 END) as ai_work_type_data,
                -- Company intelligence data (already in original)
                COUNT(CASE WHEN o.company_size IS NOT NULL THEN 1 END) as company_size_data,
                COUNT(CASE WHEN o.company_followers IS NOT NULL THEN 1 END) as company_followers_data,
                COUNT(CASE WHEN o.company_industry IS NOT NULL THEN 1 END) as company_industry_data
            FROM jobs o
            LEFT JOIN cleaned_jobs c ON o.id = c.id
            WHERE c.id IS NOT NULL
            """

            improvements_stats = pd.read_sql_query(improvements_query, conn).iloc[0]
            total = improvements_stats["total_jobs"]

            print(f"🎯 Experience Data:")
            print(
                f"   Before: {improvements_stats['original_exp_data']}/{total} jobs ({improvements_stats['original_exp_data']/total*100:.1f}%)"
            )
            print(
                f"   After:  {improvements_stats['ai_exp_data']}/{total} jobs ({improvements_stats['ai_exp_data']/total*100:.1f}%)"
            )
            exp_improvement = (
                improvements_stats["ai_exp_data"]
                - improvements_stats["original_exp_data"]
            )
            print(
                f"   Gain:   +{exp_improvement} jobs (+{exp_improvement/total*100:.1f}%)"
            )
            print()

            print(f"💰 Salary Data:")
            print(
                f"   Before: {improvements_stats['original_salary_data']}/{total} jobs ({improvements_stats['original_salary_data']/total*100:.1f}%)"
            )
            print(
                f"   After:  {improvements_stats['ai_salary_data']}/{total} jobs ({improvements_stats['ai_salary_data']/total*100:.1f}%)"
            )
            salary_improvement = (
                improvements_stats["ai_salary_data"]
                - improvements_stats["original_salary_data"]
            )
            print(
                f"   Gain:   +{salary_improvement} jobs (+{salary_improvement/total*100:.1f}%)"
            )
            print()

            print(f"🏠 Work Location Type (New):")
            print(f"   Before: 0/{total} jobs (0.0%) - Not available in original")
            print(
                f"   After:  {improvements_stats['ai_work_type_data']}/{total} jobs ({improvements_stats['ai_work_type_data']/total*100:.1f}%)"
            )
            print(
                f"   Gain:   +{improvements_stats['ai_work_type_data']} jobs (NEW FEATURE)"
            )
            print()

            # NEW: Company intelligence summary
            print(f"🏢 Company Intelligence (Integrated in Parser):")
            print(
                f"   Company Size:     {improvements_stats['company_size_data']}/{total} jobs ({improvements_stats['company_size_data']/total*100:.1f}%)"
            )
            print(
                f"   Company Followers: {improvements_stats['company_followers_data']}/{total} jobs ({improvements_stats['company_followers_data']/total*100:.1f}%)"
            )
            print(
                f"   Company Industry:  {improvements_stats['company_industry_data']}/{total} jobs ({improvements_stats['company_industry_data']/total*100:.1f}%)"
            )
            print(
                "   💡 Company data extracted during parsing phase, available in both tables"
            )