# LinkedIn Job Posting Parsing for User Profile Development

In [31]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
import random
import requests
from openai import OpenAI


In [32]:
# Load environment variables from .env file
load_dotenv()

# Get OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

# Fail fast when the key is missing or empty. The key is used both to build the
# SDK client below and in the raw "Authorization: Bearer ..." header of the
# single-request cell, so continuing without it only postpones the failure to
# the first API call, where the cause is harder to spot.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY not found: define it in the .env file or the process environment"
    )

print("OpenAI API key loaded successfully")

# Shared client, reused by the batch submission cell
oai_client = OpenAI(api_key=openai_api_key)

OpenAI API key loaded successfully


In [None]:
# Read the scraped LinkedIn postings CSV into a DataFrame and report its
# (rows, columns) shape as a quick sanity check on the load.
postings_csv_path = '../analysis/data/LinkedIn_scrapping/postings.csv'
df_posting = pd.read_csv(postings_csv_path)

posting_shape = df_posting.shape
print(f"Shape: {posting_shape}")

Shape: (123849, 31)


In [28]:
# Randomly select a job from df_posting.
# Only rows holding a non-empty string description are eligible: a missing
# description is NaN (a float), and slicing it for the preview below would
# raise a TypeError.
has_description = df_posting['description'].map(
    lambda text: isinstance(text, str) and bool(text)
)
random_job = df_posting[has_description].sample(n=1).iloc[0]
job_id = str(random_job['job_id'])
description_text = random_job['description']

print(f"Selected job_id: {job_id}")
print(f"Description preview: {description_text[:200]}...")

# Construct the Response API request.
# The strict JSON schema forces the model to answer with exactly two string
# fields, "description" and "requirement".
request_body = {
    "model": "gpt-5-mini",
    "instructions": "You are an expert at parsing job descriptions. Your task is to separate the job description into two parts: 1) The job description (what the role is about, responsibilities, and what the position entails), and 2) The requirements (qualifications, skills, experience, and criteria needed for the job). Do not add any new language, simply parse the existing words into those two categories.",
    "input": f"Please parse the following job description and separate it into description and requirement:\n\n{description_text}",
    "text": {
        "format": {
            "type": "json_schema",
            "name": "job_parsing",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "description": {
                        "type": "string",
                        "description": "The core description of the job role, responsibilities, and what the position entails"
                    },
                    "requirement": {
                        "type": "string",
                        "description": "The requirements, qualifications, skills, and experience needed for the job"
                    }
                },
                "required": ["description", "requirement"],
                "additionalProperties": False
            }
        }
    },
    "max_output_tokens": 16000
}

# Make API call
url = "https://api.openai.com/v1/responses"
headers = {
    "Authorization": f"Bearer {openai_api_key}",
    "Content-Type": "application/json"
}

print("\nSending request to OpenAI Response API...")
# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the kernel indefinitely.
response = requests.post(url, headers=headers, json=request_body, timeout=300)

if response.status_code == 200:
    result = response.json()
    print("\n✓ Success!")
    print(f"\nResponse ID: {result.get('id')}")
    print(f"Model: {result.get('model')}")
    print(f"Status: {result.get('status')}")

    # Check if response is incomplete
    if result.get('status') == 'incomplete':
        # `or {}` also covers an explicit null value for incomplete_details
        incomplete_details = result.get('incomplete_details') or {}
        print(f"⚠️ Warning: Response incomplete - {incomplete_details.get('reason')}")

    # Extract and display the parsed content
    output_items = result.get('output') or []
    if output_items:
        # First output_text part found in any message item; None when no
        # message item carries such a part.
        output_text = next(
            (
                content_item['text']
                for output_item in output_items
                if output_item.get('type') == 'message'
                for content_item in (output_item.get('content') or [])
                if content_item.get('type') == 'output_text' and 'text' in content_item
            ),
            None,
        )

        if output_text is None:
            print("\n⚠️ No text content found in output")
            print("Output items:", [item.get('type') for item in output_items])
        else:
            try:
                parsed_data = json.loads(output_text)
            except json.JSONDecodeError as exc:
                # An incomplete response can end mid-document; show the raw
                # text instead of aborting the cell with a traceback.
                print(f"\n⚠️ Output text is not valid JSON: {exc}")
                print(output_text)
            else:
                print("\n--- Parsed Job Description ---")
                print(parsed_data.get('description', ''))
                print("\n--- Parsed Requirements ---")
                print(parsed_data.get('requirement', ''))
    else:
        print("\n⚠️ No output found in response")
else:
    print(f"\n✗ Error: {response.status_code}")
    print(response.text)

Selected job_id: 3905882109
Description preview: Dice is the leading career destination for tech experts at every stage of their careers. Our client, Request Technology, LLC, is seeking the following. Apply via Dice today!

***We are unable to spons...

Sending request to OpenAI Response API...

✓ Success!

Response ID: resp_0f0d411ffbe9cea10069718538f0b0819b8f23863570bd473e
Model: gpt-5-mini-2025-08-07
Status: completed

--- Parsed Job Description ---
Dice is the leading career destination for tech experts at every stage of their careers. Our client, Request Technology, LLC, is seeking the following. Apply via Dice today!

***We are unable to sponsor as this is a permanent full-time role***

A prestigious company is looking for a Oracle Applications DBA Tech Lead/Manager . This is a hands-on tech lead/manager. They will focus on all Oracle ERP applications and will focus heavily on Oracle EBS (11i/R12). This company is looking for someone with heavy Oracle Fusion Cloud experience.

Re

In [34]:
# Create batch requests


def build_batch_request(job_id, description_text):
    """Build one Batch API request line for a single job posting.

    Args:
        job_id: Posting identifier as a string; stored as the request's
            ``custom_id`` so each batch result can be matched to its posting.
        description_text: Raw job description text to be split by the model.

    Returns:
        A JSON-serializable dict in Response API batch format (``custom_id``,
        ``method``, ``url``, ``body``). A fresh dict is built on every call,
        so requests never share mutable state.
    """
    return {
        "custom_id": job_id,
        "method": "POST",
        "url": "/v1/responses",
        "body": {
            "model": "gpt-5-mini",
            "instructions": "You are an expert at parsing job descriptions. Your task is to separate the job description into two parts: 1) The job description (what the role is about, responsibilities, and what the position entails), and 2) The requirements (qualifications, skills, experience, and criteria needed for the job). Do not add any new language, simply parse the existing words into those two categories.",
            "input": f"Please parse the following job description and separate it into description and requirement:\n\n{description_text}",
            "text": {
                "format": {
                    "type": "json_schema",
                    "name": "job_parsing",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "description": {
                                "type": "string",
                                "description": "The core description of the job role, responsibilities, and what the position entails"
                            },
                            "requirement": {
                                "type": "string",
                                "description": "The requirements, qualifications, skills, and experience needed for the job"
                            }
                        },
                        "required": ["description", "requirement"],
                        "additionalProperties": False
                    }
                }
            },
            "max_output_tokens": 16000
        }
    }


batch_requests = []

# Walk only the two columns that are needed. DataFrame.iterrows() materialises
# a full Series for every row, which is needlessly slow on a frame this size.
for raw_job_id, description_text in zip(df_posting['job_id'], df_posting['description']):
    # Skip if description is empty or null
    if pd.isna(description_text) or not description_text:
        continue

    job_id = str(raw_job_id)
    batch_requests.append(build_batch_request(job_id, description_text))

print(f"Total requests created: {len(batch_requests)}")

# custom_id is the only link between a batch result and its posting, so
# repeated ids would make results ambiguous; surface them before any upload.
custom_ids = [request["custom_id"] for request in batch_requests]
duplicate_count = len(custom_ids) - len(set(custom_ids))
if duplicate_count:
    print(f"Warning: {duplicate_count} requests reuse a custom_id (duplicate job_id values)")

# Split into batches of max 30,000 each
# NOTE(review): Batch API input files are also limited by size, not only by
# request count - confirm the generated files stay within that limit.
batch_size = 30000
num_batches = (len(batch_requests) + batch_size - 1) // batch_size

batch_files = []
for batch_num in range(num_batches):
    # Slicing past the end of the list is safe, so the final chunk is simply shorter
    chunk = batch_requests[batch_num * batch_size:(batch_num + 1) * batch_size]

    output_file = f'openai_batch_requests_batch{batch_num + 1}.jsonl'
    with open(output_file, 'w', encoding='utf-8') as f:
        # JSONL: one serialized request per line
        f.writelines(json.dumps(request) + '\n' for request in chunk)

    batch_files.append(output_file)
    print(f"Created {output_file}: {len(chunk)} requests")

print(f"\nTotal batch files: {len(batch_files)}")

Total requests created: 123842
Created openai_batch_requests_batch1.jsonl: 30000 requests
Created openai_batch_requests_batch2.jsonl: 30000 requests
Created openai_batch_requests_batch3.jsonl: 30000 requests
Created openai_batch_requests_batch4.jsonl: 30000 requests
Created openai_batch_requests_batch5.jsonl: 3842 requests

Total batch files: 5


In [36]:
# Upload each batch input file to OpenAI and open a Batch API job for it

batch_jobs = []

# Prototyping: only the first batch file is submitted
prototype_files = batch_files[0:1]

for batch_file in prototype_files:
    # The handle only needs to stay open for the duration of the upload
    with open(batch_file, 'rb') as jsonl_handle:
        batch_input_file = oai_client.files.create(file=jsonl_handle, purpose="batch")

    # Create the batch job against the uploaded file
    job_metadata = {"description": f"Job description parsing for {batch_file}"}
    batch_job = oai_client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/responses",
        completion_window="24h",
        metadata=job_metadata,
    )

    # Keep the ids (and the status reported at creation time) for later lookup
    batch_jobs.append(dict(
        file=batch_file,
        file_id=batch_input_file.id,
        batch_id=batch_job.id,
        status=batch_job.status,
    ))

# Summary table of everything submitted above
heavy_rule = '=' * 80
light_rule = '-' * 80

print(heavy_rule)
print(f"Batch Jobs Summary - {len(batch_jobs)} batches submitted")
print(heavy_rule)
print(f"{'File':<45} {'File ID':<35} {'Batch ID'}")
print(light_rule)

for submitted in batch_jobs:
    print(f"{submitted['file']:<45} {submitted['file_id']:<35} {submitted['batch_id']}")

Batch Jobs Summary - 1 batches submitted
File                                          File ID                             Batch ID
--------------------------------------------------------------------------------
openai_batch_requests_batch1.jsonl            file-4XNkmcJVft1RSFzjiGVAbU         batch_69718a39dd888190bca7db2fe207a090
