In [None]:
import pandas as pd
import re

# 1. Load Data
# Read the raw job postings from 'jobs.jsonl' (lines=True: one JSON object per
# line) into the DataFrame that the cleaning steps below add columns to.
df = pd.read_json('jobs.jsonl', lines=True)

# 2. Function to clean Salary
# Unit suffixes recognised inside a salary token, checked in this order; the
# first group with any member present in the token decides the multiplier.
_SALARY_UNITS = (
    (('jt', 'juta'), 1_000_000),  # Indonesian "juta" = million
    (('m',), 1_000_000),          # "m" is assumed to mean million (same as juta)
    (('k',), 1_000),
)


def _parse_salary_amount(token):
    """Split one salary token into ``(number, multiplier)``.

    ``number`` is the numeric part as a float, or None when what remains after
    removing the unit suffix is not a number. ``multiplier`` is the factor
    implied by the suffix, or None when no recognised suffix is present.
    """
    token = token.strip()
    multiplier = None
    for suffixes, factor in _SALARY_UNITS:
        if any(suffix in token for suffix in suffixes):
            multiplier = factor
            for suffix in suffixes:
                token = token.replace(suffix, '')
            break
    try:
        return float(token), multiplier
    except ValueError:
        return None, multiplier


def _scale_salary(number, multiplier):
    """Return ``number * multiplier`` as an int, or None when that is impossible."""
    if number is None:
        return None
    try:
        # round() rather than int(): 4.35 * 1_000_000 is 4349999.999... in floats.
        return round(number * (multiplier or 1))
    except (ValueError, OverflowError):  # NaN / infinity have no integer form
        return None


def clean_salary(salary_str):
    """Parse a free-text salary into a ``(min_salary, max_salary)`` pair.

    Handles plain amounts ("Rp 10.000.000"), unit suffixes ("10jt", "8 juta",
    "10m", "500k") and ranges separated by a hyphen, en dash or em dash.

    Args:
        salary_str: Raw salary value. Missing values (anything ``pd.isna``
            reports as missing) and the literal string ``'None'`` are treated
            as "no salary".

    Returns:
        A 2-tuple of ints. A single amount is returned as ``(value, value)``.
        A bound that cannot be parsed is None; a missing salary gives
        ``(None, None)``.

    Notes:
        * "." or "," followed by exactly three digits is a thousands separator
          ("10.000.000", "10,000"); any other "." / "," between digits is a
          decimal mark ("1.5jt", "1,5 juta"). "1.500jt" is therefore read as
          1500 jt.
        * In a range where only the upper bound has a unit ("5-10jt"), the
          lower bound inherits that unit, unless doing so would push it above
          the upper bound ("5000000 - 10jt" stays 5,000,000).
    """
    if pd.isna(salary_str) or salary_str == 'None':
        return None, None

    txt = str(salary_str).lower()
    # Drop thousands separators first so the remaining marks are unambiguous.
    txt = re.sub(r'(?<=\d)[.,](?=\d{3}(?!\d))', '', txt)
    # A comma still sitting between digits is a decimal comma -> decimal point.
    txt = re.sub(r'(?<=\d),(?=\d)', '.', txt)
    # Any other "." / "," is punctuation noise ("Rp.", trailing full stop, ...).
    txt = re.sub(r'(?<!\d)[.,]|[.,](?!\d)', '', txt)
    txt = txt.replace('rp', '').replace('per month', '').strip()

    # Split a range on hyphen, en dash (U+2013) or em dash (U+2014). The dashes
    # are written as escapes so the source file's encoding cannot corrupt them.
    parts = re.split('[-\u2013\u2014]', txt)
    if len(parts) == 1:
        # Single value salary
        value = _scale_salary(*_parse_salary_amount(txt))
        return value, value

    low_number, low_multiplier = _parse_salary_amount(parts[0])
    high_number, high_multiplier = _parse_salary_amount(parts[1])
    min_sal = _scale_salary(low_number, low_multiplier)
    max_sal = _scale_salary(high_number, high_multiplier)

    # "5-10jt": the unit written once after the upper bound covers both bounds.
    if low_multiplier is None and high_multiplier is not None and max_sal is not None:
        inherited = _scale_salary(low_number, high_multiplier)
        if inherited is not None and inherited <= max_sal:
            min_sal = inherited

    return min_sal, max_sal

# Apply function
# clean_salary returns a (min, max) tuple per row. Collect the tuples and build
# both columns with one DataFrame construction instead of allocating a
# pd.Series per row inside apply, which is far slower on large frames.
salary_bounds = pd.DataFrame(
    df['salary'].map(clean_salary).tolist(),
    index=df.index,  # keep row alignment with df
    columns=['min_salary', 'max_salary'],
)
df[['min_salary', 'max_salary']] = salary_bounds

# 3. Standardize Location (Example)
# Simplify "Jakarta Selatan, Jakarta Raya" -> "Jakarta Selatan"
def clean_location(loc):
    """Reduce a location to its first comma-separated component.

    Example: "Jakarta Selatan, Jakarta Raya" -> "Jakarta Selatan".

    Args:
        loc: Raw location value. Non-string values are converted with ``str``.

    Returns:
        The text before the first comma with surrounding whitespace removed,
        or "Unknown" when ``loc`` is missing or that text is empty.
    """
    if pd.isna(loc):
        return "Unknown"
    # str() so a non-string cell (e.g. a numeric code) does not raise
    # AttributeError on .split.
    first_part = str(loc).split(',')[0].strip()
    # A blank cell or a leading comma would otherwise yield '' rather than the
    # sentinel used for missing locations.
    return first_part or "Unknown"

# Derive the simplified location column from the raw one.
raw_locations = df['location']
df['clean_location'] = raw_locations.apply(clean_location)

# Check result: show the first rows of the raw salary next to the derived columns.
preview_columns = ['salary', 'min_salary', 'max_salary', 'clean_location']
preview = df[preview_columns].head()
print(preview)

# Save for Step 2 (index=False keeps the row index out of the CSV).
output_path = 'jobs_clean.csv'
df.to_csv(output_path, index=False)

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from langchain_openai import OpenAIEmbeddings
import pandas as pd
import time

# 1. Setup keys & config
import os  # imported here so this cell also runs on its own in a fresh kernel

# Credentials are read from the environment so they never have to be pasted
# into (and committed with) the notebook. An unset variable falls back to "".
QDRANT_URL = "https://f9e2d66a-f7ec-4675-b665-a39f07bd792e.us-east4-0.gcp.cloud.qdrant.io:6333"
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
COLLECTION_NAME = "job_market"
EMBEDDING_MODEL = "text-embedding-3-small"
# 1536 is the standard vector size for "text-embedding-3-small"; the collection
# below is created with this size, so it must match the model's output.
VECTOR_SIZE = 1536

# 2. Initialise the client
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# 3. Reset & create a new collection (manually)
# NOTE(review): newer qdrant-client releases reportedly deprecate
# recreate_collection in favour of collection_exists / delete_collection /
# create_collection - confirm against the installed version before switching.
print(f"üõ†Ô∏è Membuat collection '{COLLECTION_NAME}'...")
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE)
)

# 4. Prepare the data (batching)
df = pd.read_csv('jobs_clean.csv')
texts = []
payloads = []
ids = []

print("üì¶ Menyiapkan data...")
for index, row in df.iterrows():
    # Combine the fields into the single text that gets embedded
    text_content = f"""
    Job Title: {row['job_title']}
    Company: {row['company_name']}
    Location: {row['clean_location']}
    Description: {row['job_description']}
    """

    # Use the DataFrame index as the point ID, as a plain Python int.
    point_id = int(index)

    # Empty CSV cells are read back as NaN, which is not valid JSON; store them
    # as None (JSON null) so one missing field cannot break the upload.
    company = None if pd.isna(row['company_name']) else row['company_name']
    title = None if pd.isna(row['job_title']) else row['job_title']

    # Store the text and metadata in the payload so they can be read back later
    payload = {
        "page_content": text_content, # Important: LangChain will look for this key later
        "sql_id": point_id,
        "company": company,
        "title": title
    }

    texts.append(text_content)
    payloads.append(payload)
    ids.append(point_id)

# 5. Generate embeddings (using OpenAI)
print(f"üß† Sedang membuat embedding untuk {len(texts)} data... (Mohon tunggu, ini butuh koneksi internet)")
embeddings_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL)

try:
    # Embed everything in one batch call so it is faster
    vectors = embeddings_model.embed_documents(texts)
    print("‚úÖ Embedding selesai!")
except Exception as e:
    print(f"‚ùå Error saat embedding: {e}")
    raise  # bare raise re-raises the active exception with its traceback intact

# 6. Upload to Qdrant
print("üöÄ Mengupload ke Qdrant Cloud...")

# Convert to the PointStruct format that Qdrant expects
points = [
    models.PointStruct(id=idx, vector=vector, payload=payload)
    for idx, vector, payload in zip(ids, vectors, payloads)
]

# Upload in batches (the client chunks automatically)
client.upload_points(
    collection_name=COLLECTION_NAME,
    points=points
)

print(f"üéâ SUKSES! {len(points)} data berhasil masuk ke Qdrant.")
print("Sekarang kamu bisa lanjut ke Sprint 2 (Coding Agent).")